[llvm] c722575 - [ARM] Select VINS from vector inserts

David Green via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 2 05:50:15 PST 2021


Author: David Green
Date: 2021-02-02T13:50:02Z
New Revision: c72257563347b611733b6744cefc0bf49bab4f8d

URL: https://github.com/llvm/llvm-project/commit/c72257563347b611733b6744cefc0bf49bab4f8d
DIFF: https://github.com/llvm/llvm-project/commit/c72257563347b611733b6744cefc0bf49bab4f8d.diff

LOG: [ARM] Select VINS from vector inserts

This patch adds tablegen patterns for pairs of i16/f16 insert/extracts.
If we are inserting into two adjacent vector lanes (0 and 1 for
example), we can use either a vmov;vins or vmovx;vins to insert the pair
together, avoiding a round-trip from GRP registers. This is quite a
large patterns with a number of EXTRACT_SUBREG/INSERT_SUBREG/
COPY_TO_REGCLASS nodes, but hopefully as most of those become copies all
that will be cleaned up by further optimizations.

The VINS pattern was also adjusted to allow it to represent that it is
inserting into the top half of an existing register.

Differential Revision: https://reviews.llvm.org/D95381

Added: 
    

Modified: 
    llvm/lib/Target/ARM/ARMInstrMVE.td
    llvm/lib/Target/ARM/ARMInstrVFP.td
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
    llvm/test/CodeGen/Thumb2/mve-div-expand.ll
    llvm/test/CodeGen/Thumb2/mve-fmas.ll
    llvm/test/CodeGen/Thumb2/mve-fmath.ll
    llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll
    llvm/test/CodeGen/Thumb2/mve-frint.ll
    llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
    llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
    llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
    llvm/test/CodeGen/Thumb2/mve-minmax.ll
    llvm/test/CodeGen/Thumb2/mve-shuffle.ll
    llvm/test/CodeGen/Thumb2/mve-shufflemov.ll
    llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
    llvm/test/CodeGen/Thumb2/mve-vcmpf.ll
    llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll
    llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll
    llvm/test/CodeGen/Thumb2/mve-vcvt.ll
    llvm/test/CodeGen/Thumb2/mve-vld2.ll
    llvm/test/CodeGen/Thumb2/mve-vld3.ll
    llvm/test/CodeGen/Thumb2/mve-vld4.ll
    llvm/test/CodeGen/Thumb2/mve-vldst4.ll
    llvm/test/CodeGen/Thumb2/mve-vmovn.ll
    llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll
    llvm/test/CodeGen/Thumb2/mve-vst2.ll
    llvm/test/CodeGen/Thumb2/mve-vst3.ll
    llvm/test/CodeGen/Thumb2/mve-vst4.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 0dfea68887e5..3b554804d639 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -1833,6 +1833,13 @@ def MVE_VMOV_from_lane_s8  : MVE_VMOV_lane_8 < "s8", 0b0, MVE_VMOV_from_lane>;
 def MVE_VMOV_from_lane_u8  : MVE_VMOV_lane_8 < "u8", 0b1, MVE_VMOV_from_lane>;
 def MVE_VMOV_to_lane_8     : MVE_VMOV_lane_8 <  "8", 0b0, MVE_VMOV_to_lane>;
 
+// This is the same as insertelt but allows the inserted value to be an i32 as
+// will be used when it is the only legal type.
+def ARMVecInsert : SDTypeProfile<1, 3, [
+  SDTCisVT<2, i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<3>
+]>;
+def ARMinsertelt  : SDNode<"ISD::INSERT_VECTOR_ELT", ARMVecInsert>;
+
 let Predicates = [HasMVEInt] in {
   def : Pat<(extractelt (v2f64 MQPR:$src), imm:$lane),
             (f64 (EXTRACT_SUBREG MQPR:$src, (DSubReg_f64_reg imm:$lane)))>;
@@ -1893,6 +1900,44 @@ let Predicates = [HasMVEInt] in {
             (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), (f16 HPR:$src), ssub_0)>;
   def : Pat<(v8f16 (scalar_to_vector GPR:$src)),
             (MVE_VMOV_to_lane_16 (v8f16 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
+
+  foreach LANE = [0, 2, 4, 6] in {
+    defvar SSUB = !cast<SubRegIndex>("ssub_"#!srl(LANE, 1));
+
+    // v8f16 pattern for inserting two lanes using a VINS
+    def : Pat<(insertelt (insertelt (v8f16 MQPR:$srcV), (f16 HPR:$src1), LANE),
+                         (f16 HPR:$src2), !add(LANE,1)),
+              (COPY_TO_REGCLASS (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$srcV, MQPR)),
+                                  (VINSH (COPY_TO_REGCLASS HPR:$src1, SPR),
+                                         (COPY_TO_REGCLASS HPR:$src2, SPR)),
+                                 SSUB), MQPR)>;
+
+    // v8i16 pattern for extracting 2 even lane elements and inserting them using a VINS
+    def : Pat<(ARMinsertelt (ARMinsertelt (v8i16 MQPR:$srcV),
+                                          (ARMvgetlaneu (v8i16 MQPR:$src1), imm_even:$lane1),
+                                          LANE),
+                           (ARMvgetlaneu (v8i16 MQPR:$src2), imm_even:$lane2),
+                           !add(LANE,1)),
+              (COPY_TO_REGCLASS (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$srcV, MQPR)),
+                                  (VINSH (EXTRACT_SUBREG (v8f16 (COPY_TO_REGCLASS MQPR:$src1, MQPR)),
+                                                         (SSubReg_f16_reg imm_even:$lane1)),
+                                         (EXTRACT_SUBREG (v8f16 (COPY_TO_REGCLASS MQPR:$src2, MQPR)),
+                                                         (SSubReg_f16_reg imm_even:$lane2))),
+                                 SSUB), MQPR)>;
+
+    // v8i16 pattern for extracting an element using VMOVX and inserting another using a VINS
+    def : Pat<(ARMinsertelt (ARMinsertelt (v8i16 MQPR:$srcV),
+                                          (ARMvgetlaneu (v8i16 MQPR:$src1), imm_odd:$lane1),
+                                          LANE),
+                            (ARMvgetlaneu (v8i16 MQPR:$src2), imm_even:$lane2),
+                            !add(LANE,1)),
+              (COPY_TO_REGCLASS (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$srcV, MQPR)),
+                                  (VINSH (VMOVH (EXTRACT_SUBREG (v8f16 (COPY_TO_REGCLASS MQPR:$src1, MQPR)),
+                                                                (SSubReg_f16_reg imm_odd:$lane1))),
+                                                (EXTRACT_SUBREG (v8f16 (COPY_TO_REGCLASS MQPR:$src2, MQPR)),
+                                                                (SSubReg_f16_reg imm_even:$lane2))),
+                                 SSUB), MQPR)>;
+  }
 }
 
 // end of mve_bit instructions

diff  --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td
index 2be58d7a0e62..146161801c56 100644
--- a/llvm/lib/Target/ARM/ARMInstrVFP.td
+++ b/llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -798,6 +798,8 @@ def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
                  Requires<[HasFP16]>,
             Sched<[WriteFPCVT]>;
 
+// AddedComplexity to use over the dual-insert MVE pattern
+let AddedComplexity = 6 in
 def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_odd:$lane),
               (v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1), (VCVTTSH SPR:$src2),
                                     (SSubReg_f16_reg imm:$lane)))>;
@@ -1126,9 +1128,12 @@ def VMOVH  : ASuInp<0b11101, 0b11, 0b0000, 0b01, 0,
              Requires<[HasFullFP16]>;
 
 def VINSH  : ASuInp<0b11101, 0b11, 0b0000, 0b11, 0,
-                  (outs SPR:$Sd), (ins SPR:$Sm),
+                  (outs SPR:$Sd), (ins SPR:$Sda, SPR:$Sm),
                   IIC_fpUNA16, "vins.f16\t$Sd, $Sm", []>,
-             Requires<[HasFullFP16]>;
+             Requires<[HasFullFP16]> {
+  let Constraints = "$Sd = $Sda";
+}
+
 } // PostEncoderMethod
 } // hasSideEffects
 

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
index b1a19c0991a9..f8fb8476c322 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
@@ -306,10 +306,10 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h
 ; CHECK-LABEL: fast_float_half_mac:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    sub sp, #32
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14}
+; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    beq.w .LBB2_22
+; CHECK-NEXT:    beq.w .LBB2_20
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
 ; CHECK-NEXT:    adds r3, r2, #3
 ; CHECK-NEXT:    vmov.i32 q5, #0x0
@@ -324,41 +324,27 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h
 ; CHECK-NEXT:    movs r3, #0
 ; CHECK-NEXT:    vdup.32 q1, r12
 ; CHECK-NEXT:    vdup.32 q2, r12
-; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT:    b .LBB2_4
-; CHECK-NEXT:  .LBB2_2: @ %cond.load25
-; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
-; CHECK-NEXT:    vmovx.f16 s0, s28
-; CHECK-NEXT:    vmov r4, s28
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov.16 q6[0], r4
-; CHECK-NEXT:    vldr.16 s0, [r1, #6]
-; CHECK-NEXT:    vmov.16 q6[1], r2
-; CHECK-NEXT:    vmov r2, s29
-; CHECK-NEXT:    vmov.16 q6[2], r2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov.16 q6[3], r2
-; CHECK-NEXT:  .LBB2_3: @ %else26
-; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
-; CHECK-NEXT:    vmul.f16 q0, q6, q5
+; CHECK-NEXT:    b .LBB2_3
+; CHECK-NEXT:  .LBB2_2: @ %else26
+; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
+; CHECK-NEXT:    vmul.f16 q5, q6, q5
 ; CHECK-NEXT:    adds r0, #8
-; CHECK-NEXT:    vcvtt.f32.f16 s23, s1
+; CHECK-NEXT:    vcvtt.f32.f16 s27, s21
 ; CHECK-NEXT:    adds r1, #8
-; CHECK-NEXT:    vcvtb.f32.f16 s22, s1
+; CHECK-NEXT:    vcvtb.f32.f16 s26, s21
 ; CHECK-NEXT:    adds r3, #4
-; CHECK-NEXT:    vcvtt.f32.f16 s21, s0
-; CHECK-NEXT:    vcvtb.f32.f16 s20, s0
-; CHECK-NEXT:    vadd.f32 q5, q3, q5
+; CHECK-NEXT:    vcvtt.f32.f16 s25, s20
+; CHECK-NEXT:    vcvtb.f32.f16 s24, s20
+; CHECK-NEXT:    vadd.f32 q5, q3, q6
 ; CHECK-NEXT:    subs.w lr, lr, #1
-; CHECK-NEXT:    bne .LBB2_4
-; CHECK-NEXT:    b .LBB2_21
-; CHECK-NEXT:  .LBB2_4: @ %vector.body
+; CHECK-NEXT:    bne .LBB2_3
+; CHECK-NEXT:    b .LBB2_19
+; CHECK-NEXT:  .LBB2_3: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov q3, q5
-; CHECK-NEXT:    @ implicit-def: $q6
 ; CHECK-NEXT:    vadd.i32 q4, q0, r3
+; CHECK-NEXT:    vmov q3, q5
 ; CHECK-NEXT:    vcmp.u32 cs, q1, q4
+; CHECK-NEXT:    @ implicit-def: $q5
 ; CHECK-NEXT:    vmrs r4, p0
 ; CHECK-NEXT:    and r2, r4, #1
 ; CHECK-NEXT:    rsbs r5, r2, #0
@@ -374,72 +360,29 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h
 ; CHECK-NEXT:    rsbs r4, r4, #0
 ; CHECK-NEXT:    bfi r2, r4, #3, #1
 ; CHECK-NEXT:    lsls r4, r2, #31
-; CHECK-NEXT:    bne .LBB2_9
-; CHECK-NEXT:  @ %bb.5: @ %else
-; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
+; CHECK-NEXT:    bne .LBB2_12
+; CHECK-NEXT:  @ %bb.4: @ %else
+; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
 ; CHECK-NEXT:    lsls r4, r2, #30
-; CHECK-NEXT:    bpl .LBB2_10
-; CHECK-NEXT:  .LBB2_6: @ %cond.load6
-; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
-; CHECK-NEXT:    vldr.16 s20, [r0, #2]
-; CHECK-NEXT:    vmov r5, s24
-; CHECK-NEXT:    vmovx.f16 s24, s25
-; CHECK-NEXT:    vmov r4, s20
-; CHECK-NEXT:    vmov.16 q5[0], r5
-; CHECK-NEXT:    vmov.16 q5[1], r4
-; CHECK-NEXT:    vmov r4, s25
-; CHECK-NEXT:    vmov.16 q5[2], r4
-; CHECK-NEXT:    vmov r4, s24
-; CHECK-NEXT:    vmov.16 q5[3], r4
+; CHECK-NEXT:    bmi .LBB2_13
+; CHECK-NEXT:  .LBB2_5: @ %else7
+; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
 ; CHECK-NEXT:    lsls r4, r2, #29
-; CHECK-NEXT:    bmi .LBB2_11
-; CHECK-NEXT:  .LBB2_7: @ in Loop: Header=BB2_4 Depth=1
-; CHECK-NEXT:    vmov q6, q5
-; CHECK-NEXT:    lsls r2, r2, #28
-; CHECK-NEXT:    bmi .LBB2_12
-; CHECK-NEXT:  .LBB2_8: @ in Loop: Header=BB2_4 Depth=1
-; CHECK-NEXT:    vmov q5, q6
-; CHECK-NEXT:    b .LBB2_13
-; CHECK-NEXT:  .LBB2_9: @ %cond.load
-; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
-; CHECK-NEXT:    vldr.16 s24, [r0]
-; CHECK-NEXT:    lsls r4, r2, #30
-; CHECK-NEXT:    bmi .LBB2_6
-; CHECK-NEXT:  .LBB2_10: @ in Loop: Header=BB2_4 Depth=1
-; CHECK-NEXT:    vmov q5, q6
-; CHECK-NEXT:    lsls r4, r2, #29
-; CHECK-NEXT:    bpl .LBB2_7
-; CHECK-NEXT:  .LBB2_11: @ %cond.load9
-; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
-; CHECK-NEXT:    vmovx.f16 s24, s20
-; CHECK-NEXT:    vmov r4, s20
-; CHECK-NEXT:    vldr.16 s28, [r0, #4]
-; CHECK-NEXT:    vmov r5, s24
-; CHECK-NEXT:    vmov.16 q6[0], r4
-; CHECK-NEXT:    vmovx.f16 s20, s21
-; CHECK-NEXT:    vmov.16 q6[1], r5
-; CHECK-NEXT:    vmov r4, s28
-; CHECK-NEXT:    vmov.16 q6[2], r4
-; CHECK-NEXT:    vmov r4, s20
-; CHECK-NEXT:    vmov.16 q6[3], r4
+; CHECK-NEXT:    bmi .LBB2_14
+; CHECK-NEXT:  .LBB2_6: @ %else10
+; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
 ; CHECK-NEXT:    lsls r2, r2, #28
 ; CHECK-NEXT:    bpl .LBB2_8
-; CHECK-NEXT:  .LBB2_12: @ %cond.load12
-; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
-; CHECK-NEXT:    vmovx.f16 s20, s24
-; CHECK-NEXT:    vmov r4, s24
-; CHECK-NEXT:    vmov r2, s20
-; CHECK-NEXT:    vmov.16 q5[0], r4
-; CHECK-NEXT:    vmov.16 q5[1], r2
-; CHECK-NEXT:    vmov r2, s25
+; CHECK-NEXT:  .LBB2_7: @ %cond.load12
+; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
+; CHECK-NEXT:    vmovx.f16 s24, s20
+; CHECK-NEXT:    vins.f16 s20, s24
 ; CHECK-NEXT:    vldr.16 s24, [r0, #6]
-; CHECK-NEXT:    vmov.16 q5[2], r2
-; CHECK-NEXT:    vmov r2, s24
-; CHECK-NEXT:    vmov.16 q5[3], r2
-; CHECK-NEXT:  .LBB2_13: @ %else13
-; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
+; CHECK-NEXT:    vins.f16 s21, s24
+; CHECK-NEXT:  .LBB2_8: @ %else13
+; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
 ; CHECK-NEXT:    vcmp.u32 cs, q2, q4
-; CHECK-NEXT:    @ implicit-def: $q7
+; CHECK-NEXT:    @ implicit-def: $q6
 ; CHECK-NEXT:    vmrs r4, p0
 ; CHECK-NEXT:    and r2, r4, #1
 ; CHECK-NEXT:    rsbs r5, r2, #0
@@ -455,58 +398,73 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h
 ; CHECK-NEXT:    rsbs r4, r4, #0
 ; CHECK-NEXT:    bfi r2, r4, #3, #1
 ; CHECK-NEXT:    lsls r4, r2, #31
-; CHECK-NEXT:    bne .LBB2_17
-; CHECK-NEXT:  @ %bb.14: @ %else17
-; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
+; CHECK-NEXT:    bne .LBB2_15
+; CHECK-NEXT:  @ %bb.9: @ %else17
+; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
+; CHECK-NEXT:    lsls r4, r2, #30
+; CHECK-NEXT:    bmi .LBB2_16
+; CHECK-NEXT:  .LBB2_10: @ %else20
+; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
+; CHECK-NEXT:    lsls r4, r2, #29
+; CHECK-NEXT:    bmi .LBB2_17
+; CHECK-NEXT:  .LBB2_11: @ %else23
+; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
+; CHECK-NEXT:    lsls r2, r2, #28
+; CHECK-NEXT:    bpl .LBB2_2
+; CHECK-NEXT:    b .LBB2_18
+; CHECK-NEXT:  .LBB2_12: @ %cond.load
+; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
+; CHECK-NEXT:    vldr.16 s20, [r0]
 ; CHECK-NEXT:    lsls r4, r2, #30
-; CHECK-NEXT:    bpl .LBB2_18
-; CHECK-NEXT:  .LBB2_15: @ %cond.load19
-; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
-; CHECK-NEXT:    vldr.16 s24, [r1, #2]
-; CHECK-NEXT:    vmov r5, s28
-; CHECK-NEXT:    vmovx.f16 s28, s29
-; CHECK-NEXT:    vmov r4, s24
-; CHECK-NEXT:    vmov.16 q6[0], r5
-; CHECK-NEXT:    vmov.16 q6[1], r4
-; CHECK-NEXT:    vmov r4, s29
-; CHECK-NEXT:    vmov.16 q6[2], r4
-; CHECK-NEXT:    vmov r4, s28
-; CHECK-NEXT:    vmov.16 q6[3], r4
+; CHECK-NEXT:    bpl .LBB2_5
+; CHECK-NEXT:  .LBB2_13: @ %cond.load6
+; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
+; CHECK-NEXT:    vldr.16 s24, [r0, #2]
+; CHECK-NEXT:    vins.f16 s20, s24
+; CHECK-NEXT:    vmovx.f16 s24, s21
+; CHECK-NEXT:    vins.f16 s21, s24
 ; CHECK-NEXT:    lsls r4, r2, #29
-; CHECK-NEXT:    bmi .LBB2_19
-; CHECK-NEXT:  .LBB2_16: @ in Loop: Header=BB2_4 Depth=1
-; CHECK-NEXT:    vmov q7, q6
+; CHECK-NEXT:    bpl .LBB2_6
+; CHECK-NEXT:  .LBB2_14: @ %cond.load9
+; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
+; CHECK-NEXT:    vmovx.f16 s24, s20
+; CHECK-NEXT:    vins.f16 s20, s24
+; CHECK-NEXT:    vmovx.f16 s24, s21
+; CHECK-NEXT:    vldr.16 s21, [r0, #4]
+; CHECK-NEXT:    vins.f16 s21, s24
 ; CHECK-NEXT:    lsls r2, r2, #28
-; CHECK-NEXT:    bmi.w .LBB2_2
-; CHECK-NEXT:    b .LBB2_20
-; CHECK-NEXT:  .LBB2_17: @ %cond.load16
-; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
-; CHECK-NEXT:    vldr.16 s28, [r1]
+; CHECK-NEXT:    bmi .LBB2_7
+; CHECK-NEXT:    b .LBB2_8
+; CHECK-NEXT:  .LBB2_15: @ %cond.load16
+; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
+; CHECK-NEXT:    vldr.16 s24, [r1]
 ; CHECK-NEXT:    lsls r4, r2, #30
-; CHECK-NEXT:    bmi .LBB2_15
-; CHECK-NEXT:  .LBB2_18: @ in Loop: Header=BB2_4 Depth=1
-; CHECK-NEXT:    vmov q6, q7
+; CHECK-NEXT:    bpl .LBB2_10
+; CHECK-NEXT:  .LBB2_16: @ %cond.load19
+; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
+; CHECK-NEXT:    vldr.16 s28, [r1, #2]
+; CHECK-NEXT:    vins.f16 s24, s28
+; CHECK-NEXT:    vmovx.f16 s28, s25
+; CHECK-NEXT:    vins.f16 s25, s28
 ; CHECK-NEXT:    lsls r4, r2, #29
-; CHECK-NEXT:    bpl .LBB2_16
-; CHECK-NEXT:  .LBB2_19: @ %cond.load22
-; CHECK-NEXT:    @ in Loop: Header=BB2_4 Depth=1
+; CHECK-NEXT:    bpl .LBB2_11
+; CHECK-NEXT:  .LBB2_17: @ %cond.load22
+; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
 ; CHECK-NEXT:    vmovx.f16 s28, s24
-; CHECK-NEXT:    vmov r4, s24
-; CHECK-NEXT:    vldr.16 s0, [r1, #4]
-; CHECK-NEXT:    vmov r5, s28
-; CHECK-NEXT:    vmov.16 q7[0], r4
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmov.16 q7[1], r5
-; CHECK-NEXT:    vmovx.f16 s0, s25
-; CHECK-NEXT:    vmov.16 q7[2], r4
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmov.16 q7[3], r4
+; CHECK-NEXT:    vins.f16 s24, s28
+; CHECK-NEXT:    vmovx.f16 s28, s25
+; CHECK-NEXT:    vldr.16 s25, [r1, #4]
+; CHECK-NEXT:    vins.f16 s25, s28
 ; CHECK-NEXT:    lsls r2, r2, #28
-; CHECK-NEXT:    bmi.w .LBB2_2
-; CHECK-NEXT:  .LBB2_20: @ in Loop: Header=BB2_4 Depth=1
-; CHECK-NEXT:    vmov q6, q7
-; CHECK-NEXT:    b .LBB2_3
-; CHECK-NEXT:  .LBB2_21: @ %middle.block
+; CHECK-NEXT:    bpl.w .LBB2_2
+; CHECK-NEXT:  .LBB2_18: @ %cond.load25
+; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
+; CHECK-NEXT:    vmovx.f16 s28, s24
+; CHECK-NEXT:    vins.f16 s24, s28
+; CHECK-NEXT:    vldr.16 s28, [r1, #6]
+; CHECK-NEXT:    vins.f16 s25, s28
+; CHECK-NEXT:    b .LBB2_2
+; CHECK-NEXT:  .LBB2_19: @ %middle.block
 ; CHECK-NEXT:    vdup.32 q0, r12
 ; CHECK-NEXT:    vcmp.u32 cs, q0, q4
 ; CHECK-NEXT:    vpsel q0, q5, q3
@@ -515,16 +473,16 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h
 ; CHECK-NEXT:    vadd.f32 q0, q0, q1
 ; CHECK-NEXT:    vmov r0, s1
 ; CHECK-NEXT:    vadd.f32 q0, q0, r0
-; CHECK-NEXT:    b .LBB2_23
-; CHECK-NEXT:  .LBB2_22:
+; CHECK-NEXT:    b .LBB2_21
+; CHECK-NEXT:  .LBB2_20:
 ; CHECK-NEXT:    vldr s0, .LCPI2_0
-; CHECK-NEXT:  .LBB2_23: @ %for.cond.cleanup
+; CHECK-NEXT:  .LBB2_21: @ %for.cond.cleanup
 ; CHECK-NEXT:    @ kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT:    add sp, #32
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14}
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 ; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  @ %bb.24:
+; CHECK-NEXT:  @ %bb.22:
 ; CHECK-NEXT:  .LCPI2_1:
 ; CHECK-NEXT:    .long 0 @ 0x0
 ; CHECK-NEXT:    .long 1 @ 0x1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll
index 2c16b818d557..aa7b8fb1065d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll
@@ -806,38 +806,26 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @fdiv_f16(<8 x half> %in1, <8 x half> %in2) {
 ; CHECK-LABEL: fdiv_f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vdiv.f16 s8, s0, s4
-; CHECK-NEXT:    vmovx.f16 s10, s0
-; CHECK-NEXT:    vmov r0, s8
 ; CHECK-NEXT:    vmovx.f16 s8, s4
-; CHECK-NEXT:    vdiv.f16 s8, s10, s8
-; CHECK-NEXT:    vdiv.f16 s12, s1, s5
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s10, s0
+; CHECK-NEXT:    vdiv.f16 s12, s10, s8
+; CHECK-NEXT:    vdiv.f16 s8, s0, s4
+; CHECK-NEXT:    vins.f16 s8, s12
 ; CHECK-NEXT:    vmovx.f16 s12, s5
 ; CHECK-NEXT:    vmovx.f16 s14, s1
-; CHECK-NEXT:    vmov.16 q2[1], r1
+; CHECK-NEXT:    vdiv.f16 s9, s1, s5
 ; CHECK-NEXT:    vdiv.f16 s12, s14, s12
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vdiv.f16 s12, s2, s6
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmovx.f16 s12, s6
 ; CHECK-NEXT:    vmovx.f16 s14, s2
+; CHECK-NEXT:    vins.f16 s9, s12
+; CHECK-NEXT:    vmovx.f16 s12, s6
 ; CHECK-NEXT:    vdiv.f16 s12, s14, s12
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vdiv.f16 s12, s3, s7
-; CHECK-NEXT:    vmovx.f16 s4, s7
-; CHECK-NEXT:    vmovx.f16 s0, s3
-; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vdiv.f16 s0, s0, s4
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vdiv.f16 s10, s2, s6
+; CHECK-NEXT:    vins.f16 s10, s12
+; CHECK-NEXT:    vmovx.f16 s12, s7
+; CHECK-NEXT:    vmovx.f16 s14, s3
+; CHECK-NEXT:    vdiv.f16 s11, s3, s7
+; CHECK-NEXT:    vdiv.f16 s12, s14, s12
+; CHECK-NEXT:    vins.f16 s11, s12
 ; CHECK-NEXT:    vmov q0, q2
 ; CHECK-NEXT:    bx lr
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-fmas.ll b/llvm/test/CodeGen/Thumb2/mve-fmas.ll
index 325239377da2..241ebc4fe144 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fmas.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fmas.ll
@@ -17,49 +17,30 @@ define arm_aapcs_vfpcc <8 x half> @vfma16_v1(<8 x half> %src1, <8 x half> %src2,
 ;
 ; CHECK-MVE-LABEL: vfma16_v1:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9, d10}
-; CHECK-MVE-NEXT:    vpush {d8, d9, d10}
 ; CHECK-MVE-NEXT:    vmovx.f16 s13, s0
-; CHECK-MVE-NEXT:    vmla.f16 s0, s4, s8
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s8
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s4
-; CHECK-MVE-NEXT:    vmov.f32 s16, s1
-; CHECK-MVE-NEXT:    vmla.f16 s13, s14, s12
-; CHECK-MVE-NEXT:    vmov r1, s0
-; CHECK-MVE-NEXT:    vmla.f16 s16, s5, s9
-; CHECK-MVE-NEXT:    vmov r0, s13
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r1
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r0
-; CHECK-MVE-NEXT:    vmov r0, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s9
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s5
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s1
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r0
-; CHECK-MVE-NEXT:    vmla.f16 s20, s18, s16
-; CHECK-MVE-NEXT:    vmov.f32 s16, s2
-; CHECK-MVE-NEXT:    vmov r0, s20
-; CHECK-MVE-NEXT:    vmla.f16 s16, s6, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r0
-; CHECK-MVE-NEXT:    vmov r0, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s10
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s2
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r0
-; CHECK-MVE-NEXT:    vmla.f16 s20, s18, s16
-; CHECK-MVE-NEXT:    vmov.f32 s16, s3
-; CHECK-MVE-NEXT:    vmov r0, s20
-; CHECK-MVE-NEXT:    vmla.f16 s16, s7, s11
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s11
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r0
-; CHECK-MVE-NEXT:    vmov r0, s16
 ; CHECK-MVE-NEXT:    vmla.f16 s0, s4, s8
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9, d10}
+; CHECK-MVE-NEXT:    vmla.f16 s13, s14, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s9
+; CHECK-MVE-NEXT:    vins.f16 s0, s13
+; CHECK-MVE-NEXT:    vmovx.f16 s13, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s14, s5
+; CHECK-MVE-NEXT:    vmla.f16 s1, s5, s9
+; CHECK-MVE-NEXT:    vmla.f16 s13, s14, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s10
+; CHECK-MVE-NEXT:    vins.f16 s1, s13
+; CHECK-MVE-NEXT:    vmovx.f16 s13, s2
+; CHECK-MVE-NEXT:    vmovx.f16 s14, s6
+; CHECK-MVE-NEXT:    vmla.f16 s2, s6, s10
+; CHECK-MVE-NEXT:    vmla.f16 s13, s14, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s13
+; CHECK-MVE-NEXT:    vmovx.f16 s13, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s14, s7
+; CHECK-MVE-NEXT:    vmla.f16 s3, s7, s11
+; CHECK-MVE-NEXT:    vmla.f16 s13, s14, s12
+; CHECK-MVE-NEXT:    vins.f16 s3, s13
 ; CHECK-MVE-NEXT:    bx lr
 entry:
   %0 = fmul <8 x half> %src2, %src3
@@ -81,49 +62,30 @@ define arm_aapcs_vfpcc <8 x half> @vfma16_v2(<8 x half> %src1, <8 x half> %src2,
 ;
 ; CHECK-MVE-LABEL: vfma16_v2:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9, d10}
-; CHECK-MVE-NEXT:    vpush {d8, d9, d10}
 ; CHECK-MVE-NEXT:    vmovx.f16 s13, s0
-; CHECK-MVE-NEXT:    vmla.f16 s0, s4, s8
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s8
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s4
-; CHECK-MVE-NEXT:    vmov.f32 s16, s1
-; CHECK-MVE-NEXT:    vmla.f16 s13, s14, s12
-; CHECK-MVE-NEXT:    vmov r1, s0
-; CHECK-MVE-NEXT:    vmla.f16 s16, s5, s9
-; CHECK-MVE-NEXT:    vmov r0, s13
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r1
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r0
-; CHECK-MVE-NEXT:    vmov r0, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s9
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s5
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s1
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r0
-; CHECK-MVE-NEXT:    vmla.f16 s20, s18, s16
-; CHECK-MVE-NEXT:    vmov.f32 s16, s2
-; CHECK-MVE-NEXT:    vmov r0, s20
-; CHECK-MVE-NEXT:    vmla.f16 s16, s6, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r0
-; CHECK-MVE-NEXT:    vmov r0, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s10
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s2
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r0
-; CHECK-MVE-NEXT:    vmla.f16 s20, s18, s16
-; CHECK-MVE-NEXT:    vmov.f32 s16, s3
-; CHECK-MVE-NEXT:    vmov r0, s20
-; CHECK-MVE-NEXT:    vmla.f16 s16, s7, s11
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s11
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r0
-; CHECK-MVE-NEXT:    vmov r0, s16
 ; CHECK-MVE-NEXT:    vmla.f16 s0, s4, s8
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9, d10}
+; CHECK-MVE-NEXT:    vmla.f16 s13, s14, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s9
+; CHECK-MVE-NEXT:    vins.f16 s0, s13
+; CHECK-MVE-NEXT:    vmovx.f16 s13, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s14, s5
+; CHECK-MVE-NEXT:    vmla.f16 s1, s5, s9
+; CHECK-MVE-NEXT:    vmla.f16 s13, s14, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s10
+; CHECK-MVE-NEXT:    vins.f16 s1, s13
+; CHECK-MVE-NEXT:    vmovx.f16 s13, s2
+; CHECK-MVE-NEXT:    vmovx.f16 s14, s6
+; CHECK-MVE-NEXT:    vmla.f16 s2, s6, s10
+; CHECK-MVE-NEXT:    vmla.f16 s13, s14, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s13
+; CHECK-MVE-NEXT:    vmovx.f16 s13, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s14, s7
+; CHECK-MVE-NEXT:    vmla.f16 s3, s7, s11
+; CHECK-MVE-NEXT:    vmla.f16 s13, s14, s12
+; CHECK-MVE-NEXT:    vins.f16 s3, s13
 ; CHECK-MVE-NEXT:    bx lr
 entry:
   %0 = fmul <8 x half> %src2, %src3
@@ -145,49 +107,30 @@ define arm_aapcs_vfpcc <8 x half> @vfms16(<8 x half> %src1, <8 x half> %src2, <8
 ;
 ; CHECK-MVE-LABEL: vfms16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    .vsave {d8, d9, d10}
-; CHECK-MVE-NEXT:    vpush {d8, d9, d10}
 ; CHECK-MVE-NEXT:    vmovx.f16 s13, s0
-; CHECK-MVE-NEXT:    vmls.f16 s0, s4, s8
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s8
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s4
-; CHECK-MVE-NEXT:    vmov.f32 s16, s1
-; CHECK-MVE-NEXT:    vmls.f16 s13, s14, s12
-; CHECK-MVE-NEXT:    vmov r1, s0
-; CHECK-MVE-NEXT:    vmls.f16 s16, s5, s9
-; CHECK-MVE-NEXT:    vmov r0, s13
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r1
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r0
-; CHECK-MVE-NEXT:    vmov r0, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s9
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s5
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s1
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r0
-; CHECK-MVE-NEXT:    vmls.f16 s20, s18, s16
-; CHECK-MVE-NEXT:    vmov.f32 s16, s2
-; CHECK-MVE-NEXT:    vmov r0, s20
-; CHECK-MVE-NEXT:    vmls.f16 s16, s6, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r0
-; CHECK-MVE-NEXT:    vmov r0, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s10
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s2
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r0
-; CHECK-MVE-NEXT:    vmls.f16 s20, s18, s16
-; CHECK-MVE-NEXT:    vmov.f32 s16, s3
-; CHECK-MVE-NEXT:    vmov r0, s20
-; CHECK-MVE-NEXT:    vmls.f16 s16, s7, s11
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s11
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r0
-; CHECK-MVE-NEXT:    vmov r0, s16
 ; CHECK-MVE-NEXT:    vmls.f16 s0, s4, s8
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
-; CHECK-MVE-NEXT:    vmov q0, q3
-; CHECK-MVE-NEXT:    vpop {d8, d9, d10}
+; CHECK-MVE-NEXT:    vmls.f16 s13, s14, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s9
+; CHECK-MVE-NEXT:    vins.f16 s0, s13
+; CHECK-MVE-NEXT:    vmovx.f16 s13, s1
+; CHECK-MVE-NEXT:    vmovx.f16 s14, s5
+; CHECK-MVE-NEXT:    vmls.f16 s1, s5, s9
+; CHECK-MVE-NEXT:    vmls.f16 s13, s14, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s10
+; CHECK-MVE-NEXT:    vins.f16 s1, s13
+; CHECK-MVE-NEXT:    vmovx.f16 s13, s2
+; CHECK-MVE-NEXT:    vmovx.f16 s14, s6
+; CHECK-MVE-NEXT:    vmls.f16 s2, s6, s10
+; CHECK-MVE-NEXT:    vmls.f16 s13, s14, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s11
+; CHECK-MVE-NEXT:    vins.f16 s2, s13
+; CHECK-MVE-NEXT:    vmovx.f16 s13, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s14, s7
+; CHECK-MVE-NEXT:    vmls.f16 s3, s7, s11
+; CHECK-MVE-NEXT:    vmls.f16 s13, s14, s12
+; CHECK-MVE-NEXT:    vins.f16 s3, s13
 ; CHECK-MVE-NEXT:    bx lr
 entry:
   %0 = fmul <8 x half> %src2, %src3
@@ -213,44 +156,27 @@ define arm_aapcs_vfpcc <8 x half> @vfmar16(<8 x half> %src1, <8 x half> %src2, f
 ;
 ; CHECK-MVE-LABEL: vfmar16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vcvtb.f16.f32 s12, s8
-; CHECK-MVE-NEXT:    vmov.f32 s8, s0
-; CHECK-MVE-NEXT:    vmla.f16 s8, s4, s12
-; CHECK-MVE-NEXT:    vmov.f32 s14, s1
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s4
-; CHECK-MVE-NEXT:    vmovx.f16 s10, s0
-; CHECK-MVE-NEXT:    vmla.f16 s14, s5, s12
-; CHECK-MVE-NEXT:    vmla.f16 s10, s8, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s13, s1
-; CHECK-MVE-NEXT:    vmov r1, s10
-; CHECK-MVE-NEXT:    vmov.16 q2[0], r0
-; CHECK-MVE-NEXT:    vmov r0, s14
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s5
-; CHECK-MVE-NEXT:    vmov.16 q2[1], r1
-; CHECK-MVE-NEXT:    vmla.f16 s13, s14, s12
-; CHECK-MVE-NEXT:    vmov.f32 s14, s2
-; CHECK-MVE-NEXT:    vmov.16 q2[2], r0
-; CHECK-MVE-NEXT:    vmov r0, s13
-; CHECK-MVE-NEXT:    vmla.f16 s14, s6, s12
-; CHECK-MVE-NEXT:    vmov.16 q2[3], r0
-; CHECK-MVE-NEXT:    vmov r0, s14
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s13, s2
-; CHECK-MVE-NEXT:    vmla.f16 s13, s14, s12
-; CHECK-MVE-NEXT:    vmov.f32 s14, s3
-; CHECK-MVE-NEXT:    vmov.16 q2[4], r0
-; CHECK-MVE-NEXT:    vmov r0, s13
-; CHECK-MVE-NEXT:    vmla.f16 s14, s7, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q2[5], r0
-; CHECK-MVE-NEXT:    vmov r0, s14
-; CHECK-MVE-NEXT:    vmla.f16 s0, s4, s12
-; CHECK-MVE-NEXT:    vmov.16 q2[6], r0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q2[7], r0
-; CHECK-MVE-NEXT:    vmov q0, q2
+; CHECK-MVE-NEXT:    vcvtb.f16.f32 s8, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s0
+; CHECK-MVE-NEXT:    vmovx.f16 s10, s4
+; CHECK-MVE-NEXT:    vmla.f16 s0, s4, s8
+; CHECK-MVE-NEXT:    vmla.f16 s12, s10, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s10, s5
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s1
+; CHECK-MVE-NEXT:    vmla.f16 s12, s10, s8
+; CHECK-MVE-NEXT:    vmla.f16 s1, s5, s8
+; CHECK-MVE-NEXT:    vins.f16 s1, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s2
+; CHECK-MVE-NEXT:    vmovx.f16 s10, s6
+; CHECK-MVE-NEXT:    vmla.f16 s2, s6, s8
+; CHECK-MVE-NEXT:    vmla.f16 s12, s10, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s10, s7
+; CHECK-MVE-NEXT:    vins.f16 s2, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s3
+; CHECK-MVE-NEXT:    vmla.f16 s12, s10, s8
+; CHECK-MVE-NEXT:    vmla.f16 s3, s7, s8
+; CHECK-MVE-NEXT:    vins.f16 s3, s12
 ; CHECK-MVE-NEXT:    bx lr
 entry:
   %src3 = fptrunc float %src3o to half
@@ -279,47 +205,35 @@ define arm_aapcs_vfpcc <8 x half> @vfma16(<8 x half> %src1, <8 x half> %src2, fl
 ;
 ; CHECK-MVE-LABEL: vfma16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vcvtb.f16.f32 s12, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s10, s0
-; CHECK-MVE-NEXT:    vmov.f32 s8, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s13, s1
-; CHECK-MVE-NEXT:    vmla.f16 s8, s0, s4
-; CHECK-MVE-NEXT:    vmov.f32 s14, s12
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s4
-; CHECK-MVE-NEXT:    vmla.f16 s14, s10, s8
-; CHECK-MVE-NEXT:    vmov.16 q2[0], r0
-; CHECK-MVE-NEXT:    vmov r1, s14
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
-; CHECK-MVE-NEXT:    vmov.f32 s14, s12
-; CHECK-MVE-NEXT:    vmov.16 q2[1], r1
-; CHECK-MVE-NEXT:    vmla.f16 s14, s1, s5
-; CHECK-MVE-NEXT:    vmov.f32 s15, s12
-; CHECK-MVE-NEXT:    vmov r0, s14
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s5
-; CHECK-MVE-NEXT:    vmla.f16 s15, s13, s14
-; CHECK-MVE-NEXT:    vmov.f32 s14, s12
-; CHECK-MVE-NEXT:    vmov.16 q2[2], r0
-; CHECK-MVE-NEXT:    vmov r0, s15
-; CHECK-MVE-NEXT:    vmla.f16 s14, s2, s6
-; CHECK-MVE-NEXT:    vmov.16 q2[3], r0
-; CHECK-MVE-NEXT:    vmov r0, s14
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s13, s2
-; CHECK-MVE-NEXT:    vmov.f32 s15, s12
-; CHECK-MVE-NEXT:    vmla.f16 s15, s13, s14
-; CHECK-MVE-NEXT:    vmov.f32 s14, s12
-; CHECK-MVE-NEXT:    vmov.16 q2[4], r0
-; CHECK-MVE-NEXT:    vmov r0, s15
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmla.f16 s14, s3, s7
-; CHECK-MVE-NEXT:    vmov.16 q2[5], r0
-; CHECK-MVE-NEXT:    vmov r0, s14
-; CHECK-MVE-NEXT:    vmla.f16 s12, s0, s4
-; CHECK-MVE-NEXT:    vmov.16 q2[6], r0
-; CHECK-MVE-NEXT:    vmov r0, s12
-; CHECK-MVE-NEXT:    vmov.16 q2[7], r0
-; CHECK-MVE-NEXT:    vmov q0, q2
+; CHECK-MVE-NEXT:    vmov q3, q0
+; CHECK-MVE-NEXT:    vcvtb.f16.f32 s3, s8
+; CHECK-MVE-NEXT:    vmov.f32 s8, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s10, s4
+; CHECK-MVE-NEXT:    vmov.f32 s0, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s9, s12
+; CHECK-MVE-NEXT:    vmla.f16 s8, s9, s10
+; CHECK-MVE-NEXT:    vmla.f16 s0, s12, s4
+; CHECK-MVE-NEXT:    vins.f16 s0, s8
+; CHECK-MVE-NEXT:    vmov.f32 s9, s3
+; CHECK-MVE-NEXT:    vmov.f32 s1, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
+; CHECK-MVE-NEXT:    vmovx.f16 s10, s13
+; CHECK-MVE-NEXT:    vmla.f16 s1, s13, s5
+; CHECK-MVE-NEXT:    vmla.f16 s9, s10, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
+; CHECK-MVE-NEXT:    vins.f16 s1, s9
+; CHECK-MVE-NEXT:    vmov.f32 s9, s3
+; CHECK-MVE-NEXT:    vmov.f32 s2, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s10, s14
+; CHECK-MVE-NEXT:    vmla.f16 s9, s10, s8
+; CHECK-MVE-NEXT:    vmla.f16 s2, s14, s6
+; CHECK-MVE-NEXT:    vins.f16 s2, s9
+; CHECK-MVE-NEXT:    vmov.f32 s9, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s7
+; CHECK-MVE-NEXT:    vmovx.f16 s10, s15
+; CHECK-MVE-NEXT:    vmla.f16 s9, s10, s8
+; CHECK-MVE-NEXT:    vmla.f16 s3, s15, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s9
 ; CHECK-MVE-NEXT:    bx lr
 entry:
   %src3 = fptrunc float %src3o to half
@@ -489,50 +403,33 @@ define arm_aapcs_vfpcc <8 x half> @vfma16_v1_pred(<8 x half> %src1, <8 x half> %
 ; CHECK-MVE:       @ %bb.0: @ %entry
 ; CHECK-MVE-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-MVE-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s14, s4
 ; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s14, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s13, s0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmov.f32 s12, s0
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s4
-; CHECK-MVE-NEXT:    vmla.f16 s12, s4, s8
-; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s14, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s0, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s13, s0
-; CHECK-MVE-NEXT:    it mi
-; CHECK-MVE-NEXT:    movmi r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    cset r2, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s8
+; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmov.f32 s15, s13
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vmla.f16 s15, s14, s12
-; CHECK-MVE-NEXT:    vcmp.f16 s5, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s13, s15
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s13, s15
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vmov.f32 s16, s1
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmla.f16 s16, s5, s9
-; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vmov.f32 s12, s0
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s5
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s1, s16
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vmla.f16 s12, s4, s8
 ; CHECK-MVE-NEXT:    vcmp.f16 s18, #0
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r2
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s0, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s1
@@ -540,71 +437,76 @@ define arm_aapcs_vfpcc <8 x half> @vfma16_v1_pred(<8 x half> %src1, <8 x half> %
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s9
 ; CHECK-MVE-NEXT:    vmov.f32 s22, s20
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vmla.f16 s22, s18, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s6, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s5, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s20, s22
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s6
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmov.f32 s16, s2
-; CHECK-MVE-NEXT:    vmla.f16 s16, s6, s10
+; CHECK-MVE-NEXT:    vmov.f32 s18, s1
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s2, s16
+; CHECK-MVE-NEXT:    vmla.f16 s18, s5, s9
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s1, s18
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s6
 ; CHECK-MVE-NEXT:    vcmp.f16 s18, #0
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s2
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s10
 ; CHECK-MVE-NEXT:    vmov.f32 s22, s20
-; CHECK-MVE-NEXT:    vmla.f16 s22, s18, s16
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vmla.f16 s22, s18, s16
+; CHECK-MVE-NEXT:    vcmp.f16 s6, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s20, s22
-; CHECK-MVE-NEXT:    vcmp.f16 s7, #0
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmov.f32 s16, s3
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
-; CHECK-MVE-NEXT:    vmla.f16 s16, s7, s11
+; CHECK-MVE-NEXT:    vmov.f32 s18, s2
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s3, s16
+; CHECK-MVE-NEXT:    vmla.f16 s18, s6, s10
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s2, s18
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s18, #0
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it mi
+; CHECK-MVE-NEXT:    movmi r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s3
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s11
+; CHECK-MVE-NEXT:    vmov.f32 s22, s20
+; CHECK-MVE-NEXT:    vmla.f16 s22, s18, s16
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vcmp.f16 s7, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s20, s22
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s11
-; CHECK-MVE-NEXT:    vmov.f32 s2, s0
+; CHECK-MVE-NEXT:    vmov.f32 s18, s3
+; CHECK-MVE-NEXT:    vmla.f16 s18, s7, s11
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmla.f16 s2, s4, s8
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s0, s2
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s3, s18
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-MVE-NEXT:    bx lr
@@ -634,50 +536,33 @@ define arm_aapcs_vfpcc <8 x half> @vfma16_v2_pred(<8 x half> %src1, <8 x half> %
 ; CHECK-MVE:       @ %bb.0: @ %entry
 ; CHECK-MVE-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-MVE-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s14, s4
 ; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s14, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s13, s0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmov.f32 s12, s0
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s4
-; CHECK-MVE-NEXT:    vmla.f16 s12, s4, s8
-; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s14, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s0, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s13, s0
-; CHECK-MVE-NEXT:    it mi
-; CHECK-MVE-NEXT:    movmi r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    cset r2, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s8
+; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmov.f32 s15, s13
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vmla.f16 s15, s14, s12
-; CHECK-MVE-NEXT:    vcmp.f16 s5, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s13, s15
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s13, s15
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vmov.f32 s16, s1
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmla.f16 s16, s5, s9
-; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vmov.f32 s12, s0
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s5
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s1, s16
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vmla.f16 s12, s4, s8
 ; CHECK-MVE-NEXT:    vcmp.f16 s18, #0
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r2
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s0, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s1
@@ -685,71 +570,76 @@ define arm_aapcs_vfpcc <8 x half> @vfma16_v2_pred(<8 x half> %src1, <8 x half> %
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s9
 ; CHECK-MVE-NEXT:    vmov.f32 s22, s20
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vmla.f16 s22, s18, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s6, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s5, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s20, s22
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s6
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmov.f32 s16, s2
-; CHECK-MVE-NEXT:    vmla.f16 s16, s6, s10
+; CHECK-MVE-NEXT:    vmov.f32 s18, s1
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s2, s16
+; CHECK-MVE-NEXT:    vmla.f16 s18, s5, s9
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s1, s18
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s6
 ; CHECK-MVE-NEXT:    vcmp.f16 s18, #0
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s2
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s10
 ; CHECK-MVE-NEXT:    vmov.f32 s22, s20
-; CHECK-MVE-NEXT:    vmla.f16 s22, s18, s16
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vmla.f16 s22, s18, s16
+; CHECK-MVE-NEXT:    vcmp.f16 s6, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s20, s22
-; CHECK-MVE-NEXT:    vcmp.f16 s7, #0
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmov.f32 s16, s3
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
-; CHECK-MVE-NEXT:    vmla.f16 s16, s7, s11
+; CHECK-MVE-NEXT:    vmov.f32 s18, s2
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s3, s16
+; CHECK-MVE-NEXT:    vmla.f16 s18, s6, s10
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s2, s18
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s18, #0
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it mi
+; CHECK-MVE-NEXT:    movmi r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s3
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s11
+; CHECK-MVE-NEXT:    vmov.f32 s22, s20
+; CHECK-MVE-NEXT:    vmla.f16 s22, s18, s16
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vcmp.f16 s7, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s20, s22
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s11
-; CHECK-MVE-NEXT:    vmov.f32 s2, s0
+; CHECK-MVE-NEXT:    vmov.f32 s18, s3
+; CHECK-MVE-NEXT:    vmla.f16 s18, s7, s11
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmla.f16 s2, s4, s8
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s0, s2
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s3, s18
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-MVE-NEXT:    bx lr
@@ -779,50 +669,33 @@ define arm_aapcs_vfpcc <8 x half> @vfms16_pred(<8 x half> %src1, <8 x half> %src
 ; CHECK-MVE:       @ %bb.0: @ %entry
 ; CHECK-MVE-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-MVE-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s14, s4
 ; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s14, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s13, s0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vmov.f32 s12, s0
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s14, s4
-; CHECK-MVE-NEXT:    vmls.f16 s12, s4, s8
-; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s14, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s0, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s13, s0
-; CHECK-MVE-NEXT:    it mi
-; CHECK-MVE-NEXT:    movmi r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    cset r2, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s8
+; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmov.f32 s15, s13
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vmls.f16 s15, s14, s12
-; CHECK-MVE-NEXT:    vcmp.f16 s5, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s13, s15
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s13, s15
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vmov.f32 s16, s1
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmls.f16 s16, s5, s9
-; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vmov.f32 s12, s0
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s5
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s1, s16
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vmls.f16 s12, s4, s8
 ; CHECK-MVE-NEXT:    vcmp.f16 s18, #0
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r2
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s0, s12
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s1
@@ -830,71 +703,76 @@ define arm_aapcs_vfpcc <8 x half> @vfms16_pred(<8 x half> %src1, <8 x half> %src
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s9
 ; CHECK-MVE-NEXT:    vmov.f32 s22, s20
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vmls.f16 s22, s18, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s6, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s5, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s20, s22
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s6
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmov.f32 s16, s2
-; CHECK-MVE-NEXT:    vmls.f16 s16, s6, s10
+; CHECK-MVE-NEXT:    vmov.f32 s18, s1
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s2, s16
+; CHECK-MVE-NEXT:    vmls.f16 s18, s5, s9
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s1, s18
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s6
 ; CHECK-MVE-NEXT:    vcmp.f16 s18, #0
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s2
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s10
 ; CHECK-MVE-NEXT:    vmov.f32 s22, s20
-; CHECK-MVE-NEXT:    vmls.f16 s22, s18, s16
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vmls.f16 s22, s18, s16
+; CHECK-MVE-NEXT:    vcmp.f16 s6, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s20, s22
-; CHECK-MVE-NEXT:    vcmp.f16 s7, #0
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmov.f32 s16, s3
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
-; CHECK-MVE-NEXT:    vmls.f16 s16, s7, s11
+; CHECK-MVE-NEXT:    vmov.f32 s18, s2
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s3, s16
+; CHECK-MVE-NEXT:    vmls.f16 s18, s6, s10
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s2, s18
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s18, #0
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it mi
+; CHECK-MVE-NEXT:    movmi r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s3
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s11
+; CHECK-MVE-NEXT:    vmov.f32 s22, s20
+; CHECK-MVE-NEXT:    vmls.f16 s22, s18, s16
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vcmp.f16 s7, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s20, s22
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s11
-; CHECK-MVE-NEXT:    vmov.f32 s2, s0
+; CHECK-MVE-NEXT:    vmov.f32 s18, s3
+; CHECK-MVE-NEXT:    vmls.f16 s18, s7, s11
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmls.f16 s2, s4, s8
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s0, s2
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s3, s18
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-MVE-NEXT:    bx lr
@@ -927,119 +805,107 @@ define arm_aapcs_vfpcc <8 x half> @vfmar16_pred(<8 x half> %src1, <8 x half> %sr
 ;
 ; CHECK-MVE-LABEL: vfmar16_pred:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vcvtb.f16.f32 s12, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s4
+; CHECK-MVE-NEXT:    vcmp.f16 s8, #0
 ; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcvtb.f16.f32 s12, s8
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmov.f32 s8, s0
-; CHECK-MVE-NEXT:    vmla.f16 s8, s4, s12
-; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s8, s0, s8
-; CHECK-MVE-NEXT:    movs r2, #0
-; CHECK-MVE-NEXT:    vmov r1, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s4
-; CHECK-MVE-NEXT:    vcmp.f16 s8, #0
 ; CHECK-MVE-NEXT:    vmovx.f16 s10, s0
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it mi
-; CHECK-MVE-NEXT:    movmi r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
+; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmov.f32 s14, s10
-; CHECK-MVE-NEXT:    cset r2, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vmla.f16 s14, s8, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s5, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s8, s10, s14
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s14
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s8
-; CHECK-MVE-NEXT:    vmov.16 q2[0], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vmov.f32 s14, s1
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmla.f16 s14, s5, s12
+; CHECK-MVE-NEXT:    vmov.f32 s8, s0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmov.16 q2[1], r2
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s1, s14
-; CHECK-MVE-NEXT:    vmovx.f16 s13, s1
-; CHECK-MVE-NEXT:    vmov r1, s14
+; CHECK-MVE-NEXT:    vmla.f16 s8, s4, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s8, s0, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s8, s14
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s5
 ; CHECK-MVE-NEXT:    vcmp.f16 s14, #0
-; CHECK-MVE-NEXT:    vmov.16 q2[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s13, s1
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmov.f32 s15, s13
+; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmla.f16 s15, s14, s12
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vcmp.f16 s5, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s14, s13, s15
-; CHECK-MVE-NEXT:    vcmp.f16 s6, #0
-; CHECK-MVE-NEXT:    vmov r1, s14
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q2[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmov.f32 s13, s1
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmov.f32 s14, s2
-; CHECK-MVE-NEXT:    vmla.f16 s14, s6, s12
+; CHECK-MVE-NEXT:    vmla.f16 s13, s5, s12
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s2, s14
-; CHECK-MVE-NEXT:    vmovx.f16 s13, s2
-; CHECK-MVE-NEXT:    vmov r1, s14
+; CHECK-MVE-NEXT:    mov.w r0, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s9, s1, s13
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s9, s14
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s6
 ; CHECK-MVE-NEXT:    vcmp.f16 s14, #0
-; CHECK-MVE-NEXT:    vmov.16 q2[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s13, s2
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmov.f32 s15, s13
+; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmla.f16 s15, s14, s12
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vcmp.f16 s6, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s14, s13, s15
-; CHECK-MVE-NEXT:    vcmp.f16 s7, #0
-; CHECK-MVE-NEXT:    vmov r1, s14
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q2[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmov.f32 s13, s2
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmov.f32 s14, s3
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
-; CHECK-MVE-NEXT:    vmla.f16 s14, s7, s12
+; CHECK-MVE-NEXT:    vmla.f16 s13, s6, s12
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s3, s14
-; CHECK-MVE-NEXT:    movs r0, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s10, s2, s13
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s10, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s14, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s14, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s13, s3
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it mi
+; CHECK-MVE-NEXT:    movmi r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vmov.f32 s15, s13
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmla.f16 s15, s14, s12
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vcmp.f16 s7, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s13, s15
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov.f32 s2, s0
-; CHECK-MVE-NEXT:    vmla.f16 s2, s4, s12
+; CHECK-MVE-NEXT:    vmov.f32 s13, s3
+; CHECK-MVE-NEXT:    vmla.f16 s13, s7, s12
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov r1, s14
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s0, s2
-; CHECK-MVE-NEXT:    vmov.16 q2[6], r1
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q2[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s11, s3, s13
+; CHECK-MVE-NEXT:    vins.f16 s11, s14
 ; CHECK-MVE-NEXT:    vmov q0, q2
 ; CHECK-MVE-NEXT:    bx lr
 entry:
@@ -1073,118 +939,106 @@ define arm_aapcs_vfpcc <8 x half> @vfma16_pred(<8 x half> %src1, <8 x half> %src
 ;
 ; CHECK-MVE-LABEL: vfma16_pred:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s10, s4
 ; CHECK-MVE-NEXT:    movs r1, #0
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    vcmp.f16 s10, #0
 ; CHECK-MVE-NEXT:    vcvtb.f16.f32 s12, s8
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s14, s0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmov.f32 s8, s12
-; CHECK-MVE-NEXT:    vmla.f16 s8, s0, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s8, s0, s8
-; CHECK-MVE-NEXT:    movs r2, #0
-; CHECK-MVE-NEXT:    vmov r1, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s4
-; CHECK-MVE-NEXT:    vcmp.f16 s8, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s10, s0
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it mi
-; CHECK-MVE-NEXT:    movmi r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    vmov.f32 s14, s12
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmla.f16 s14, s10, s8
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s5, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s8, s10, s14
+; CHECK-MVE-NEXT:    vmla.f16 s8, s14, s10
+; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s14, s8
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s8
-; CHECK-MVE-NEXT:    vmov.16 q2[0], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    vmov.f32 s14, s12
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmla.f16 s14, s1, s5
+; CHECK-MVE-NEXT:    vmov.f32 s8, s12
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmov.16 q2[1], r2
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s1, s14
-; CHECK-MVE-NEXT:    vmovx.f16 s13, s1
-; CHECK-MVE-NEXT:    vmov r1, s14
+; CHECK-MVE-NEXT:    vmla.f16 s8, s0, s4
+; CHECK-MVE-NEXT:    vseleq.f16 s8, s0, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s8, s14
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s5
 ; CHECK-MVE-NEXT:    vcmp.f16 s14, #0
-; CHECK-MVE-NEXT:    vmov.16 q2[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s13, s1
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmov.f32 s15, s12
+; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmla.f16 s15, s13, s14
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vcmp.f16 s5, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s14, s13, s15
-; CHECK-MVE-NEXT:    vcmp.f16 s6, #0
-; CHECK-MVE-NEXT:    vmov r1, s14
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q2[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmov.f32 s13, s12
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmov.f32 s14, s12
-; CHECK-MVE-NEXT:    vmla.f16 s14, s2, s6
+; CHECK-MVE-NEXT:    vmla.f16 s13, s1, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s2, s14
-; CHECK-MVE-NEXT:    vmovx.f16 s13, s2
-; CHECK-MVE-NEXT:    vmov r1, s14
+; CHECK-MVE-NEXT:    vmov.f32 s15, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s9, s1, s13
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s9, s14
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s6
 ; CHECK-MVE-NEXT:    vcmp.f16 s14, #0
-; CHECK-MVE-NEXT:    vmov.16 q2[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s13, s2
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmov.f32 s15, s12
 ; CHECK-MVE-NEXT:    vmla.f16 s15, s13, s14
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s6, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s14, s13, s15
-; CHECK-MVE-NEXT:    vcmp.f16 s7, #0
-; CHECK-MVE-NEXT:    vmov r1, s14
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q2[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmov.f32 s13, s12
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmov.f32 s14, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
-; CHECK-MVE-NEXT:    vmla.f16 s14, s3, s7
+; CHECK-MVE-NEXT:    vmla.f16 s13, s2, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s14, s3, s14
-; CHECK-MVE-NEXT:    movs r0, #0
+; CHECK-MVE-NEXT:    vmov.f32 s15, s12
+; CHECK-MVE-NEXT:    vseleq.f16 s10, s2, s13
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s10, s14
+; CHECK-MVE-NEXT:    vmovx.f16 s14, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s14, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s13, s3
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it mi
+; CHECK-MVE-NEXT:    movmi r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vmla.f16 s15, s13, s14
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s7, #0
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vmla.f16 s12, s3, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s13, s15
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmla.f16 s12, s0, s4
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov r1, s14
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s0, s12
-; CHECK-MVE-NEXT:    vmov.16 q2[6], r1
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q2[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s11, s3, s12
+; CHECK-MVE-NEXT:    vins.f16 s11, s14
 ; CHECK-MVE-NEXT:    vmov q0, q2
 ; CHECK-MVE-NEXT:    bx lr
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-fmath.ll b/llvm/test/CodeGen/Thumb2/mve-fmath.ll
index 27a4ff75c980..685a966eb3ed 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fmath.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fmath.ll
@@ -19,35 +19,23 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @sqrt_float16_t(<8 x half> %src) {
 ; CHECK-LABEL: sqrt_float16_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vsqrt.f16 s8, s1
-; CHECK-NEXT:    vsqrt.f16 s4, s4
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vsqrt.f16 s4, s0
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vmovx.f16 s0, s3
-; CHECK-NEXT:    vmov.16 q1[0], r1
-; CHECK-NEXT:    vsqrt.f16 s0, s0
-; CHECK-NEXT:    vmov.16 q1[1], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmovx.f16 s8, s1
-; CHECK-NEXT:    vmov.16 q1[2], r0
+; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vmovx.f16 s0, s4
+; CHECK-NEXT:    vsqrt.f16 s8, s0
+; CHECK-NEXT:    vsqrt.f16 s0, s4
+; CHECK-NEXT:    vins.f16 s0, s8
+; CHECK-NEXT:    vmovx.f16 s8, s5
 ; CHECK-NEXT:    vsqrt.f16 s8, s8
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vsqrt.f16 s8, s2
-; CHECK-NEXT:    vmov.16 q1[3], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmovx.f16 s8, s2
-; CHECK-NEXT:    vmov.16 q1[4], r0
+; CHECK-NEXT:    vsqrt.f16 s1, s5
+; CHECK-NEXT:    vins.f16 s1, s8
+; CHECK-NEXT:    vmovx.f16 s8, s6
 ; CHECK-NEXT:    vsqrt.f16 s8, s8
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vsqrt.f16 s8, s3
-; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q1[6], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q1[7], r0
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vsqrt.f16 s2, s6
+; CHECK-NEXT:    vins.f16 s2, s8
+; CHECK-NEXT:    vmovx.f16 s8, s7
+; CHECK-NEXT:    vsqrt.f16 s8, s8
+; CHECK-NEXT:    vsqrt.f16 s3, s7
+; CHECK-NEXT:    vins.f16 s3, s8
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = call fast <8 x half> @llvm.sqrt.v8f16(<8 x half> %src)
@@ -1037,93 +1025,81 @@ define arm_aapcs_vfpcc <8 x half> @copysign_float16_t(<8 x half> %src1, <8 x hal
 ; CHECK-NEXT:    .pad #32
 ; CHECK-NEXT:    sub sp, #32
 ; CHECK-NEXT:    vmovx.f16 s8, s4
-; CHECK-NEXT:    vstr.16 s4, [sp, #28]
 ; CHECK-NEXT:    vstr.16 s8, [sp, #24]
 ; CHECK-NEXT:    vmovx.f16 s8, s5
-; CHECK-NEXT:    vstr.16 s5, [sp, #20]
+; CHECK-NEXT:    vstr.16 s4, [sp, #28]
 ; CHECK-NEXT:    vstr.16 s8, [sp, #16]
 ; CHECK-NEXT:    vmovx.f16 s8, s6
-; CHECK-NEXT:    vmovx.f16 s4, s7
-; CHECK-NEXT:    vstr.16 s6, [sp, #12]
+; CHECK-NEXT:    vstr.16 s5, [sp, #20]
 ; CHECK-NEXT:    vstr.16 s8, [sp, #8]
+; CHECK-NEXT:    vmovx.f16 s8, s7
+; CHECK-NEXT:    vstr.16 s6, [sp, #12]
+; CHECK-NEXT:    vstr.16 s8, [sp]
 ; CHECK-NEXT:    vstr.16 s7, [sp, #4]
-; CHECK-NEXT:    vstr.16 s4, [sp]
-; CHECK-NEXT:    ldrb.w r0, [sp, #29]
-; CHECK-NEXT:    vabs.f16 s4, s0
+; CHECK-NEXT:    vmovx.f16 s4, s0
+; CHECK-NEXT:    ldrb.w r0, [sp, #25]
+; CHECK-NEXT:    vabs.f16 s4, s4
 ; CHECK-NEXT:    vneg.f16 s6, s4
-; CHECK-NEXT:    ldrb.w r1, [sp, #25]
 ; CHECK-NEXT:    tst.w r0, #128
-; CHECK-NEXT:    vabs.f16 s8, s1
 ; CHECK-NEXT:    cset r0, ne
-; CHECK-NEXT:    vneg.f16 s10, s8
 ; CHECK-NEXT:    lsls r0, r0, #31
-; CHECK-NEXT:    vseleq.f16 s4, s4, s6
-; CHECK-NEXT:    tst.w r1, #128
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    cset r1, ne
-; CHECK-NEXT:    vabs.f16 s4, s4
-; CHECK-NEXT:    vneg.f16 s6, s4
-; CHECK-NEXT:    lsls r1, r1, #31
-; CHECK-NEXT:    vseleq.f16 s4, s4, s6
-; CHECK-NEXT:    vmovx.f16 s0, s3
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vmov.16 q1[0], r0
-; CHECK-NEXT:    ldrb.w r0, [sp, #21]
-; CHECK-NEXT:    vmov.16 q1[1], r1
-; CHECK-NEXT:    vabs.f16 s0, s0
+; CHECK-NEXT:    ldrb.w r0, [sp, #29]
+; CHECK-NEXT:    vseleq.f16 s8, s4, s6
+; CHECK-NEXT:    vabs.f16 s4, s0
+; CHECK-NEXT:    vabs.f16 s0, s3
 ; CHECK-NEXT:    tst.w r0, #128
+; CHECK-NEXT:    vneg.f16 s6, s4
 ; CHECK-NEXT:    cset r0, ne
 ; CHECK-NEXT:    lsls r0, r0, #31
-; CHECK-NEXT:    vseleq.f16 s8, s8, s10
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmovx.f16 s8, s1
-; CHECK-NEXT:    vmov.16 q1[2], r0
 ; CHECK-NEXT:    ldrb.w r0, [sp, #17]
-; CHECK-NEXT:    vabs.f16 s8, s8
+; CHECK-NEXT:    vseleq.f16 s4, s4, s6
+; CHECK-NEXT:    vins.f16 s4, s8
+; CHECK-NEXT:    vmovx.f16 s8, s1
 ; CHECK-NEXT:    tst.w r0, #128
-; CHECK-NEXT:    vneg.f16 s10, s8
+; CHECK-NEXT:    vabs.f16 s8, s8
 ; CHECK-NEXT:    cset r0, ne
+; CHECK-NEXT:    vneg.f16 s10, s8
 ; CHECK-NEXT:    lsls r0, r0, #31
+; CHECK-NEXT:    ldrb.w r0, [sp, #21]
 ; CHECK-NEXT:    vseleq.f16 s8, s8, s10
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vabs.f16 s8, s2
-; CHECK-NEXT:    vmov.16 q1[3], r0
-; CHECK-NEXT:    ldrb.w r0, [sp, #13]
-; CHECK-NEXT:    vneg.f16 s10, s8
+; CHECK-NEXT:    vabs.f16 s10, s1
 ; CHECK-NEXT:    tst.w r0, #128
+; CHECK-NEXT:    vneg.f16 s12, s10
 ; CHECK-NEXT:    cset r0, ne
 ; CHECK-NEXT:    lsls r0, r0, #31
-; CHECK-NEXT:    vseleq.f16 s8, s8, s10
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmovx.f16 s8, s2
-; CHECK-NEXT:    vmov.16 q1[4], r0
 ; CHECK-NEXT:    ldrb.w r0, [sp, #9]
+; CHECK-NEXT:    vseleq.f16 s5, s10, s12
+; CHECK-NEXT:    vins.f16 s5, s8
+; CHECK-NEXT:    vmovx.f16 s8, s2
+; CHECK-NEXT:    tst.w r0, #128
 ; CHECK-NEXT:    vabs.f16 s8, s8
+; CHECK-NEXT:    cset r0, ne
+; CHECK-NEXT:    vneg.f16 s10, s8
+; CHECK-NEXT:    lsls r0, r0, #31
+; CHECK-NEXT:    ldrb.w r0, [sp, #13]
+; CHECK-NEXT:    vseleq.f16 s8, s8, s10
+; CHECK-NEXT:    vabs.f16 s10, s2
 ; CHECK-NEXT:    vneg.f16 s2, s0
 ; CHECK-NEXT:    tst.w r0, #128
-; CHECK-NEXT:    vneg.f16 s10, s8
+; CHECK-NEXT:    vneg.f16 s12, s10
 ; CHECK-NEXT:    cset r0, ne
 ; CHECK-NEXT:    lsls r0, r0, #31
-; CHECK-NEXT:    vseleq.f16 s8, s8, s10
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vabs.f16 s8, s3
-; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    ldrb.w r0, [sp, #5]
-; CHECK-NEXT:    vneg.f16 s10, s8
+; CHECK-NEXT:    ldrb.w r0, [sp, #1]
+; CHECK-NEXT:    vseleq.f16 s6, s10, s12
+; CHECK-NEXT:    vins.f16 s6, s8
+; CHECK-NEXT:    vmovx.f16 s8, s3
 ; CHECK-NEXT:    tst.w r0, #128
+; CHECK-NEXT:    vabs.f16 s8, s8
 ; CHECK-NEXT:    cset r0, ne
+; CHECK-NEXT:    vneg.f16 s10, s8
 ; CHECK-NEXT:    lsls r0, r0, #31
+; CHECK-NEXT:    ldrb.w r0, [sp, #5]
 ; CHECK-NEXT:    vseleq.f16 s8, s8, s10
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q1[6], r0
-; CHECK-NEXT:    ldrb.w r0, [sp, #1]
 ; CHECK-NEXT:    tst.w r0, #128
 ; CHECK-NEXT:    cset r0, ne
 ; CHECK-NEXT:    lsls r0, r0, #31
-; CHECK-NEXT:    vseleq.f16 s0, s0, s2
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q1[7], r0
+; CHECK-NEXT:    vseleq.f16 s7, s0, s2
+; CHECK-NEXT:    vins.f16 s7, s8
 ; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    bx lr

diff  --git a/llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll b/llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll
index f88242fb108c..1ec651ac06de 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fp-negabs.ll
@@ -5,35 +5,23 @@
 define arm_aapcs_vfpcc <8 x half> @fneg_float16_t(<8 x half> %src) {
 ; CHECK-MVE-LABEL: fneg_float16_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s0
-; CHECK-MVE-NEXT:    vneg.f16 s8, s1
-; CHECK-MVE-NEXT:    vneg.f16 s4, s4
-; CHECK-MVE-NEXT:    vmov r0, s4
-; CHECK-MVE-NEXT:    vneg.f16 s4, s0
-; CHECK-MVE-NEXT:    vmov r1, s4
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q1[0], r1
-; CHECK-MVE-NEXT:    vneg.f16 s0, s0
-; CHECK-MVE-NEXT:    vmov.16 q1[1], r0
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s1
-; CHECK-MVE-NEXT:    vmov.16 q1[2], r0
+; CHECK-MVE-NEXT:    vmov q1, q0
+; CHECK-MVE-NEXT:    vmovx.f16 s0, s4
+; CHECK-MVE-NEXT:    vneg.f16 s8, s0
+; CHECK-MVE-NEXT:    vneg.f16 s0, s4
+; CHECK-MVE-NEXT:    vins.f16 s0, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
 ; CHECK-MVE-NEXT:    vneg.f16 s8, s8
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vneg.f16 s8, s2
-; CHECK-MVE-NEXT:    vmov.16 q1[3], r0
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s2
-; CHECK-MVE-NEXT:    vmov.16 q1[4], r0
+; CHECK-MVE-NEXT:    vneg.f16 s1, s5
+; CHECK-MVE-NEXT:    vins.f16 s1, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
 ; CHECK-MVE-NEXT:    vneg.f16 s8, s8
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vneg.f16 s8, s3
-; CHECK-MVE-NEXT:    vmov.16 q1[5], r0
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmov.16 q1[6], r0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q1[7], r0
-; CHECK-MVE-NEXT:    vmov q0, q1
+; CHECK-MVE-NEXT:    vneg.f16 s2, s6
+; CHECK-MVE-NEXT:    vins.f16 s2, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s7
+; CHECK-MVE-NEXT:    vneg.f16 s8, s8
+; CHECK-MVE-NEXT:    vneg.f16 s3, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s8
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: fneg_float16_t:
@@ -89,35 +77,23 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @fabs_float16_t(<8 x half> %src) {
 ; CHECK-MVE-LABEL: fabs_float16_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s0
-; CHECK-MVE-NEXT:    vabs.f16 s8, s1
-; CHECK-MVE-NEXT:    vabs.f16 s4, s4
-; CHECK-MVE-NEXT:    vmov r0, s4
-; CHECK-MVE-NEXT:    vabs.f16 s4, s0
-; CHECK-MVE-NEXT:    vmov r1, s4
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q1[0], r1
-; CHECK-MVE-NEXT:    vabs.f16 s0, s0
-; CHECK-MVE-NEXT:    vmov.16 q1[1], r0
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s1
-; CHECK-MVE-NEXT:    vmov.16 q1[2], r0
+; CHECK-MVE-NEXT:    vmov q1, q0
+; CHECK-MVE-NEXT:    vmovx.f16 s0, s4
+; CHECK-MVE-NEXT:    vabs.f16 s8, s0
+; CHECK-MVE-NEXT:    vabs.f16 s0, s4
+; CHECK-MVE-NEXT:    vins.f16 s0, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
 ; CHECK-MVE-NEXT:    vabs.f16 s8, s8
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vabs.f16 s8, s2
-; CHECK-MVE-NEXT:    vmov.16 q1[3], r0
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s2
-; CHECK-MVE-NEXT:    vmov.16 q1[4], r0
+; CHECK-MVE-NEXT:    vabs.f16 s1, s5
+; CHECK-MVE-NEXT:    vins.f16 s1, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
 ; CHECK-MVE-NEXT:    vabs.f16 s8, s8
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vabs.f16 s8, s3
-; CHECK-MVE-NEXT:    vmov.16 q1[5], r0
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmov.16 q1[6], r0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q1[7], r0
-; CHECK-MVE-NEXT:    vmov q0, q1
+; CHECK-MVE-NEXT:    vabs.f16 s2, s6
+; CHECK-MVE-NEXT:    vins.f16 s2, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s7
+; CHECK-MVE-NEXT:    vabs.f16 s8, s8
+; CHECK-MVE-NEXT:    vabs.f16 s3, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s8
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: fabs_float16_t:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-frint.ll b/llvm/test/CodeGen/Thumb2/mve-frint.ll
index 410c4f18bd00..8cfc1418d4d9 100644
--- a/llvm/test/CodeGen/Thumb2/mve-frint.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-frint.ll
@@ -24,35 +24,23 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @fceil_float16_t(<8 x half> %src) {
 ; CHECK-MVE-LABEL: fceil_float16_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s0
-; CHECK-MVE-NEXT:    vrintp.f16 s8, s1
-; CHECK-MVE-NEXT:    vrintp.f16 s4, s4
-; CHECK-MVE-NEXT:    vmov r0, s4
-; CHECK-MVE-NEXT:    vrintp.f16 s4, s0
-; CHECK-MVE-NEXT:    vmov r1, s4
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q1[0], r1
-; CHECK-MVE-NEXT:    vrintp.f16 s0, s0
-; CHECK-MVE-NEXT:    vmov.16 q1[1], r0
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s1
-; CHECK-MVE-NEXT:    vmov.16 q1[2], r0
+; CHECK-MVE-NEXT:    vmov q1, q0
+; CHECK-MVE-NEXT:    vmovx.f16 s0, s4
+; CHECK-MVE-NEXT:    vrintp.f16 s8, s0
+; CHECK-MVE-NEXT:    vrintp.f16 s0, s4
+; CHECK-MVE-NEXT:    vins.f16 s0, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
 ; CHECK-MVE-NEXT:    vrintp.f16 s8, s8
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vrintp.f16 s8, s2
-; CHECK-MVE-NEXT:    vmov.16 q1[3], r0
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s2
-; CHECK-MVE-NEXT:    vmov.16 q1[4], r0
+; CHECK-MVE-NEXT:    vrintp.f16 s1, s5
+; CHECK-MVE-NEXT:    vins.f16 s1, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
 ; CHECK-MVE-NEXT:    vrintp.f16 s8, s8
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vrintp.f16 s8, s3
-; CHECK-MVE-NEXT:    vmov.16 q1[5], r0
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmov.16 q1[6], r0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q1[7], r0
-; CHECK-MVE-NEXT:    vmov q0, q1
+; CHECK-MVE-NEXT:    vrintp.f16 s2, s6
+; CHECK-MVE-NEXT:    vins.f16 s2, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s7
+; CHECK-MVE-NEXT:    vrintp.f16 s8, s8
+; CHECK-MVE-NEXT:    vrintp.f16 s3, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s8
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: fceil_float16_t:
@@ -110,35 +98,23 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @ftrunc_float16_t(<8 x half> %src) {
 ; CHECK-MVE-LABEL: ftrunc_float16_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s0
-; CHECK-MVE-NEXT:    vrintz.f16 s8, s1
-; CHECK-MVE-NEXT:    vrintz.f16 s4, s4
-; CHECK-MVE-NEXT:    vmov r0, s4
-; CHECK-MVE-NEXT:    vrintz.f16 s4, s0
-; CHECK-MVE-NEXT:    vmov r1, s4
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q1[0], r1
-; CHECK-MVE-NEXT:    vrintz.f16 s0, s0
-; CHECK-MVE-NEXT:    vmov.16 q1[1], r0
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s1
-; CHECK-MVE-NEXT:    vmov.16 q1[2], r0
+; CHECK-MVE-NEXT:    vmov q1, q0
+; CHECK-MVE-NEXT:    vmovx.f16 s0, s4
+; CHECK-MVE-NEXT:    vrintz.f16 s8, s0
+; CHECK-MVE-NEXT:    vrintz.f16 s0, s4
+; CHECK-MVE-NEXT:    vins.f16 s0, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
 ; CHECK-MVE-NEXT:    vrintz.f16 s8, s8
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vrintz.f16 s8, s2
-; CHECK-MVE-NEXT:    vmov.16 q1[3], r0
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s2
-; CHECK-MVE-NEXT:    vmov.16 q1[4], r0
+; CHECK-MVE-NEXT:    vrintz.f16 s1, s5
+; CHECK-MVE-NEXT:    vins.f16 s1, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
 ; CHECK-MVE-NEXT:    vrintz.f16 s8, s8
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vrintz.f16 s8, s3
-; CHECK-MVE-NEXT:    vmov.16 q1[5], r0
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmov.16 q1[6], r0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q1[7], r0
-; CHECK-MVE-NEXT:    vmov q0, q1
+; CHECK-MVE-NEXT:    vrintz.f16 s2, s6
+; CHECK-MVE-NEXT:    vins.f16 s2, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s7
+; CHECK-MVE-NEXT:    vrintz.f16 s8, s8
+; CHECK-MVE-NEXT:    vrintz.f16 s3, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s8
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: ftrunc_float16_t:
@@ -196,35 +172,23 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @frint_float16_t(<8 x half> %src) {
 ; CHECK-MVE-LABEL: frint_float16_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s0
-; CHECK-MVE-NEXT:    vrintx.f16 s8, s1
-; CHECK-MVE-NEXT:    vrintx.f16 s4, s4
-; CHECK-MVE-NEXT:    vmov r0, s4
-; CHECK-MVE-NEXT:    vrintx.f16 s4, s0
-; CHECK-MVE-NEXT:    vmov r1, s4
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q1[0], r1
-; CHECK-MVE-NEXT:    vrintx.f16 s0, s0
-; CHECK-MVE-NEXT:    vmov.16 q1[1], r0
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s1
-; CHECK-MVE-NEXT:    vmov.16 q1[2], r0
+; CHECK-MVE-NEXT:    vmov q1, q0
+; CHECK-MVE-NEXT:    vmovx.f16 s0, s4
+; CHECK-MVE-NEXT:    vrintx.f16 s8, s0
+; CHECK-MVE-NEXT:    vrintx.f16 s0, s4
+; CHECK-MVE-NEXT:    vins.f16 s0, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
 ; CHECK-MVE-NEXT:    vrintx.f16 s8, s8
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vrintx.f16 s8, s2
-; CHECK-MVE-NEXT:    vmov.16 q1[3], r0
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s2
-; CHECK-MVE-NEXT:    vmov.16 q1[4], r0
+; CHECK-MVE-NEXT:    vrintx.f16 s1, s5
+; CHECK-MVE-NEXT:    vins.f16 s1, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
 ; CHECK-MVE-NEXT:    vrintx.f16 s8, s8
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vrintx.f16 s8, s3
-; CHECK-MVE-NEXT:    vmov.16 q1[5], r0
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmov.16 q1[6], r0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q1[7], r0
-; CHECK-MVE-NEXT:    vmov q0, q1
+; CHECK-MVE-NEXT:    vrintx.f16 s2, s6
+; CHECK-MVE-NEXT:    vins.f16 s2, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s7
+; CHECK-MVE-NEXT:    vrintx.f16 s8, s8
+; CHECK-MVE-NEXT:    vrintx.f16 s3, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s8
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: frint_float16_t:
@@ -277,35 +241,23 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @fnearbyint_float16_t(<8 x half> %src) {
 ; CHECK-LABEL: fnearbyint_float16_t:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vrintr.f16 s8, s1
-; CHECK-NEXT:    vrintr.f16 s4, s4
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vrintr.f16 s4, s0
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vmovx.f16 s0, s3
-; CHECK-NEXT:    vmov.16 q1[0], r1
-; CHECK-NEXT:    vrintr.f16 s0, s0
-; CHECK-NEXT:    vmov.16 q1[1], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmovx.f16 s8, s1
-; CHECK-NEXT:    vmov.16 q1[2], r0
+; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vmovx.f16 s0, s4
+; CHECK-NEXT:    vrintr.f16 s8, s0
+; CHECK-NEXT:    vrintr.f16 s0, s4
+; CHECK-NEXT:    vins.f16 s0, s8
+; CHECK-NEXT:    vmovx.f16 s8, s5
 ; CHECK-NEXT:    vrintr.f16 s8, s8
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vrintr.f16 s8, s2
-; CHECK-NEXT:    vmov.16 q1[3], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmovx.f16 s8, s2
-; CHECK-NEXT:    vmov.16 q1[4], r0
+; CHECK-NEXT:    vrintr.f16 s1, s5
+; CHECK-NEXT:    vins.f16 s1, s8
+; CHECK-NEXT:    vmovx.f16 s8, s6
 ; CHECK-NEXT:    vrintr.f16 s8, s8
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vrintr.f16 s8, s3
-; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q1[6], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q1[7], r0
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vrintr.f16 s2, s6
+; CHECK-NEXT:    vins.f16 s2, s8
+; CHECK-NEXT:    vmovx.f16 s8, s7
+; CHECK-NEXT:    vrintr.f16 s8, s8
+; CHECK-NEXT:    vrintr.f16 s3, s7
+; CHECK-NEXT:    vins.f16 s3, s8
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = call fast <8 x half> @llvm.nearbyint.v8f16(<8 x half> %src)
@@ -358,35 +310,23 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @ffloor_float16_t(<8 x half> %src) {
 ; CHECK-MVE-LABEL: ffloor_float16_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s0
-; CHECK-MVE-NEXT:    vrintm.f16 s8, s1
-; CHECK-MVE-NEXT:    vrintm.f16 s4, s4
-; CHECK-MVE-NEXT:    vmov r0, s4
-; CHECK-MVE-NEXT:    vrintm.f16 s4, s0
-; CHECK-MVE-NEXT:    vmov r1, s4
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q1[0], r1
-; CHECK-MVE-NEXT:    vrintm.f16 s0, s0
-; CHECK-MVE-NEXT:    vmov.16 q1[1], r0
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s1
-; CHECK-MVE-NEXT:    vmov.16 q1[2], r0
+; CHECK-MVE-NEXT:    vmov q1, q0
+; CHECK-MVE-NEXT:    vmovx.f16 s0, s4
+; CHECK-MVE-NEXT:    vrintm.f16 s8, s0
+; CHECK-MVE-NEXT:    vrintm.f16 s0, s4
+; CHECK-MVE-NEXT:    vins.f16 s0, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
 ; CHECK-MVE-NEXT:    vrintm.f16 s8, s8
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vrintm.f16 s8, s2
-; CHECK-MVE-NEXT:    vmov.16 q1[3], r0
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s2
-; CHECK-MVE-NEXT:    vmov.16 q1[4], r0
+; CHECK-MVE-NEXT:    vrintm.f16 s1, s5
+; CHECK-MVE-NEXT:    vins.f16 s1, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
 ; CHECK-MVE-NEXT:    vrintm.f16 s8, s8
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vrintm.f16 s8, s3
-; CHECK-MVE-NEXT:    vmov.16 q1[5], r0
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmov.16 q1[6], r0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q1[7], r0
-; CHECK-MVE-NEXT:    vmov q0, q1
+; CHECK-MVE-NEXT:    vrintm.f16 s2, s6
+; CHECK-MVE-NEXT:    vins.f16 s2, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s7
+; CHECK-MVE-NEXT:    vrintm.f16 s8, s8
+; CHECK-MVE-NEXT:    vrintm.f16 s3, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s8
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: ffloor_float16_t:
@@ -444,35 +384,23 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @fround_float16_t(<8 x half> %src) {
 ; CHECK-MVE-LABEL: fround_float16_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s0
-; CHECK-MVE-NEXT:    vrinta.f16 s8, s1
-; CHECK-MVE-NEXT:    vrinta.f16 s4, s4
-; CHECK-MVE-NEXT:    vmov r0, s4
-; CHECK-MVE-NEXT:    vrinta.f16 s4, s0
-; CHECK-MVE-NEXT:    vmov r1, s4
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q1[0], r1
-; CHECK-MVE-NEXT:    vrinta.f16 s0, s0
-; CHECK-MVE-NEXT:    vmov.16 q1[1], r0
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s1
-; CHECK-MVE-NEXT:    vmov.16 q1[2], r0
+; CHECK-MVE-NEXT:    vmov q1, q0
+; CHECK-MVE-NEXT:    vmovx.f16 s0, s4
+; CHECK-MVE-NEXT:    vrinta.f16 s8, s0
+; CHECK-MVE-NEXT:    vrinta.f16 s0, s4
+; CHECK-MVE-NEXT:    vins.f16 s0, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s5
 ; CHECK-MVE-NEXT:    vrinta.f16 s8, s8
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vrinta.f16 s8, s2
-; CHECK-MVE-NEXT:    vmov.16 q1[3], r0
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s2
-; CHECK-MVE-NEXT:    vmov.16 q1[4], r0
+; CHECK-MVE-NEXT:    vrinta.f16 s1, s5
+; CHECK-MVE-NEXT:    vins.f16 s1, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s6
 ; CHECK-MVE-NEXT:    vrinta.f16 s8, s8
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vrinta.f16 s8, s3
-; CHECK-MVE-NEXT:    vmov.16 q1[5], r0
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmov.16 q1[6], r0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q1[7], r0
-; CHECK-MVE-NEXT:    vmov q0, q1
+; CHECK-MVE-NEXT:    vrinta.f16 s2, s6
+; CHECK-MVE-NEXT:    vins.f16 s2, s8
+; CHECK-MVE-NEXT:    vmovx.f16 s8, s7
+; CHECK-MVE-NEXT:    vrinta.f16 s8, s8
+; CHECK-MVE-NEXT:    vrinta.f16 s3, s7
+; CHECK-MVE-NEXT:    vins.f16 s3, s8
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: fround_float16_t:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
index d9b76d5baca5..f416aac644fb 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
@@ -95,40 +95,28 @@ define arm_aapcs_vfpcc <8 x half> @scaled_v8f16_sext(i16* %base, <8 x i16>* %off
 ; CHECK-NEXT:    vshl.i32 q0, q0, #1
 ; CHECK-NEXT:    vadd.i32 q1, q0, r0
 ; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    vldr.16 s0, [r2]
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vldr.16 s0, [r3]
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov.16 q0[0], r3
-; CHECK-NEXT:    vmov.16 q0[1], r2
-; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    vldr.16 s8, [r2]
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov.16 q0[2], r2
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    vldr.16 s4, [r2]
 ; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vldr.16 s0, [r2]
+; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    vins.f16 s0, s8
+; CHECK-NEXT:    vldr.16 s8, [r2]
+; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
-; CHECK-NEXT:    vmov.16 q0[3], r2
+; CHECK-NEXT:    vldr.16 s1, [r2]
 ; CHECK-NEXT:    vshl.i32 q1, q1, #1
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vldr.16 s8, [r0]
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vins.f16 s1, s8
 ; CHECK-NEXT:    vmov r0, s5
 ; CHECK-NEXT:    vldr.16 s8, [r0]
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vldr.16 s8, [r0]
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vldr.16 s4, [r0]
 ; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vldr.16 s2, [r0]
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vins.f16 s2, s8
+; CHECK-NEXT:    vldr.16 s8, [r0]
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vldr.16 s3, [r0]
+; CHECK-NEXT:    vins.f16 s3, s8
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <8 x i16>, <8 x i16>* %offptr, align 2

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
index dcf442716d92..cbb234176fda 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
@@ -376,38 +376,26 @@ define arm_aapcs_vfpcc <8 x half> @ptr_f16(<8 x half*>* %offptr) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vldr.16 s0, [r1]
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vldr.16 s0, [r2]
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov.16 q0[0], r2
-; CHECK-NEXT:    vmov.16 q0[1], r1
-; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    vldr.16 s8, [r1]
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov.16 q0[2], r1
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    vldr.16 s4, [r1]
 ; CHECK-NEXT:    vmov r1, s4
+; CHECK-NEXT:    vldr.16 s0, [r1]
+; CHECK-NEXT:    vmov r1, s7
+; CHECK-NEXT:    vins.f16 s0, s8
+; CHECK-NEXT:    vldr.16 s8, [r1]
+; CHECK-NEXT:    vmov r1, s6
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov.16 q0[3], r1
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vldr.16 s8, [r0]
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vldr.16 s1, [r1]
 ; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vins.f16 s1, s8
 ; CHECK-NEXT:    vldr.16 s8, [r0]
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vldr.16 s8, [r0]
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vldr.16 s4, [r0]
 ; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vldr.16 s2, [r0]
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vins.f16 s2, s8
+; CHECK-NEXT:    vldr.16 s8, [r0]
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vldr.16 s3, [r0]
+; CHECK-NEXT:    vins.f16 s3, s8
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <8 x half*>, <8 x half*>* %offptr, align 4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
index c7eed2631c32..b5f9d6676532 100644
--- a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
@@ -823,85 +823,45 @@ define void @foo_v4f32_v4f16(<4 x float> *%dest, <4 x i16> *%mask, <4 x half> *%
 ; CHECK-NEXT:    .pad #8
 ; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    vldrh.s32 q0, [r1]
-; CHECK-NEXT:    mov.w lr, #0
-; CHECK-NEXT:    @ implicit-def: $q1
 ; CHECK-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-NEXT:    vmrs r3, p0
-; CHECK-NEXT:    and r1, r3, #1
+; CHECK-NEXT:    @ implicit-def: $q0
+; CHECK-NEXT:    vmrs lr, p0
+; CHECK-NEXT:    and r1, lr, #1
+; CHECK-NEXT:    ubfx r3, lr, #4, #1
 ; CHECK-NEXT:    rsb.w r12, r1, #0
-; CHECK-NEXT:    ubfx r1, r3, #4, #1
-; CHECK-NEXT:    bfi lr, r12, #0, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi lr, r1, #1, #1
-; CHECK-NEXT:    ubfx r1, r3, #8, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi lr, r1, #2, #1
-; CHECK-NEXT:    ubfx r1, r3, #12, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi lr, r1, #3, #1
-; CHECK-NEXT:    lsls.w r1, lr, #31
-; CHECK-NEXT:    beq .LBB18_2
-; CHECK-NEXT:  @ %bb.1: @ %cond.load
-; CHECK-NEXT:    vldr.16 s4, [r2]
-; CHECK-NEXT:  .LBB18_2: @ %else
-; CHECK-NEXT:    lsls.w r1, lr, #30
-; CHECK-NEXT:    bpl .LBB18_6
-; CHECK-NEXT:  @ %bb.3: @ %cond.load1
-; CHECK-NEXT:    vldr.16 s0, [r2, #2]
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmovx.f16 s4, s5
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov.16 q0[0], r3
-; CHECK-NEXT:    vmov.16 q0[1], r1
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vmov.16 q0[2], r1
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vmov.16 q0[3], r1
-; CHECK-NEXT:    lsls.w r1, lr, #29
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r12, #0, #1
+; CHECK-NEXT:    bfi r1, r3, #1, #1
+; CHECK-NEXT:    ubfx r3, lr, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r3, #2, #1
+; CHECK-NEXT:    ubfx r3, lr, #12, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r3, #3, #1
+; CHECK-NEXT:    lsls r3, r1, #31
+; CHECK-NEXT:    bne .LBB18_6
+; CHECK-NEXT:  @ %bb.1: @ %else
+; CHECK-NEXT:    lsls r3, r1, #30
 ; CHECK-NEXT:    bmi .LBB18_7
-; CHECK-NEXT:  .LBB18_4:
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    lsls.w r1, lr, #28
+; CHECK-NEXT:  .LBB18_2: @ %else2
+; CHECK-NEXT:    lsls r3, r1, #29
 ; CHECK-NEXT:    bmi .LBB18_8
-; CHECK-NEXT:  .LBB18_5:
-; CHECK-NEXT:    vmov q2, q1
-; CHECK-NEXT:    b .LBB18_9
-; CHECK-NEXT:  .LBB18_6:
-; CHECK-NEXT:    vmov q0, q1
-; CHECK-NEXT:    lsls.w r1, lr, #29
-; CHECK-NEXT:    bpl .LBB18_4
-; CHECK-NEXT:  .LBB18_7: @ %cond.load4
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vldr.16 s8, [r2, #4]
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.16 q1[0], r1
-; CHECK-NEXT:    vmovx.f16 s0, s1
-; CHECK-NEXT:    vmov.16 q1[1], r3
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov.16 q1[2], r1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov.16 q1[3], r1
-; CHECK-NEXT:    lsls.w r1, lr, #28
+; CHECK-NEXT:  .LBB18_3: @ %else5
+; CHECK-NEXT:    lsls r1, r1, #28
 ; CHECK-NEXT:    bpl .LBB18_5
-; CHECK-NEXT:  .LBB18_8: @ %cond.load7
-; CHECK-NEXT:    vmovx.f16 s0, s4
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov.16 q2[0], r3
-; CHECK-NEXT:    vldr.16 s0, [r2, #6]
-; CHECK-NEXT:    vmov.16 q2[1], r1
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vmov.16 q2[2], r1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov.16 q2[3], r1
-; CHECK-NEXT:  .LBB18_9: @ %else8
+; CHECK-NEXT:  .LBB18_4: @ %cond.load7
+; CHECK-NEXT:    vmovx.f16 s4, s0
+; CHECK-NEXT:    vins.f16 s0, s4
+; CHECK-NEXT:    vldr.16 s4, [r2, #6]
+; CHECK-NEXT:    vins.f16 s1, s4
+; CHECK-NEXT:  .LBB18_5: @ %else8
 ; CHECK-NEXT:    vmrs r2, p0
 ; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    vcvtt.f32.f16 s3, s9
-; CHECK-NEXT:    vcvtb.f32.f16 s2, s9
-; CHECK-NEXT:    vcvtt.f32.f16 s1, s8
-; CHECK-NEXT:    vcvtb.f32.f16 s0, s8
+; CHECK-NEXT:    vcvtt.f32.f16 s7, s1
+; CHECK-NEXT:    vcvtb.f32.f16 s6, s1
+; CHECK-NEXT:    vcvtt.f32.f16 s5, s0
+; CHECK-NEXT:    vcvtb.f32.f16 s4, s0
 ; CHECK-NEXT:    and r3, r2, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    bfi r1, r3, #0, #1
@@ -916,22 +876,42 @@ define void @foo_v4f32_v4f16(<4 x float> *%dest, <4 x i16> *%mask, <4 x half> *%
 ; CHECK-NEXT:    bfi r1, r2, #3, #1
 ; CHECK-NEXT:    lsls r2, r1, #31
 ; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    vmovne r2, s0
+; CHECK-NEXT:    vmovne r2, s4
 ; CHECK-NEXT:    strne r2, [r0]
 ; CHECK-NEXT:    lsls r2, r1, #30
 ; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r2, s1
+; CHECK-NEXT:    vmovmi r2, s5
 ; CHECK-NEXT:    strmi r2, [r0, #4]
 ; CHECK-NEXT:    lsls r2, r1, #29
 ; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r2, s2
+; CHECK-NEXT:    vmovmi r2, s6
 ; CHECK-NEXT:    strmi r2, [r0, #8]
 ; CHECK-NEXT:    lsls r1, r1, #28
 ; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r1, s3
+; CHECK-NEXT:    vmovmi r1, s7
 ; CHECK-NEXT:    strmi r1, [r0, #12]
 ; CHECK-NEXT:    add sp, #8
 ; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:  .LBB18_6: @ %cond.load
+; CHECK-NEXT:    vldr.16 s0, [r2]
+; CHECK-NEXT:    lsls r3, r1, #30
+; CHECK-NEXT:    bpl .LBB18_2
+; CHECK-NEXT:  .LBB18_7: @ %cond.load1
+; CHECK-NEXT:    vldr.16 s4, [r2, #2]
+; CHECK-NEXT:    vins.f16 s0, s4
+; CHECK-NEXT:    vmovx.f16 s4, s1
+; CHECK-NEXT:    vins.f16 s1, s4
+; CHECK-NEXT:    lsls r3, r1, #29
+; CHECK-NEXT:    bpl .LBB18_3
+; CHECK-NEXT:  .LBB18_8: @ %cond.load4
+; CHECK-NEXT:    vmovx.f16 s4, s0
+; CHECK-NEXT:    vins.f16 s0, s4
+; CHECK-NEXT:    vmovx.f16 s4, s1
+; CHECK-NEXT:    vldr.16 s1, [r2, #4]
+; CHECK-NEXT:    vins.f16 s1, s4
+; CHECK-NEXT:    lsls r1, r1, #28
+; CHECK-NEXT:    bmi .LBB18_4
+; CHECK-NEXT:    b .LBB18_5
 entry:
   %0 = load <4 x i16>, <4 x i16>* %mask, align 2
   %1 = icmp sgt <4 x i16> %0, zeroinitializer
@@ -949,85 +929,45 @@ define void @foo_v4f32_v4f16_unaligned(<4 x float> *%dest, <4 x i16> *%mask, <4
 ; CHECK-NEXT:    .pad #8
 ; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    vldrh.s32 q0, [r1]
-; CHECK-NEXT:    mov.w lr, #0
-; CHECK-NEXT:    @ implicit-def: $q1
 ; CHECK-NEXT:    vcmp.s32 gt, q0, zr
-; CHECK-NEXT:    vmrs r3, p0
-; CHECK-NEXT:    and r1, r3, #1
+; CHECK-NEXT:    @ implicit-def: $q0
+; CHECK-NEXT:    vmrs lr, p0
+; CHECK-NEXT:    and r1, lr, #1
+; CHECK-NEXT:    ubfx r3, lr, #4, #1
 ; CHECK-NEXT:    rsb.w r12, r1, #0
-; CHECK-NEXT:    ubfx r1, r3, #4, #1
-; CHECK-NEXT:    bfi lr, r12, #0, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi lr, r1, #1, #1
-; CHECK-NEXT:    ubfx r1, r3, #8, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi lr, r1, #2, #1
-; CHECK-NEXT:    ubfx r1, r3, #12, #1
-; CHECK-NEXT:    rsbs r1, r1, #0
-; CHECK-NEXT:    bfi lr, r1, #3, #1
-; CHECK-NEXT:    lsls.w r1, lr, #31
-; CHECK-NEXT:    beq .LBB19_2
-; CHECK-NEXT:  @ %bb.1: @ %cond.load
-; CHECK-NEXT:    vldr.16 s4, [r2]
-; CHECK-NEXT:  .LBB19_2: @ %else
-; CHECK-NEXT:    lsls.w r1, lr, #30
-; CHECK-NEXT:    bpl .LBB19_6
-; CHECK-NEXT:  @ %bb.3: @ %cond.load1
-; CHECK-NEXT:    vldr.16 s0, [r2, #2]
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmovx.f16 s4, s5
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov.16 q0[0], r3
-; CHECK-NEXT:    vmov.16 q0[1], r1
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vmov.16 q0[2], r1
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vmov.16 q0[3], r1
-; CHECK-NEXT:    lsls.w r1, lr, #29
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r12, #0, #1
+; CHECK-NEXT:    bfi r1, r3, #1, #1
+; CHECK-NEXT:    ubfx r3, lr, #8, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r3, #2, #1
+; CHECK-NEXT:    ubfx r3, lr, #12, #1
+; CHECK-NEXT:    rsbs r3, r3, #0
+; CHECK-NEXT:    bfi r1, r3, #3, #1
+; CHECK-NEXT:    lsls r3, r1, #31
+; CHECK-NEXT:    bne .LBB19_6
+; CHECK-NEXT:  @ %bb.1: @ %else
+; CHECK-NEXT:    lsls r3, r1, #30
 ; CHECK-NEXT:    bmi .LBB19_7
-; CHECK-NEXT:  .LBB19_4:
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    lsls.w r1, lr, #28
+; CHECK-NEXT:  .LBB19_2: @ %else2
+; CHECK-NEXT:    lsls r3, r1, #29
 ; CHECK-NEXT:    bmi .LBB19_8
-; CHECK-NEXT:  .LBB19_5:
-; CHECK-NEXT:    vmov q2, q1
-; CHECK-NEXT:    b .LBB19_9
-; CHECK-NEXT:  .LBB19_6:
-; CHECK-NEXT:    vmov q0, q1
-; CHECK-NEXT:    lsls.w r1, lr, #29
-; CHECK-NEXT:    bpl .LBB19_4
-; CHECK-NEXT:  .LBB19_7: @ %cond.load4
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vldr.16 s8, [r2, #4]
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.16 q1[0], r1
-; CHECK-NEXT:    vmovx.f16 s0, s1
-; CHECK-NEXT:    vmov.16 q1[1], r3
-; CHECK-NEXT:    vmov r1, s8
-; CHECK-NEXT:    vmov.16 q1[2], r1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov.16 q1[3], r1
-; CHECK-NEXT:    lsls.w r1, lr, #28
+; CHECK-NEXT:  .LBB19_3: @ %else5
+; CHECK-NEXT:    lsls r1, r1, #28
 ; CHECK-NEXT:    bpl .LBB19_5
-; CHECK-NEXT:  .LBB19_8: @ %cond.load7
-; CHECK-NEXT:    vmovx.f16 s0, s4
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov.16 q2[0], r3
-; CHECK-NEXT:    vldr.16 s0, [r2, #6]
-; CHECK-NEXT:    vmov.16 q2[1], r1
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vmov.16 q2[2], r1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    vmov.16 q2[3], r1
-; CHECK-NEXT:  .LBB19_9: @ %else8
+; CHECK-NEXT:  .LBB19_4: @ %cond.load7
+; CHECK-NEXT:    vmovx.f16 s4, s0
+; CHECK-NEXT:    vins.f16 s0, s4
+; CHECK-NEXT:    vldr.16 s4, [r2, #6]
+; CHECK-NEXT:    vins.f16 s1, s4
+; CHECK-NEXT:  .LBB19_5: @ %else8
 ; CHECK-NEXT:    vmrs r2, p0
 ; CHECK-NEXT:    movs r1, #0
-; CHECK-NEXT:    vcvtt.f32.f16 s3, s9
-; CHECK-NEXT:    vcvtb.f32.f16 s2, s9
-; CHECK-NEXT:    vcvtt.f32.f16 s1, s8
-; CHECK-NEXT:    vcvtb.f32.f16 s0, s8
+; CHECK-NEXT:    vcvtt.f32.f16 s7, s1
+; CHECK-NEXT:    vcvtb.f32.f16 s6, s1
+; CHECK-NEXT:    vcvtt.f32.f16 s5, s0
+; CHECK-NEXT:    vcvtb.f32.f16 s4, s0
 ; CHECK-NEXT:    and r3, r2, #1
 ; CHECK-NEXT:    rsbs r3, r3, #0
 ; CHECK-NEXT:    bfi r1, r3, #0, #1
@@ -1042,22 +982,42 @@ define void @foo_v4f32_v4f16_unaligned(<4 x float> *%dest, <4 x i16> *%mask, <4
 ; CHECK-NEXT:    bfi r1, r2, #3, #1
 ; CHECK-NEXT:    lsls r2, r1, #31
 ; CHECK-NEXT:    itt ne
-; CHECK-NEXT:    vmovne r2, s0
+; CHECK-NEXT:    vmovne r2, s4
 ; CHECK-NEXT:    strne r2, [r0]
 ; CHECK-NEXT:    lsls r2, r1, #30
 ; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r2, s1
+; CHECK-NEXT:    vmovmi r2, s5
 ; CHECK-NEXT:    strmi r2, [r0, #4]
 ; CHECK-NEXT:    lsls r2, r1, #29
 ; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r2, s2
+; CHECK-NEXT:    vmovmi r2, s6
 ; CHECK-NEXT:    strmi r2, [r0, #8]
 ; CHECK-NEXT:    lsls r1, r1, #28
 ; CHECK-NEXT:    itt mi
-; CHECK-NEXT:    vmovmi r1, s3
+; CHECK-NEXT:    vmovmi r1, s7
 ; CHECK-NEXT:    strmi r1, [r0, #12]
 ; CHECK-NEXT:    add sp, #8
 ; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:  .LBB19_6: @ %cond.load
+; CHECK-NEXT:    vldr.16 s0, [r2]
+; CHECK-NEXT:    lsls r3, r1, #30
+; CHECK-NEXT:    bpl .LBB19_2
+; CHECK-NEXT:  .LBB19_7: @ %cond.load1
+; CHECK-NEXT:    vldr.16 s4, [r2, #2]
+; CHECK-NEXT:    vins.f16 s0, s4
+; CHECK-NEXT:    vmovx.f16 s4, s1
+; CHECK-NEXT:    vins.f16 s1, s4
+; CHECK-NEXT:    lsls r3, r1, #29
+; CHECK-NEXT:    bpl .LBB19_3
+; CHECK-NEXT:  .LBB19_8: @ %cond.load4
+; CHECK-NEXT:    vmovx.f16 s4, s0
+; CHECK-NEXT:    vins.f16 s0, s4
+; CHECK-NEXT:    vmovx.f16 s4, s1
+; CHECK-NEXT:    vldr.16 s1, [r2, #4]
+; CHECK-NEXT:    vins.f16 s1, s4
+; CHECK-NEXT:    lsls r1, r1, #28
+; CHECK-NEXT:    bmi .LBB19_4
+; CHECK-NEXT:    b .LBB19_5
 entry:
   %0 = load <4 x i16>, <4 x i16>* %mask, align 2
   %1 = icmp sgt <4 x i16> %0, zeroinitializer

diff  --git a/llvm/test/CodeGen/Thumb2/mve-minmax.ll b/llvm/test/CodeGen/Thumb2/mve-minmax.ll
index 75da80625c81..aab2f3ea729e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-minmax.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-minmax.ll
@@ -315,38 +315,26 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @minnm_float16_t(<8 x half> %src1, <8 x half> %src2) {
 ; CHECK-MVE-LABEL: minnm_float16_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vminnm.f16 s8, s4, s0
-; CHECK-MVE-NEXT:    vmovx.f16 s10, s4
-; CHECK-MVE-NEXT:    vmov r0, s8
 ; CHECK-MVE-NEXT:    vmovx.f16 s8, s0
-; CHECK-MVE-NEXT:    vminnm.f16 s8, s10, s8
-; CHECK-MVE-NEXT:    vminnm.f16 s12, s5, s1
-; CHECK-MVE-NEXT:    vmov r1, s8
-; CHECK-MVE-NEXT:    vmov.16 q2[0], r0
-; CHECK-MVE-NEXT:    vmov r0, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s10, s4
+; CHECK-MVE-NEXT:    vminnm.f16 s12, s10, s8
+; CHECK-MVE-NEXT:    vminnm.f16 s8, s4, s0
+; CHECK-MVE-NEXT:    vins.f16 s8, s12
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s1
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s5
-; CHECK-MVE-NEXT:    vmov.16 q2[1], r1
+; CHECK-MVE-NEXT:    vminnm.f16 s9, s5, s1
 ; CHECK-MVE-NEXT:    vminnm.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmov.16 q2[2], r0
-; CHECK-MVE-NEXT:    vmov r0, s12
-; CHECK-MVE-NEXT:    vminnm.f16 s12, s6, s2
-; CHECK-MVE-NEXT:    vmov.16 q2[3], r0
-; CHECK-MVE-NEXT:    vmov r0, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s2
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s6
+; CHECK-MVE-NEXT:    vins.f16 s9, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s2
+; CHECK-MVE-NEXT:    vminnm.f16 s12, s14, s12
+; CHECK-MVE-NEXT:    vminnm.f16 s10, s6, s2
+; CHECK-MVE-NEXT:    vins.f16 s10, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s14, s7
+; CHECK-MVE-NEXT:    vminnm.f16 s11, s7, s3
 ; CHECK-MVE-NEXT:    vminnm.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmov.16 q2[4], r0
-; CHECK-MVE-NEXT:    vmov r0, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s7
-; CHECK-MVE-NEXT:    vminnm.f16 s12, s7, s3
-; CHECK-MVE-NEXT:    vmov.16 q2[5], r0
-; CHECK-MVE-NEXT:    vmov r0, s12
-; CHECK-MVE-NEXT:    vminnm.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov.16 q2[6], r0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q2[7], r0
+; CHECK-MVE-NEXT:    vins.f16 s11, s12
 ; CHECK-MVE-NEXT:    vmov q0, q2
 ; CHECK-MVE-NEXT:    bx lr
 ;

diff  --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
index 1b03b982badf..40f974fecab1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
@@ -62,23 +62,15 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @shuffle1_i16(<8 x i16> %src) {
 ; CHECK-LABEL: shuffle1_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vmov.16 q0[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmovx.f16 s4, s3
+; CHECK-NEXT:    vins.f16 s4, s3
+; CHECK-NEXT:    vmovx.f16 s5, s2
+; CHECK-NEXT:    vins.f16 s5, s2
+; CHECK-NEXT:    vmovx.f16 s6, s1
+; CHECK-NEXT:    vins.f16 s6, s1
+; CHECK-NEXT:    vmovx.f16 s7, s0
+; CHECK-NEXT:    vins.f16 s7, s0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -97,20 +89,16 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @shuffle3_i16(<8 x i16> %src) {
 ; CHECK-LABEL: shuffle3_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.16 q0[7], r0
-; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vmovx.f16 s5, s3
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vins.f16 s5, s3
+; CHECK-NEXT:    vmov.16 q1[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.16 q1[5], r0
+; CHECK-NEXT:    vins.f16 s1, s0
+; CHECK-NEXT:    vmov.f32 s7, s1
+; CHECK-NEXT:    vmov.f32 s4, s2
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 7, i32 6, i32 3, i32 1, i32 2, i32 0>
@@ -345,27 +333,15 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @shuffle1_f16(<8 x half> %src) {
 ; CHECK-LABEL: shuffle1_f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s4, s3
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vmovx.f16 s8, s2
-; CHECK-NEXT:    vmov.16 q1[0], r1
-; CHECK-NEXT:    vmov.16 q1[1], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q1[2], r0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmovx.f16 s8, s1
-; CHECK-NEXT:    vmov.16 q1[3], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmovx.f16 s8, s0
-; CHECK-NEXT:    vmov.16 q1[4], r0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q1[6], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q1[7], r0
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vmovx.f16 s0, s7
+; CHECK-NEXT:    vins.f16 s0, s7
+; CHECK-NEXT:    vmovx.f16 s1, s6
+; CHECK-NEXT:    vins.f16 s1, s6
+; CHECK-NEXT:    vmovx.f16 s2, s5
+; CHECK-NEXT:    vins.f16 s2, s5
+; CHECK-NEXT:    vmovx.f16 s3, s4
+; CHECK-NEXT:    vins.f16 s3, s4
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -384,23 +360,15 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @shuffle3_f16(<8 x half> %src) {
 ; CHECK-LABEL: shuffle3_f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s4, s3
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vmovx.f16 s8, s1
-; CHECK-NEXT:    vmov.16 q1[2], r1
-; CHECK-NEXT:    vmov.16 q1[3], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmovx.f16 s8, s0
-; CHECK-NEXT:    vmov.16 q1[4], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov.16 q1[6], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q1[7], r0
-; CHECK-NEXT:    vmov.f32 s4, s2
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vmovx.f16 s1, s7
+; CHECK-NEXT:    vmovx.f16 s8, s4
+; CHECK-NEXT:    vins.f16 s1, s7
+; CHECK-NEXT:    vmovx.f16 s2, s5
+; CHECK-NEXT:    vins.f16 s5, s4
+; CHECK-NEXT:    vins.f16 s2, s8
+; CHECK-NEXT:    vmov.f32 s3, s5
+; CHECK-NEXT:    vmov.f32 s0, s6
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> <i32 4, i32 5, i32 7, i32 6, i32 3, i32 1, i32 2, i32 0>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll b/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll
index 4bde0e4d14de..ea079811c4bf 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll
@@ -35,23 +35,15 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_76543210(<8 x i16> %s1, <8 x i16> %s2) {
 ; CHECK-LABEL: shuffle_i16_76543210:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vmov.16 q0[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmovx.f16 s4, s3
+; CHECK-NEXT:    vins.f16 s4, s3
+; CHECK-NEXT:    vmovx.f16 s5, s2
+; CHECK-NEXT:    vins.f16 s5, s2
+; CHECK-NEXT:    vmovx.f16 s6, s1
+; CHECK-NEXT:    vins.f16 s6, s1
+; CHECK-NEXT:    vmovx.f16 s7, s0
+; CHECK-NEXT:    vins.f16 s7, s0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <8 x i16> %s1, <8 x i16> %s2, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -346,27 +338,15 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @shuffle_f16_76543210(<8 x half> %s1, <8 x half> %s2) {
 ; CHECK-LABEL: shuffle_f16_76543210:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s4, s3
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov r1, s4
-; CHECK-NEXT:    vmovx.f16 s8, s2
-; CHECK-NEXT:    vmov.16 q1[0], r1
-; CHECK-NEXT:    vmov.16 q1[1], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q1[2], r0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmovx.f16 s8, s1
-; CHECK-NEXT:    vmov.16 q1[3], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmovx.f16 s8, s0
-; CHECK-NEXT:    vmov.16 q1[4], r0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q1[6], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q1[7], r0
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vmovx.f16 s0, s7
+; CHECK-NEXT:    vins.f16 s0, s7
+; CHECK-NEXT:    vmovx.f16 s1, s6
+; CHECK-NEXT:    vins.f16 s1, s6
+; CHECK-NEXT:    vmovx.f16 s2, s5
+; CHECK-NEXT:    vins.f16 s2, s5
+; CHECK-NEXT:    vmovx.f16 s3, s4
+; CHECK-NEXT:    vins.f16 s3, s4
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <8 x half> %s1, <8 x half> %s2, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll b/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
index d6e25d2a6864..7e985eb3b205 100644
--- a/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
@@ -79,38 +79,26 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @add_float16_t(<8 x half> %src1, <8 x half> %src2) {
 ; CHECK-MVE-LABEL: add_float16_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vadd.f16 s8, s4, s0
-; CHECK-MVE-NEXT:    vmovx.f16 s10, s4
-; CHECK-MVE-NEXT:    vmov r0, s8
 ; CHECK-MVE-NEXT:    vmovx.f16 s8, s0
-; CHECK-MVE-NEXT:    vadd.f16 s8, s10, s8
-; CHECK-MVE-NEXT:    vadd.f16 s12, s5, s1
-; CHECK-MVE-NEXT:    vmov r1, s8
-; CHECK-MVE-NEXT:    vmov.16 q2[0], r0
-; CHECK-MVE-NEXT:    vmov r0, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s10, s4
+; CHECK-MVE-NEXT:    vadd.f16 s12, s10, s8
+; CHECK-MVE-NEXT:    vadd.f16 s8, s4, s0
+; CHECK-MVE-NEXT:    vins.f16 s8, s12
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s1
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s5
-; CHECK-MVE-NEXT:    vmov.16 q2[1], r1
+; CHECK-MVE-NEXT:    vadd.f16 s9, s5, s1
 ; CHECK-MVE-NEXT:    vadd.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmov.16 q2[2], r0
-; CHECK-MVE-NEXT:    vmov r0, s12
-; CHECK-MVE-NEXT:    vadd.f16 s12, s6, s2
-; CHECK-MVE-NEXT:    vmov.16 q2[3], r0
-; CHECK-MVE-NEXT:    vmov r0, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s2
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s6
+; CHECK-MVE-NEXT:    vins.f16 s9, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s2
 ; CHECK-MVE-NEXT:    vadd.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmov.16 q2[4], r0
-; CHECK-MVE-NEXT:    vmov r0, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s7
-; CHECK-MVE-NEXT:    vadd.f16 s12, s7, s3
-; CHECK-MVE-NEXT:    vmov.16 q2[5], r0
-; CHECK-MVE-NEXT:    vmov r0, s12
-; CHECK-MVE-NEXT:    vadd.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov.16 q2[6], r0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q2[7], r0
+; CHECK-MVE-NEXT:    vadd.f16 s10, s6, s2
+; CHECK-MVE-NEXT:    vins.f16 s10, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s14, s7
+; CHECK-MVE-NEXT:    vadd.f16 s11, s7, s3
+; CHECK-MVE-NEXT:    vadd.f16 s12, s14, s12
+; CHECK-MVE-NEXT:    vins.f16 s11, s12
 ; CHECK-MVE-NEXT:    vmov q0, q2
 ; CHECK-MVE-NEXT:    bx lr
 ;
@@ -228,38 +216,26 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @sub_float16_t(<8 x half> %src1, <8 x half> %src2) {
 ; CHECK-MVE-LABEL: sub_float16_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vsub.f16 s8, s4, s0
-; CHECK-MVE-NEXT:    vmovx.f16 s10, s4
-; CHECK-MVE-NEXT:    vmov r0, s8
 ; CHECK-MVE-NEXT:    vmovx.f16 s8, s0
-; CHECK-MVE-NEXT:    vsub.f16 s8, s10, s8
-; CHECK-MVE-NEXT:    vsub.f16 s12, s5, s1
-; CHECK-MVE-NEXT:    vmov r1, s8
-; CHECK-MVE-NEXT:    vmov.16 q2[0], r0
-; CHECK-MVE-NEXT:    vmov r0, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s10, s4
+; CHECK-MVE-NEXT:    vsub.f16 s12, s10, s8
+; CHECK-MVE-NEXT:    vsub.f16 s8, s4, s0
+; CHECK-MVE-NEXT:    vins.f16 s8, s12
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s1
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s5
-; CHECK-MVE-NEXT:    vmov.16 q2[1], r1
+; CHECK-MVE-NEXT:    vsub.f16 s9, s5, s1
 ; CHECK-MVE-NEXT:    vsub.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmov.16 q2[2], r0
-; CHECK-MVE-NEXT:    vmov r0, s12
-; CHECK-MVE-NEXT:    vsub.f16 s12, s6, s2
-; CHECK-MVE-NEXT:    vmov.16 q2[3], r0
-; CHECK-MVE-NEXT:    vmov r0, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s2
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s6
+; CHECK-MVE-NEXT:    vins.f16 s9, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s2
+; CHECK-MVE-NEXT:    vsub.f16 s12, s14, s12
+; CHECK-MVE-NEXT:    vsub.f16 s10, s6, s2
+; CHECK-MVE-NEXT:    vins.f16 s10, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s14, s7
+; CHECK-MVE-NEXT:    vsub.f16 s11, s7, s3
 ; CHECK-MVE-NEXT:    vsub.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmov.16 q2[4], r0
-; CHECK-MVE-NEXT:    vmov r0, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s7
-; CHECK-MVE-NEXT:    vsub.f16 s12, s7, s3
-; CHECK-MVE-NEXT:    vmov.16 q2[5], r0
-; CHECK-MVE-NEXT:    vmov r0, s12
-; CHECK-MVE-NEXT:    vsub.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov.16 q2[6], r0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q2[7], r0
+; CHECK-MVE-NEXT:    vins.f16 s11, s12
 ; CHECK-MVE-NEXT:    vmov q0, q2
 ; CHECK-MVE-NEXT:    bx lr
 ;
@@ -360,38 +336,26 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @mul_float16_t(<8 x half> %src1, <8 x half> %src2) {
 ; CHECK-MVE-LABEL: mul_float16_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmul.f16 s8, s4, s0
-; CHECK-MVE-NEXT:    vmovx.f16 s10, s4
-; CHECK-MVE-NEXT:    vmov r0, s8
 ; CHECK-MVE-NEXT:    vmovx.f16 s8, s0
-; CHECK-MVE-NEXT:    vmul.f16 s8, s10, s8
-; CHECK-MVE-NEXT:    vmul.f16 s12, s5, s1
-; CHECK-MVE-NEXT:    vmov r1, s8
-; CHECK-MVE-NEXT:    vmov.16 q2[0], r0
-; CHECK-MVE-NEXT:    vmov r0, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s10, s4
+; CHECK-MVE-NEXT:    vmul.f16 s12, s10, s8
+; CHECK-MVE-NEXT:    vmul.f16 s8, s4, s0
+; CHECK-MVE-NEXT:    vins.f16 s8, s12
 ; CHECK-MVE-NEXT:    vmovx.f16 s12, s1
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s5
-; CHECK-MVE-NEXT:    vmov.16 q2[1], r1
+; CHECK-MVE-NEXT:    vmul.f16 s9, s5, s1
 ; CHECK-MVE-NEXT:    vmul.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmov.16 q2[2], r0
-; CHECK-MVE-NEXT:    vmov r0, s12
-; CHECK-MVE-NEXT:    vmul.f16 s12, s6, s2
-; CHECK-MVE-NEXT:    vmov.16 q2[3], r0
-; CHECK-MVE-NEXT:    vmov r0, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s2
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s6
+; CHECK-MVE-NEXT:    vins.f16 s9, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s2
+; CHECK-MVE-NEXT:    vmul.f16 s12, s14, s12
+; CHECK-MVE-NEXT:    vmul.f16 s10, s6, s2
+; CHECK-MVE-NEXT:    vins.f16 s10, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s3
+; CHECK-MVE-NEXT:    vmovx.f16 s14, s7
+; CHECK-MVE-NEXT:    vmul.f16 s11, s7, s3
 ; CHECK-MVE-NEXT:    vmul.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmov.16 q2[4], r0
-; CHECK-MVE-NEXT:    vmov r0, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s7
-; CHECK-MVE-NEXT:    vmul.f16 s12, s7, s3
-; CHECK-MVE-NEXT:    vmov.16 q2[5], r0
-; CHECK-MVE-NEXT:    vmov r0, s12
-; CHECK-MVE-NEXT:    vmul.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov.16 q2[6], r0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q2[7], r0
+; CHECK-MVE-NEXT:    vins.f16 s11, s12
 ; CHECK-MVE-NEXT:    vmov q0, q2
 ; CHECK-MVE-NEXT:    bx lr
 ;

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vcmpf.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpf.ll
index ad47c3a5546a..fb81aca1f7fa 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcmpf.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcmpf.ll
@@ -769,102 +769,90 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
+; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    it eq
-; CHECK-MVE-NEXT:    moveq r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
+; CHECK-MVE-NEXT:    vseleq.f16 s20, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s20
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
 ; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    vmovx.f16 s22, s13
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s20
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
 ; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
 ; CHECK-MVE-NEXT:    vmovx.f16 s22, s14
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s3
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s20
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it eq
+; CHECK-MVE-NEXT:    moveq r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s15
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s20
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-MVE-NEXT:    bx lr
@@ -896,115 +884,103 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it mi
-; CHECK-MVE-NEXT:    movmi r2, #1
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    it gt
-; CHECK-MVE-NEXT:    movgt r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s20, s18, s16
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    mov.w r0, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s20
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
 ; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
-; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s22, s13
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s20
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
 ; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
-; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s22, s14
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s20
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s3
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s20
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it mi
+; CHECK-MVE-NEXT:    movmi r1, #1
+; CHECK-MVE-NEXT:    it gt
+; CHECK-MVE-NEXT:    movgt r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s15
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s15, s11
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s20
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-MVE-NEXT:    bx lr
@@ -1034,102 +1010,90 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
+; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    it gt
-; CHECK-MVE-NEXT:    movgt r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
+; CHECK-MVE-NEXT:    vseleq.f16 s20, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s20
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
 ; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    vmovx.f16 s22, s13
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s20
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
 ; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
 ; CHECK-MVE-NEXT:    vmovx.f16 s22, s14
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s3
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s20
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it gt
+; CHECK-MVE-NEXT:    movgt r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s15
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s20
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-MVE-NEXT:    bx lr
@@ -1158,102 +1122,90 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
+; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    it ge
-; CHECK-MVE-NEXT:    movge r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
+; CHECK-MVE-NEXT:    vseleq.f16 s20, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s20
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
 ; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    vmovx.f16 s22, s13
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s20
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
 ; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
 ; CHECK-MVE-NEXT:    vmovx.f16 s22, s14
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s3
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s20
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it ge
+; CHECK-MVE-NEXT:    movge r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s15
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s20
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-MVE-NEXT:    bx lr
@@ -1282,102 +1234,90 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
+; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    it mi
-; CHECK-MVE-NEXT:    movmi r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
+; CHECK-MVE-NEXT:    vseleq.f16 s20, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s20
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
 ; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    vmovx.f16 s22, s13
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s20
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
 ; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
 ; CHECK-MVE-NEXT:    vmovx.f16 s22, s14
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s3
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s20
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it mi
+; CHECK-MVE-NEXT:    movmi r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s15
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s20
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-MVE-NEXT:    bx lr
@@ -1406,102 +1346,90 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
+; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    it ls
-; CHECK-MVE-NEXT:    movls r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
+; CHECK-MVE-NEXT:    vseleq.f16 s20, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s20
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
 ; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    vmovx.f16 s22, s13
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s20
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
 ; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
 ; CHECK-MVE-NEXT:    vmovx.f16 s22, s14
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s3
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s20
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it ls
+; CHECK-MVE-NEXT:    movls r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s15
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s20
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-MVE-NEXT:    bx lr
@@ -1533,115 +1461,103 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it eq
-; CHECK-MVE-NEXT:    moveq r2, #1
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    it vs
-; CHECK-MVE-NEXT:    movvs r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s20, s18, s16
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    mov.w r0, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s20
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
 ; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
-; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s22, s13
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s20
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s20
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
 ; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
-; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s22, s14
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s20
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s3
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s20
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it eq
+; CHECK-MVE-NEXT:    moveq r1, #1
+; CHECK-MVE-NEXT:    it vs
+; CHECK-MVE-NEXT:    movvs r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s15
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s15, s11
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r0, #1
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s20
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-MVE-NEXT:    bx lr
@@ -1671,102 +1587,90 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
+; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    it ne
-; CHECK-MVE-NEXT:    movne r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
+; CHECK-MVE-NEXT:    vseleq.f16 s20, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s20
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
 ; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    vmovx.f16 s22, s13
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s20
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
 ; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
 ; CHECK-MVE-NEXT:    vmovx.f16 s22, s14
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s3
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s20
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it ne
+; CHECK-MVE-NEXT:    movne r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s15
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s20
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-MVE-NEXT:    bx lr
@@ -1795,102 +1699,90 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
+; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    it hi
-; CHECK-MVE-NEXT:    movhi r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
+; CHECK-MVE-NEXT:    vseleq.f16 s20, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s20
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
 ; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    vmovx.f16 s22, s13
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s20
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
 ; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
 ; CHECK-MVE-NEXT:    vmovx.f16 s22, s14
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s3
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s20
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it hi
+; CHECK-MVE-NEXT:    movhi r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s15
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s20
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-MVE-NEXT:    bx lr
@@ -1919,102 +1811,90 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
+; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    it pl
-; CHECK-MVE-NEXT:    movpl r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
+; CHECK-MVE-NEXT:    vseleq.f16 s20, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s20
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
 ; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    vmovx.f16 s22, s13
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s20
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
 ; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
 ; CHECK-MVE-NEXT:    vmovx.f16 s22, s14
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s3
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s20
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it pl
+; CHECK-MVE-NEXT:    movpl r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s15
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s20
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-MVE-NEXT:    bx lr
@@ -2043,102 +1923,90 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
+; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    it lt
-; CHECK-MVE-NEXT:    movlt r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
+; CHECK-MVE-NEXT:    vseleq.f16 s20, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s20
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
 ; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    vmovx.f16 s22, s13
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s20
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
 ; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
 ; CHECK-MVE-NEXT:    vmovx.f16 s22, s14
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s3
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s20
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it lt
+; CHECK-MVE-NEXT:    movlt r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s15
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s20
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-MVE-NEXT:    bx lr
@@ -2167,102 +2035,90 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
+; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    it le
-; CHECK-MVE-NEXT:    movle r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
+; CHECK-MVE-NEXT:    vseleq.f16 s20, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s20
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
 ; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    vmovx.f16 s22, s13
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s20
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
 ; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
 ; CHECK-MVE-NEXT:    vmovx.f16 s22, s14
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s3
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s20
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it le
+; CHECK-MVE-NEXT:    movle r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s15
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s20
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-MVE-NEXT:    bx lr
@@ -2291,102 +2147,90 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
+; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    it vc
-; CHECK-MVE-NEXT:    movvc r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
+; CHECK-MVE-NEXT:    vseleq.f16 s20, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s20
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
 ; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    vmovx.f16 s22, s13
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s20
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
 ; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
 ; CHECK-MVE-NEXT:    vmovx.f16 s22, s14
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s3
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s20
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it vc
+; CHECK-MVE-NEXT:    movvc r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s15
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s20
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-MVE-NEXT:    bx lr
@@ -2416,102 +2260,90 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, <8 x half> %s
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s8
+; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s12
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    it vs
-; CHECK-MVE-NEXT:    movvs r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
+; CHECK-MVE-NEXT:    vseleq.f16 s20, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s4, s7
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s1
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s20
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s5
 ; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s9
 ; CHECK-MVE-NEXT:    vmovx.f16 s22, s13
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s22, s2
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s20
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s20
 ; CHECK-MVE-NEXT:    vmovx.f16 s20, s6
 ; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s20, s10
 ; CHECK-MVE-NEXT:    vmovx.f16 s22, s14
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
 ; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s3
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s20, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s20
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s7
+; CHECK-MVE-NEXT:    vcmp.f16 s22, s20
+; CHECK-MVE-NEXT:    vmovx.f16 s20, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it vs
+; CHECK-MVE-NEXT:    movvs r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s22, s15
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s7
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s20, s22, s20
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s20
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s20
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-MVE-NEXT:    bx lr

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll
index 4da2618ae944..5b4d3c5220d4 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll
@@ -811,98 +811,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it eq
-; CHECK-MVE-NEXT:    moveq r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    it eq
+; CHECK-MVE-NEXT:    moveq r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -937,111 +925,99 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
-; CHECK-MVE-NEXT:    movmi r2, #1
-; CHECK-MVE-NEXT:    it gt
-; CHECK-MVE-NEXT:    movgt r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    mov.w r0, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
-; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
-; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it mi
+; CHECK-MVE-NEXT:    movmi r1, #1
+; CHECK-MVE-NEXT:    it gt
+; CHECK-MVE-NEXT:    movgt r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -1074,98 +1050,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it gt
-; CHECK-MVE-NEXT:    movgt r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    it gt
+; CHECK-MVE-NEXT:    movgt r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -1197,98 +1161,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it ge
-; CHECK-MVE-NEXT:    movge r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    it ge
+; CHECK-MVE-NEXT:    movge r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -1320,98 +1272,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it mi
-; CHECK-MVE-NEXT:    movmi r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    it mi
+; CHECK-MVE-NEXT:    movmi r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -1443,98 +1383,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it ls
-; CHECK-MVE-NEXT:    movls r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    it ls
+; CHECK-MVE-NEXT:    movls r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -1569,111 +1497,99 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
-; CHECK-MVE-NEXT:    moveq r2, #1
-; CHECK-MVE-NEXT:    it vs
-; CHECK-MVE-NEXT:    movvs r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    it eq
-; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    mov.w r0, #0
+; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    mov.w r0, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
-; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
-; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it eq
+; CHECK-MVE-NEXT:    moveq r1, #1
+; CHECK-MVE-NEXT:    it vs
+; CHECK-MVE-NEXT:    movvs r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r0, #1
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -1706,98 +1622,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it ne
-; CHECK-MVE-NEXT:    movne r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    it ne
+; CHECK-MVE-NEXT:    movne r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -1829,98 +1733,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it hi
-; CHECK-MVE-NEXT:    movhi r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it hi
+; CHECK-MVE-NEXT:    movhi r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -1952,98 +1844,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it pl
-; CHECK-MVE-NEXT:    movpl r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    it pl
+; CHECK-MVE-NEXT:    movpl r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -2075,98 +1955,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it lt
-; CHECK-MVE-NEXT:    movlt r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it lt
+; CHECK-MVE-NEXT:    movlt r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -2198,98 +2066,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it le
-; CHECK-MVE-NEXT:    movle r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    it le
+; CHECK-MVE-NEXT:    movle r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -2321,98 +2177,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it vc
-; CHECK-MVE-NEXT:    movvc r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    it vc
+; CHECK-MVE-NEXT:    movvc r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -2445,98 +2289,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, half %src2, <
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it vs
-; CHECK-MVE-NEXT:    movvs r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    it vs
+; CHECK-MVE-NEXT:    movvs r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -3368,98 +3200,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oeq_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it eq
-; CHECK-MVE-NEXT:    moveq r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it eq
+; CHECK-MVE-NEXT:    moveq r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -3494,111 +3314,99 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
-; CHECK-MVE-NEXT:    movmi r2, #1
-; CHECK-MVE-NEXT:    it gt
-; CHECK-MVE-NEXT:    movgt r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    mov.w r0, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
-; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
-; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it mi
+; CHECK-MVE-NEXT:    movmi r1, #1
+; CHECK-MVE-NEXT:    it gt
+; CHECK-MVE-NEXT:    movgt r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -3631,98 +3439,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ogt_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it gt
-; CHECK-MVE-NEXT:    movgt r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    it gt
+; CHECK-MVE-NEXT:    movgt r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -3754,98 +3550,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oge_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it ge
-; CHECK-MVE-NEXT:    movge r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    it ge
+; CHECK-MVE-NEXT:    movge r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -3877,98 +3661,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_olt_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it mi
-; CHECK-MVE-NEXT:    movmi r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it mi
+; CHECK-MVE-NEXT:    movmi r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -4000,98 +3772,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ole_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it ls
-; CHECK-MVE-NEXT:    movls r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it ls
+; CHECK-MVE-NEXT:    movls r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -4126,111 +3886,99 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
-; CHECK-MVE-NEXT:    moveq r2, #1
-; CHECK-MVE-NEXT:    it vs
-; CHECK-MVE-NEXT:    movvs r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
-; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    mov.w r0, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
-; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
-; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it eq
+; CHECK-MVE-NEXT:    moveq r1, #1
+; CHECK-MVE-NEXT:    it vs
+; CHECK-MVE-NEXT:    movvs r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r0, #1
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -4263,98 +4011,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_une_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it ne
-; CHECK-MVE-NEXT:    movne r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it ne
+; CHECK-MVE-NEXT:    movne r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -4386,98 +4122,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ugt_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it hi
-; CHECK-MVE-NEXT:    movhi r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
+; CHECK-MVE-NEXT:    it hi
+; CHECK-MVE-NEXT:    movhi r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -4509,98 +4233,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uge_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it pl
-; CHECK-MVE-NEXT:    movpl r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    it pl
+; CHECK-MVE-NEXT:    movpl r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -4632,98 +4344,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ult_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it lt
-; CHECK-MVE-NEXT:    movlt r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    it lt
+; CHECK-MVE-NEXT:    movlt r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -4755,98 +4455,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ule_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it le
-; CHECK-MVE-NEXT:    movle r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    it le
+; CHECK-MVE-NEXT:    movle r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -4878,98 +4566,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ord_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it vc
-; CHECK-MVE-NEXT:    movvc r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    it vc
+; CHECK-MVE-NEXT:    movvc r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -5002,98 +4678,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uno_v8f16(<8 x half> %src, half %src2,
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it vs
-; CHECK-MVE-NEXT:    movvs r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s1
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s2
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s4, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    it vs
+; CHECK-MVE-NEXT:    movvs r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s4, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -5128,98 +4792,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16_bc(<8 x half> %src, half %src2
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s5, s12
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it eq
-; CHECK-MVE-NEXT:    moveq r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s12, s8
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
-; CHECK-MVE-NEXT:    vmov.16 q4[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q4[1], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s13
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s13, s9
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s12, s8
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
-; CHECK-MVE-NEXT:    vmov.16 q4[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s14
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s14, s10
-; CHECK-MVE-NEXT:    vmov r1, s6
+; CHECK-MVE-NEXT:    vseleq.f16 s17, s13, s9
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s17, s6
 ; CHECK-MVE-NEXT:    vmovx.f16 s6, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s6, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s6
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s4
-; CHECK-MVE-NEXT:    vmov.16 q4[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s5, s15
+; CHECK-MVE-NEXT:    it eq
+; CHECK-MVE-NEXT:    moveq r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s18, s14, s10
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s18, s6
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s6, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s6, s11
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s4
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s11
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s15
-; CHECK-MVE-NEXT:    vseleq.f16 s6, s15, s11
+; CHECK-MVE-NEXT:    vseleq.f16 s6, s5, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s6
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q4[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q4[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s19, s15, s11
+; CHECK-MVE-NEXT:    vins.f16 s19, s6
 ; CHECK-MVE-NEXT:    vmov q0, q4
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll
index f6b8eb7df2ff..5021aa2f5fc9 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll
@@ -769,98 +769,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it eq
-; CHECK-MVE-NEXT:    moveq r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s9, s5
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s10, s6
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s11, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it eq
+; CHECK-MVE-NEXT:    moveq r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -892,111 +880,99 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it mi
-; CHECK-MVE-NEXT:    movmi r2, #1
-; CHECK-MVE-NEXT:    it gt
-; CHECK-MVE-NEXT:    movgt r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s9, s5
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    mov.w r0, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
-; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s10, s6
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
-; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s11, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it mi
+; CHECK-MVE-NEXT:    movmi r1, #1
+; CHECK-MVE-NEXT:    it gt
+; CHECK-MVE-NEXT:    movgt r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -1026,98 +1002,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it gt
-; CHECK-MVE-NEXT:    movgt r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s9, s5
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s10, s6
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
+; CHECK-MVE-NEXT:    it gt
+; CHECK-MVE-NEXT:    movgt r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s11, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -1146,98 +1110,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it ge
-; CHECK-MVE-NEXT:    movge r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s9, s5
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s10, s6
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
+; CHECK-MVE-NEXT:    it ge
+; CHECK-MVE-NEXT:    movge r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s11, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -1266,98 +1218,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it mi
-; CHECK-MVE-NEXT:    movmi r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s9, s5
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s10, s6
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s11, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it mi
+; CHECK-MVE-NEXT:    movmi r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -1386,98 +1326,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it ls
-; CHECK-MVE-NEXT:    movls r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s9, s5
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s10, s6
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s11, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
-; CHECK-MVE-NEXT:    movls r0, #1
-; CHECK-MVE-NEXT:    cmp r0, #0
-; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    movls r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it ls
+; CHECK-MVE-NEXT:    movls r0, #1
+; CHECK-MVE-NEXT:    cmp r0, #0
+; CHECK-MVE-NEXT:    cset r0, ne
+; CHECK-MVE-NEXT:    lsls r0, r0, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -1509,111 +1437,99 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it eq
-; CHECK-MVE-NEXT:    moveq r2, #1
-; CHECK-MVE-NEXT:    it vs
-; CHECK-MVE-NEXT:    movvs r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s9, s5
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    mov.w r0, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
-; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s10, s6
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
-; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it eq
+; CHECK-MVE-NEXT:    moveq r1, #1
+; CHECK-MVE-NEXT:    it vs
+; CHECK-MVE-NEXT:    movvs r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s11, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r0, #1
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -1643,98 +1559,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it ne
-; CHECK-MVE-NEXT:    movne r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s9, s5
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s10, s6
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
+; CHECK-MVE-NEXT:    it ne
+; CHECK-MVE-NEXT:    movne r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s11, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -1763,98 +1667,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it hi
-; CHECK-MVE-NEXT:    movhi r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s9, s5
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s10, s6
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
+; CHECK-MVE-NEXT:    it hi
+; CHECK-MVE-NEXT:    movhi r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s11, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -1883,98 +1775,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it pl
-; CHECK-MVE-NEXT:    movpl r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s9, s5
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s10, s6
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
+; CHECK-MVE-NEXT:    it pl
+; CHECK-MVE-NEXT:    movpl r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s11, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -2003,98 +1883,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it lt
-; CHECK-MVE-NEXT:    movlt r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s9, s5
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s10, s6
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
+; CHECK-MVE-NEXT:    it lt
+; CHECK-MVE-NEXT:    movlt r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s11, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -2123,98 +1991,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it le
-; CHECK-MVE-NEXT:    movle r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s9, s5
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s10, s6
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s11, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it le
+; CHECK-MVE-NEXT:    movle r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -2243,98 +2099,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it vc
-; CHECK-MVE-NEXT:    movvc r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s9, s5
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, s16
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s1
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s2
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s10, s6
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, s16
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s2
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s3
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s11, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s16, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it vc
+; CHECK-MVE-NEXT:    movvc r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s3
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -2364,98 +2208,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, <8 x half> %a
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it vs
-; CHECK-MVE-NEXT:    movvs r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s9, s5
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, s16
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s1
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s2
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s10, s6
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, s16
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s2
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s3
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
+; CHECK-MVE-NEXT:    it vs
+; CHECK-MVE-NEXT:    movvs r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s16, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s11, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -3242,98 +3074,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oeq_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it eq
-; CHECK-MVE-NEXT:    moveq r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s9, s5
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s10, s6
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
+; CHECK-MVE-NEXT:    it eq
+; CHECK-MVE-NEXT:    moveq r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s11, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -3365,111 +3185,99 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it mi
-; CHECK-MVE-NEXT:    movmi r2, #1
-; CHECK-MVE-NEXT:    it gt
-; CHECK-MVE-NEXT:    movgt r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s9, s5
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    mov.w r0, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
-; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s10, s6
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
-; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it mi
+; CHECK-MVE-NEXT:    movmi r1, #1
+; CHECK-MVE-NEXT:    it gt
+; CHECK-MVE-NEXT:    movgt r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s11, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -3499,98 +3307,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ogt_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it mi
-; CHECK-MVE-NEXT:    movmi r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s9, s5
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s10, s6
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
+; CHECK-MVE-NEXT:    it mi
+; CHECK-MVE-NEXT:    movmi r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s11, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it mi
 ; CHECK-MVE-NEXT:    movmi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -3619,98 +3415,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oge_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it ls
-; CHECK-MVE-NEXT:    movls r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s9, s5
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s10, s6
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
+; CHECK-MVE-NEXT:    it ls
+; CHECK-MVE-NEXT:    movls r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s11, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ls
 ; CHECK-MVE-NEXT:    movls r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -3739,98 +3523,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_olt_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it gt
-; CHECK-MVE-NEXT:    movgt r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s9, s5
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s10, s6
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
+; CHECK-MVE-NEXT:    it gt
+; CHECK-MVE-NEXT:    movgt r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s11, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it gt
 ; CHECK-MVE-NEXT:    movgt r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -3859,98 +3631,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ole_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it ge
-; CHECK-MVE-NEXT:    movge r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s9, s5
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s10, s6
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
+; CHECK-MVE-NEXT:    it ge
+; CHECK-MVE-NEXT:    movge r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s11, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ge
 ; CHECK-MVE-NEXT:    movge r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -3982,111 +3742,99 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it eq
-; CHECK-MVE-NEXT:    moveq r2, #1
-; CHECK-MVE-NEXT:    it vs
-; CHECK-MVE-NEXT:    movvs r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s9, s5
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    mov.w r0, #0
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
-; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s10, s6
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
-; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
+; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r1, #1
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it eq
+; CHECK-MVE-NEXT:    moveq r1, #1
+; CHECK-MVE-NEXT:    it vs
+; CHECK-MVE-NEXT:    movvs r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s11, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it eq
 ; CHECK-MVE-NEXT:    moveq r0, #1
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -4116,98 +3864,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_une_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it ne
-; CHECK-MVE-NEXT:    movne r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s9, s5
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s10, s6
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
-; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
+; CHECK-MVE-NEXT:    it ne
+; CHECK-MVE-NEXT:    movne r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s11, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it ne
 ; CHECK-MVE-NEXT:    movne r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -4236,98 +3972,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ugt_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it lt
-; CHECK-MVE-NEXT:    movlt r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s9, s5
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s10, s6
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
+; CHECK-MVE-NEXT:    it lt
+; CHECK-MVE-NEXT:    movlt r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s11, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it lt
 ; CHECK-MVE-NEXT:    movlt r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -4356,98 +4080,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uge_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it le
-; CHECK-MVE-NEXT:    movle r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s9, s5
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s10, s6
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
+; CHECK-MVE-NEXT:    it le
+; CHECK-MVE-NEXT:    movle r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s11, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it le
 ; CHECK-MVE-NEXT:    movle r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -4476,98 +4188,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ult_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it hi
-; CHECK-MVE-NEXT:    movhi r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s9, s5
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s10, s6
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
+; CHECK-MVE-NEXT:    it hi
+; CHECK-MVE-NEXT:    movhi r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s11, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it hi
 ; CHECK-MVE-NEXT:    movhi r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -4596,98 +4296,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ule_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it pl
-; CHECK-MVE-NEXT:    movpl r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s9, s5
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s10, s6
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s0, #0
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
+; CHECK-MVE-NEXT:    it pl
+; CHECK-MVE-NEXT:    movpl r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s16, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s11, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it pl
 ; CHECK-MVE-NEXT:    movpl r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -4716,98 +4404,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ord_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it vc
-; CHECK-MVE-NEXT:    movvc r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s9, s5
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, s16
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s1
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s2
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s10, s6
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, s16
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s2
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s3
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
+; CHECK-MVE-NEXT:    it vc
+; CHECK-MVE-NEXT:    movvc r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s16, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s11, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vc
 ; CHECK-MVE-NEXT:    movvc r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr
@@ -4837,98 +4513,86 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uno_v8f16(<8 x half> %src, <8 x half>
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s0
-; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s8
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vcmp.f16 s0, s0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    mov.w r2, #0
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    it vs
-; CHECK-MVE-NEXT:    movvs r2, #1
-; CHECK-MVE-NEXT:    cmp r2, #0
-; CHECK-MVE-NEXT:    cset r2, ne
-; CHECK-MVE-NEXT:    vmov r1, s12
-; CHECK-MVE-NEXT:    lsls r2, r2, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s1, s1
-; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
-; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r2, s12
 ; CHECK-MVE-NEXT:    vmovx.f16 s18, s9
-; CHECK-MVE-NEXT:    vmov.16 q3[0], r2
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s3
-; CHECK-MVE-NEXT:    vmov.16 q3[1], r1
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s14, s12
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s9, s5
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s12, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s1
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, s16
-; CHECK-MVE-NEXT:    vmov.16 q3[2], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s1, s1
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s5
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s2, s2
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
-; CHECK-MVE-NEXT:    vmov.16 q3[3], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s10
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
 ; CHECK-MVE-NEXT:    cset r1, ne
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s10, s6
-; CHECK-MVE-NEXT:    vmov r1, s16
+; CHECK-MVE-NEXT:    vseleq.f16 s13, s9, s5
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s13, s16
 ; CHECK-MVE-NEXT:    vmovx.f16 s16, s2
 ; CHECK-MVE-NEXT:    vcmp.f16 s16, s16
-; CHECK-MVE-NEXT:    vmov.16 q3[4], r1
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    mov.w r1, #0
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s2, s2
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s16, s6
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vcmp.f16 s3, s3
 ; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
-; CHECK-MVE-NEXT:    vmov r1, s16
-; CHECK-MVE-NEXT:    vcmp.f16 s0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[5], r1
 ; CHECK-MVE-NEXT:    mov.w r1, #0
+; CHECK-MVE-NEXT:    vmovx.f16 s18, s11
+; CHECK-MVE-NEXT:    it vs
+; CHECK-MVE-NEXT:    movvs r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    lsls r1, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f16 s14, s10, s6
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vins.f16 s14, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s3
+; CHECK-MVE-NEXT:    vcmp.f16 s16, s16
+; CHECK-MVE-NEXT:    vmovx.f16 s16, s7
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r1, #1
 ; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f16 s3, s3
 ; CHECK-MVE-NEXT:    cset r1, ne
-; CHECK-MVE-NEXT:    vmovx.f16 s0, s7
 ; CHECK-MVE-NEXT:    lsls r1, r1, #31
-; CHECK-MVE-NEXT:    vmovx.f16 s2, s11
-; CHECK-MVE-NEXT:    vseleq.f16 s16, s11, s7
+; CHECK-MVE-NEXT:    vseleq.f16 s16, s18, s16
 ; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
 ; CHECK-MVE-NEXT:    it vs
 ; CHECK-MVE-NEXT:    movvs r0, #1
 ; CHECK-MVE-NEXT:    cmp r0, #0
 ; CHECK-MVE-NEXT:    cset r0, ne
-; CHECK-MVE-NEXT:    vmov r1, s16
 ; CHECK-MVE-NEXT:    lsls r0, r0, #31
-; CHECK-MVE-NEXT:    vmov.16 q3[6], r1
-; CHECK-MVE-NEXT:    vseleq.f16 s0, s2, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q3[7], r0
+; CHECK-MVE-NEXT:    vseleq.f16 s15, s11, s7
+; CHECK-MVE-NEXT:    vins.f16 s15, s16
 ; CHECK-MVE-NEXT:    vmov q0, q3
 ; CHECK-MVE-NEXT:    vpop {d8, d9}
 ; CHECK-MVE-NEXT:    bx lr

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vcvt.ll b/llvm/test/CodeGen/Thumb2/mve-vcvt.ll
index 93c8af4928fa..4e73779bec72 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcvt.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcvt.ll
@@ -91,47 +91,35 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @foo_half_int16(<8 x i16> %src) {
 ; CHECK-MVE-LABEL: foo_half_int16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmov.s16 r0, q0[0]
-; CHECK-MVE-NEXT:    vmov.s16 r1, q0[1]
-; CHECK-MVE-NEXT:    vmov s4, r0
-; CHECK-MVE-NEXT:    vcvt.f16.s32 s4, s4
-; CHECK-MVE-NEXT:    vmov r0, s4
-; CHECK-MVE-NEXT:    vmov s4, r1
-; CHECK-MVE-NEXT:    vcvt.f16.s32 s4, s4
-; CHECK-MVE-NEXT:    vmov r1, s4
-; CHECK-MVE-NEXT:    vmov.16 q1[0], r0
-; CHECK-MVE-NEXT:    vmov.s16 r0, q0[2]
-; CHECK-MVE-NEXT:    vmov.16 q1[1], r1
-; CHECK-MVE-NEXT:    vmov s8, r0
-; CHECK-MVE-NEXT:    vcvt.f16.s32 s8, s8
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmov.16 q1[2], r0
-; CHECK-MVE-NEXT:    vmov.s16 r0, q0[3]
-; CHECK-MVE-NEXT:    vmov s8, r0
-; CHECK-MVE-NEXT:    vcvt.f16.s32 s8, s8
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmov.16 q1[3], r0
-; CHECK-MVE-NEXT:    vmov.s16 r0, q0[4]
+; CHECK-MVE-NEXT:    vmov q1, q0
+; CHECK-MVE-NEXT:    vmov.s16 r0, q0[1]
+; CHECK-MVE-NEXT:    vmov s0, r0
+; CHECK-MVE-NEXT:    vmov.s16 r0, q1[0]
+; CHECK-MVE-NEXT:    vcvt.f16.s32 s8, s0
+; CHECK-MVE-NEXT:    vmov s0, r0
+; CHECK-MVE-NEXT:    vcvt.f16.s32 s0, s0
+; CHECK-MVE-NEXT:    vmov.s16 r0, q1[3]
+; CHECK-MVE-NEXT:    vins.f16 s0, s8
 ; CHECK-MVE-NEXT:    vmov s8, r0
+; CHECK-MVE-NEXT:    vmov.s16 r0, q1[2]
 ; CHECK-MVE-NEXT:    vcvt.f16.s32 s8, s8
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmov.16 q1[4], r0
-; CHECK-MVE-NEXT:    vmov.s16 r0, q0[5]
+; CHECK-MVE-NEXT:    vmov s10, r0
+; CHECK-MVE-NEXT:    vmov.s16 r0, q1[5]
+; CHECK-MVE-NEXT:    vcvt.f16.s32 s1, s10
+; CHECK-MVE-NEXT:    vins.f16 s1, s8
 ; CHECK-MVE-NEXT:    vmov s8, r0
+; CHECK-MVE-NEXT:    vmov.s16 r0, q1[4]
 ; CHECK-MVE-NEXT:    vcvt.f16.s32 s8, s8
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmov.16 q1[5], r0
-; CHECK-MVE-NEXT:    vmov.s16 r0, q0[6]
+; CHECK-MVE-NEXT:    vmov s10, r0
+; CHECK-MVE-NEXT:    vmov.s16 r0, q1[7]
+; CHECK-MVE-NEXT:    vcvt.f16.s32 s2, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s8
 ; CHECK-MVE-NEXT:    vmov s8, r0
+; CHECK-MVE-NEXT:    vmov.s16 r0, q1[6]
 ; CHECK-MVE-NEXT:    vcvt.f16.s32 s8, s8
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmov.16 q1[6], r0
-; CHECK-MVE-NEXT:    vmov.s16 r0, q0[7]
-; CHECK-MVE-NEXT:    vmov s0, r0
-; CHECK-MVE-NEXT:    vcvt.f16.s32 s0, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q1[7], r0
-; CHECK-MVE-NEXT:    vmov q0, q1
+; CHECK-MVE-NEXT:    vmov s4, r0
+; CHECK-MVE-NEXT:    vcvt.f16.s32 s3, s4
+; CHECK-MVE-NEXT:    vins.f16 s3, s8
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: foo_half_int16:
@@ -146,47 +134,35 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @foo_half_uint16(<8 x i16> %src) {
 ; CHECK-MVE-LABEL: foo_half_uint16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-MVE-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-MVE-NEXT:    vmov s4, r0
-; CHECK-MVE-NEXT:    vcvt.f16.u32 s4, s4
-; CHECK-MVE-NEXT:    vmov r0, s4
-; CHECK-MVE-NEXT:    vmov s4, r1
-; CHECK-MVE-NEXT:    vcvt.f16.u32 s4, s4
-; CHECK-MVE-NEXT:    vmov r1, s4
-; CHECK-MVE-NEXT:    vmov.16 q1[0], r0
-; CHECK-MVE-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-MVE-NEXT:    vmov.16 q1[1], r1
-; CHECK-MVE-NEXT:    vmov s8, r0
-; CHECK-MVE-NEXT:    vcvt.f16.u32 s8, s8
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmov.16 q1[2], r0
-; CHECK-MVE-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-MVE-NEXT:    vmov s8, r0
-; CHECK-MVE-NEXT:    vcvt.f16.u32 s8, s8
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmov.16 q1[3], r0
-; CHECK-MVE-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-MVE-NEXT:    vmov q1, q0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-MVE-NEXT:    vmov s0, r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-MVE-NEXT:    vcvt.f16.u32 s8, s0
+; CHECK-MVE-NEXT:    vmov s0, r0
+; CHECK-MVE-NEXT:    vcvt.f16.u32 s0, s0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-MVE-NEXT:    vins.f16 s0, s8
 ; CHECK-MVE-NEXT:    vmov s8, r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q1[2]
 ; CHECK-MVE-NEXT:    vcvt.f16.u32 s8, s8
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmov.16 q1[4], r0
-; CHECK-MVE-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-MVE-NEXT:    vmov s10, r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-MVE-NEXT:    vcvt.f16.u32 s1, s10
+; CHECK-MVE-NEXT:    vins.f16 s1, s8
 ; CHECK-MVE-NEXT:    vmov s8, r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q1[4]
 ; CHECK-MVE-NEXT:    vcvt.f16.u32 s8, s8
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmov.16 q1[5], r0
-; CHECK-MVE-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-MVE-NEXT:    vmov s10, r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-MVE-NEXT:    vcvt.f16.u32 s2, s10
+; CHECK-MVE-NEXT:    vins.f16 s2, s8
 ; CHECK-MVE-NEXT:    vmov s8, r0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q1[6]
 ; CHECK-MVE-NEXT:    vcvt.f16.u32 s8, s8
-; CHECK-MVE-NEXT:    vmov r0, s8
-; CHECK-MVE-NEXT:    vmov.16 q1[6], r0
-; CHECK-MVE-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-MVE-NEXT:    vmov s0, r0
-; CHECK-MVE-NEXT:    vcvt.f16.u32 s0, s0
-; CHECK-MVE-NEXT:    vmov r0, s0
-; CHECK-MVE-NEXT:    vmov.16 q1[7], r0
-; CHECK-MVE-NEXT:    vmov q0, q1
+; CHECK-MVE-NEXT:    vmov s4, r0
+; CHECK-MVE-NEXT:    vcvt.f16.u32 s3, s4
+; CHECK-MVE-NEXT:    vins.f16 s3, s8
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: foo_half_uint16:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld2.ll b/llvm/test/CodeGen/Thumb2/mve-vld2.ll
index e88a5704db02..b6ef16cb3e05 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld2.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld2.ll
@@ -205,41 +205,33 @@ entry:
 define void @vld2_v8i16_align1(<16 x i16> *%src, <8 x i16> *%dst) {
 ; CHECK-LABEL: vld2_v8i16_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u8 q1, [r0]
-; CHECK-NEXT:    vldrb.u8 q2, [r0, #16]
-; CHECK-NEXT:    vmov.u16 r2, q1[1]
+; CHECK-NEXT:    vldrb.u8 q2, [r0]
+; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
+; CHECK-NEXT:    vmov.f64 d2, d4
 ; CHECK-NEXT:    vmov.u16 r0, q2[1]
-; CHECK-NEXT:    vmov.16 q0[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[3]
-; CHECK-NEXT:    vmov.16 q0[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[5]
-; CHECK-NEXT:    vmov.16 q0[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[7]
-; CHECK-NEXT:    vmov.16 q0[3], r2
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[3]
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[5]
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[7]
-; CHECK-NEXT:    vmov.16 q0[7], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
 ; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
+; CHECK-NEXT:    vmov.u16 r0, q2[3]
 ; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
 ; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.u16 r0, q2[7]
+; CHECK-NEXT:    vins.f16 s4, s9
 ; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[0]
+; CHECK-NEXT:    vmov.f32 s5, s10
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vins.f16 s5, s11
 ; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vmov.f32 s6, s0
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vins.f16 s6, s1
 ; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[4]
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vmov.f32 s7, s2
 ; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[6]
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vins.f16 s7, s3
 ; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vadd.i16 q0, q3, q0
+; CHECK-NEXT:    vadd.i16 q0, q1, q3
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
@@ -559,16 +551,10 @@ define void @vld2_v2f16(<4 x half> *%src, <2 x half> *%dst) {
 ; CHECK-NEXT:    vmov.32 q0[0], r2
 ; CHECK-NEXT:    vmov.32 q0[1], r0
 ; CHECK-NEXT:    vmovx.f16 s4, s1
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov.16 q1[0], r2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov.16 q1[1], r0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov.16 q0[0], r2
-; CHECK-NEXT:    vmov.16 q0[1], r0
-; CHECK-NEXT:    vadd.f16 q0, q0, q1
+; CHECK-NEXT:    vmovx.f16 s8, s0
+; CHECK-NEXT:    vins.f16 s8, s4
+; CHECK-NEXT:    vins.f16 s0, s1
+; CHECK-NEXT:    vadd.f16 q0, q0, q2
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    str r0, [r1]
 ; CHECK-NEXT:    bx lr
@@ -585,27 +571,16 @@ define void @vld2_v4f16(<8 x half> *%src, <4 x half> *%dst) {
 ; CHECK-LABEL: vld2_v4f16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.u16 q0, [r0]
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmovx.f16 s8, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov.16 q1[0], r2
-; CHECK-NEXT:    vmov.16 q1[1], r0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.16 q1[2], r0
-; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmovx.f16 s4, s0
+; CHECK-NEXT:    vins.f16 s0, s1
 ; CHECK-NEXT:    vmovx.f16 s8, s1
-; CHECK-NEXT:    vmovx.f16 s12, s2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov.16 q2[1], r2
-; CHECK-NEXT:    vmovx.f16 s12, s3
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov.16 q1[3], r0
-; CHECK-NEXT:    vadd.f16 q0, q1, q2
+; CHECK-NEXT:    vins.f16 s4, s8
+; CHECK-NEXT:    vmovx.f16 s8, s3
+; CHECK-NEXT:    vmovx.f16 s5, s2
+; CHECK-NEXT:    vmov.f32 s1, s2
+; CHECK-NEXT:    vins.f16 s5, s8
+; CHECK-NEXT:    vins.f16 s1, s3
+; CHECK-NEXT:    vadd.f16 q0, q0, q1
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    strd r0, r2, [r1]
@@ -662,49 +637,29 @@ define void @vld2_v8f16_align1(<16 x half> *%src, <8 x half> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8}
 ; CHECK-NEXT:    vpush {d8}
-; CHECK-NEXT:    vldrb.u8 q2, [r0]
+; CHECK-NEXT:    vldrb.u8 q3, [r0]
 ; CHECK-NEXT:    vldrb.u8 q1, [r0, #16]
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmovx.f16 s12, s8
-; CHECK-NEXT:    vmov.16 q0[0], r2
-; CHECK-NEXT:    vmov r3, s9
-; CHECK-NEXT:    vmov.16 q0[1], r3
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    vmov.16 q0[2], r2
-; CHECK-NEXT:    vmov r2, s11
-; CHECK-NEXT:    vmov.16 q0[3], r2
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmovx.f16 s12, s9
-; CHECK-NEXT:    vmovx.f16 s16, s10
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.16 q3[1], r2
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmovx.f16 s8, s11
-; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmovx.f16 s8, s4
-; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmovx.f16 s8, s5
-; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmovx.f16 s8, s6
-; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmovx.f16 s8, s7
-; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.16 q0[7], r0
-; CHECK-NEXT:    vadd.f16 q0, q0, q3
+; CHECK-NEXT:    vmov.f64 d0, d6
+; CHECK-NEXT:    vmovx.f16 s16, s13
+; CHECK-NEXT:    vmovx.f16 s8, s12
+; CHECK-NEXT:    vins.f16 s8, s16
+; CHECK-NEXT:    vmovx.f16 s16, s15
+; CHECK-NEXT:    vmovx.f16 s9, s14
+; CHECK-NEXT:    vmovx.f16 s12, s5
+; CHECK-NEXT:    vins.f16 s0, s13
+; CHECK-NEXT:    vins.f16 s9, s16
+; CHECK-NEXT:    vmov.f32 s1, s14
+; CHECK-NEXT:    vmovx.f16 s10, s4
+; CHECK-NEXT:    vins.f16 s1, s15
+; CHECK-NEXT:    vins.f16 s10, s12
+; CHECK-NEXT:    vmov.f32 s2, s4
+; CHECK-NEXT:    vmovx.f16 s12, s7
+; CHECK-NEXT:    vmovx.f16 s11, s6
+; CHECK-NEXT:    vins.f16 s2, s5
+; CHECK-NEXT:    vins.f16 s6, s7
+; CHECK-NEXT:    vins.f16 s11, s12
+; CHECK-NEXT:    vmov.f32 s3, s6
+; CHECK-NEXT:    vadd.f16 q0, q0, q2
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    vpop {d8}
 ; CHECK-NEXT:    bx lr

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll
index 22ba78a9cbcb..3795f96766b8 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll
@@ -330,73 +330,67 @@ define void @vld3_v8i16(<24 x i16> *%src, <8 x i16> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    vmov.u16 r0, q2[4]
-; CHECK-NEXT:    vmov.16 q4[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[5]
-; CHECK-NEXT:    vmov.16 q4[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[0]
+; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vmov.16 q4[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmov.16 q4[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-NEXT:    vmov.16 q4[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.16 q4[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
 ; CHECK-NEXT:    vmov.16 q5[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[7]
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
 ; CHECK-NEXT:    vmov.16 q5[7], r0
-; CHECK-NEXT:    vmov.16 q4[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[3]
-; CHECK-NEXT:    vmov.f32 s22, s8
-; CHECK-NEXT:    vmov.16 q4[3], r2
-; CHECK-NEXT:    vmov q3, q5
 ; CHECK-NEXT:    vmov.f32 s18, s7
-; CHECK-NEXT:    vmovnb.i32 q3, q4
+; CHECK-NEXT:    vmov.f32 s22, s0
+; CHECK-NEXT:    vmov q3, q5
 ; CHECK-NEXT:    vmov r2, s16
+; CHECK-NEXT:    vmovnb.i32 q3, q4
 ; CHECK-NEXT:    vmov r0, s14
 ; CHECK-NEXT:    vmov q3[2], q3[0], r2, r0
 ; CHECK-NEXT:    vmov r0, s23
 ; CHECK-NEXT:    vmov r2, s17
 ; CHECK-NEXT:    vmov q3[3], q3[1], r2, r0
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vmov.u16 r0, q2[0]
 ; CHECK-NEXT:    vmov.16 q4[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.u16 r0, q2[3]
 ; CHECK-NEXT:    vmov.16 q4[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.u16 r0, q2[6]
 ; CHECK-NEXT:    vmov.16 q4[2], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[1]
 ; CHECK-NEXT:    vmov.16 q4[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[4]
 ; CHECK-NEXT:    vmov.16 q4[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
 ; CHECK-NEXT:    vmov.16 q5[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
 ; CHECK-NEXT:    vmov.16 q5[7], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[7]
 ; CHECK-NEXT:    vmov.16 q4[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.f32 s19, s23
-; CHECK-NEXT:    vmov.16 q5[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmov.16 q5[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.16 q5[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.16 q5[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.f32 s19, s23
+; CHECK-NEXT:    vmovx.f16 s20, s8
+; CHECK-NEXT:    vins.f16 s20, s10
+; CHECK-NEXT:    vmovx.f16 s21, s11
+; CHECK-NEXT:    vins.f16 s21, s5
 ; CHECK-NEXT:    vmov.16 q5[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[0]
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[3]
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[6]
-; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vmov.16 q1[5], r0
 ; CHECK-NEXT:    vmov r2, s20
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    vmovnb.i32 q1, q5
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r0
-; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmovx.f16 s7, s1
+; CHECK-NEXT:    vins.f16 s7, s3
+; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmovnb.i32 q0, q5
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    vmov r0, s7
 ; CHECK-NEXT:    vmov r2, s21
-; CHECK-NEXT:    vmov q1[3], q1[1], r2, r0
-; CHECK-NEXT:    vadd.i16 q0, q4, q1
+; CHECK-NEXT:    vmov q0[3], q0[1], r2, r0
+; CHECK-NEXT:    vadd.i16 q0, q4, q0
 ; CHECK-NEXT:    vadd.i16 q0, q0, q3
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
@@ -417,144 +411,132 @@ define void @vld3_v16i16(<48 x i16> *%src, <16 x i16> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    vmov.16 q4[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[5]
-; CHECK-NEXT:    vmov.16 q4[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[0]
-; CHECK-NEXT:    vmov.16 q4[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[3]
-; CHECK-NEXT:    vmov.16 q4[3], r2
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmov.u16 r2, q3[0]
+; CHECK-NEXT:    vmov.16 q1[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[3]
+; CHECK-NEXT:    vmov.16 q1[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[6]
+; CHECK-NEXT:    vmov.16 q1[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[1]
+; CHECK-NEXT:    vmov.16 q1[3], r2
 ; CHECK-NEXT:    vmov.u16 r2, q2[4]
+; CHECK-NEXT:    vmov.16 q1[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[2]
 ; CHECK-NEXT:    vmov.16 q5[6], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[7]
+; CHECK-NEXT:    vmov.u16 r2, q4[5]
 ; CHECK-NEXT:    vmov.16 q5[7], r2
-; CHECK-NEXT:    vmov.f32 s18, s7
-; CHECK-NEXT:    vmov.f32 s22, s8
-; CHECK-NEXT:    vmov q3, q5
+; CHECK-NEXT:    vmov.u16 r2, q2[7]
+; CHECK-NEXT:    vmov.16 q1[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[4]
+; CHECK-NEXT:    vmov.16 q6[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[2]
+; CHECK-NEXT:    vmov.f32 s7, s23
+; CHECK-NEXT:    vmov.16 q5[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q3[5]
+; CHECK-NEXT:    vmov.16 q5[1], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[0]
+; CHECK-NEXT:    vmov.16 q5[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[3]
+; CHECK-NEXT:    vmov.16 q5[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[7]
+; CHECK-NEXT:    vmov.16 q6[7], r2
+; CHECK-NEXT:    vmov.f32 s22, s11
+; CHECK-NEXT:    vmov.f32 s26, s16
+; CHECK-NEXT:    vmov q7, q6
+; CHECK-NEXT:    vmov r3, s20
+; CHECK-NEXT:    vmovnb.i32 q7, q5
+; CHECK-NEXT:    vmov r12, s27
+; CHECK-NEXT:    vmov r2, s30
+; CHECK-NEXT:    vmov q6[2], q6[0], r3, r2
+; CHECK-NEXT:    vmov r2, s21
+; CHECK-NEXT:    vmov q6[3], q6[1], r2, r12
+; CHECK-NEXT:    vmov.u16 r2, q4[0]
+; CHECK-NEXT:    vmov.16 q5[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[5]
+; CHECK-NEXT:    vmovx.f16 s23, s17
+; CHECK-NEXT:    vins.f16 s23, s19
+; CHECK-NEXT:    vmovx.f16 s16, s12
+; CHECK-NEXT:    vins.f16 s16, s14
+; CHECK-NEXT:    vmovx.f16 s17, s15
+; CHECK-NEXT:    vins.f16 s17, s9
+; CHECK-NEXT:    vmov q2, q5
+; CHECK-NEXT:    vmov.16 q4[4], r2
+; CHECK-NEXT:    vmovnb.i32 q2, q4
 ; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    vmovnb.i32 q3, q4
-; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
 ; CHECK-NEXT:    vmov r2, s23
 ; CHECK-NEXT:    vmov r3, s17
-; CHECK-NEXT:    vmov q3[3], q3[1], r3, r2
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #32]
+; CHECK-NEXT:    vmov q2[3], q2[1], r3, r2
+; CHECK-NEXT:    vadd.i16 q1, q1, q2
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vmov.u16 r0, q5[4]
+; CHECK-NEXT:    vadd.i16 q1, q1, q6
+; CHECK-NEXT:    vmov.u16 r2, q2[2]
+; CHECK-NEXT:    vmov.16 q6[6], r0
+; CHECK-NEXT:    vmov.16 q3[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[5]
+; CHECK-NEXT:    vmov.16 q3[1], r2
 ; CHECK-NEXT:    vmov.u16 r2, q0[0]
-; CHECK-NEXT:    vmov.16 q4[0], r2
+; CHECK-NEXT:    vmov.u16 r0, q5[7]
+; CHECK-NEXT:    vmov.16 q3[2], r2
+; CHECK-NEXT:    vmov.16 q6[7], r0
 ; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    vmov.16 q3[3], r2
+; CHECK-NEXT:    vmov.f32 s26, s20
+; CHECK-NEXT:    vmov.f32 s14, s3
+; CHECK-NEXT:    vmov q4, q6
+; CHECK-NEXT:    vmovnb.i32 q4, q3
+; CHECK-NEXT:    vmov.u16 r2, q2[0]
+; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    vmov.16 q4[0], r2
+; CHECK-NEXT:    vmov.u16 r2, q2[3]
+; CHECK-NEXT:    vmov r3, s27
 ; CHECK-NEXT:    vmov.16 q4[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[6]
+; CHECK-NEXT:    vmov.u16 r2, q2[6]
 ; CHECK-NEXT:    vmov.16 q4[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[1]
-; CHECK-NEXT:    vmov.16 q4[3], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[4]
-; CHECK-NEXT:    vmov.16 q4[4], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[2]
-; CHECK-NEXT:    vmov.16 q5[6], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[5]
-; CHECK-NEXT:    vmov.16 q5[7], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[7]
-; CHECK-NEXT:    vmov.16 q4[5], r2
 ; CHECK-NEXT:    vmov.u16 r2, q0[1]
-; CHECK-NEXT:    vmov.f32 s19, s23
-; CHECK-NEXT:    vmov.16 q5[0], r2
+; CHECK-NEXT:    vmov.16 q4[3], r2
 ; CHECK-NEXT:    vmov.u16 r2, q0[4]
-; CHECK-NEXT:    vmov.16 q5[1], r2
+; CHECK-NEXT:    vmov.16 q4[4], r2
+; CHECK-NEXT:    vmov.u16 r2, q5[2]
+; CHECK-NEXT:    vmov.16 q7[6], r2
+; CHECK-NEXT:    vmov.u16 r2, q5[5]
+; CHECK-NEXT:    vmov.16 q7[7], r2
 ; CHECK-NEXT:    vmov.u16 r2, q0[7]
-; CHECK-NEXT:    vmov.16 q5[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[2]
-; CHECK-NEXT:    vmov.16 q5[3], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[5]
-; CHECK-NEXT:    vmov.16 q5[4], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[0]
-; CHECK-NEXT:    vmov.16 q0[5], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[3]
-; CHECK-NEXT:    vmov.16 q0[6], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[6]
-; CHECK-NEXT:    vmov.16 q0[7], r2
-; CHECK-NEXT:    vmov r3, s20
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vmovnb.i32 q1, q5
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov q1[2], q1[0], r3, r2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r3, s21
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r2
-; CHECK-NEXT:    vmov.u16 r2, q2[0]
-; CHECK-NEXT:    vadd.i16 q0, q4, q1
-; CHECK-NEXT:    vmov.16 q1[0], r2
-; CHECK-NEXT:    vadd.i16 q0, q0, q3
-; CHECK-NEXT:    vmov.u16 r2, q2[3]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT:    vmov.16 q1[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[6]
-; CHECK-NEXT:    vmov.16 q1[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q3[1]
-; CHECK-NEXT:    vmov.u16 r0, q4[2]
-; CHECK-NEXT:    vmov.16 q1[3], r2
-; CHECK-NEXT:    vmov.u16 r2, q3[4]
-; CHECK-NEXT:    vmov.16 q5[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[5]
-; CHECK-NEXT:    vmov.16 q1[4], r2
-; CHECK-NEXT:    vmov.16 q5[7], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[7]
-; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[2]
-; CHECK-NEXT:    vmov.f32 s7, s23
-; CHECK-NEXT:    vmov.16 q5[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[5]
-; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
-; CHECK-NEXT:    vmov.16 q5[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[0]
-; CHECK-NEXT:    vmov.16 q5[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[3]
-; CHECK-NEXT:    vmov.16 q5[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[4]
-; CHECK-NEXT:    vmov.16 q7[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[7]
-; CHECK-NEXT:    vmov.16 q7[7], r0
-; CHECK-NEXT:    vmov.f32 s22, s15
-; CHECK-NEXT:    vmov.f32 s30, s16
-; CHECK-NEXT:    vmov q6, q7
-; CHECK-NEXT:    vmov r2, s20
-; CHECK-NEXT:    vmovnb.i32 q6, q5
-; CHECK-NEXT:    vmov r0, s26
+; CHECK-NEXT:    vmov.16 q4[5], r2
+; CHECK-NEXT:    vmov r2, s12
 ; CHECK-NEXT:    vmov q6[2], q6[0], r2, r0
-; CHECK-NEXT:    vmov r0, s31
+; CHECK-NEXT:    vmov r0, s13
+; CHECK-NEXT:    vmov q6[3], q6[1], r0, r3
+; CHECK-NEXT:    vmov.u16 r0, q5[0]
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vmovx.f16 s15, s21
+; CHECK-NEXT:    vmov.f32 s19, s31
+; CHECK-NEXT:    vins.f16 s15, s23
+; CHECK-NEXT:    vmovx.f16 s20, s8
+; CHECK-NEXT:    vins.f16 s20, s10
+; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    vmovx.f16 s21, s11
+; CHECK-NEXT:    vins.f16 s21, s1
+; CHECK-NEXT:    vmov q0, q3
+; CHECK-NEXT:    vmov.16 q5[4], r0
+; CHECK-NEXT:    vmovnb.i32 q0, q5
+; CHECK-NEXT:    vmov r2, s20
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    vmov r0, s15
 ; CHECK-NEXT:    vmov r2, s21
-; CHECK-NEXT:    vmov q6[3], q6[1], r2, r0
-; CHECK-NEXT:    vmov.u16 r0, q4[0]
-; CHECK-NEXT:    vmov.16 q5[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[3]
-; CHECK-NEXT:    vmov.16 q5[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[1]
-; CHECK-NEXT:    vmov.16 q7[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[4]
-; CHECK-NEXT:    vmov.16 q7[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[7]
-; CHECK-NEXT:    vmov.16 q7[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[2]
-; CHECK-NEXT:    vmov.16 q7[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[5]
-; CHECK-NEXT:    vmov.16 q7[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[6]
-; CHECK-NEXT:    vmov.16 q5[7], r0
-; CHECK-NEXT:    vmov r2, s28
-; CHECK-NEXT:    vmov q2, q5
-; CHECK-NEXT:    vmovnb.i32 q2, q7
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov q2[2], q2[0], r2, r0
-; CHECK-NEXT:    vmov r0, s23
-; CHECK-NEXT:    vmov r2, s29
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r0
-; CHECK-NEXT:    vadd.i16 q1, q1, q2
-; CHECK-NEXT:    vadd.i16 q1, q1, q6
-; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    vmov q0[3], q0[1], r2, r0
+; CHECK-NEXT:    vadd.i16 q0, q4, q0
+; CHECK-NEXT:    vadd.i16 q0, q0, q6
+; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 entry:
@@ -653,56 +635,54 @@ entry:
 define void @vld3_v8i8(<24 x i8> *%src, <8 x i8> *%dst) {
 ; CHECK-LABEL: vld3_v8i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vldrb.u16 q1, [r0, #16]
-; CHECK-NEXT:    vmov.u8 r2, q0[0]
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.16 q2[0], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[3]
-; CHECK-NEXT:    vmov.16 q2[1], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[6]
-; CHECK-NEXT:    vmov.16 q2[2], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[9]
-; CHECK-NEXT:    vmov.16 q2[3], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[12]
-; CHECK-NEXT:    vmov.16 q2[4], r2
-; CHECK-NEXT:    vmov.u8 r2, q0[15]
-; CHECK-NEXT:    vmov.16 q2[5], r2
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[1]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrb.u16 q0, [r0, #16]
+; CHECK-NEXT:    vmov.u8 r0, q1[1]
+; CHECK-NEXT:    vmov.16 q2[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[4]
+; CHECK-NEXT:    vmov.16 q2[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[7]
+; CHECK-NEXT:    vmov.16 q2[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[10]
+; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[13]
+; CHECK-NEXT:    vmov.16 q2[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vmov.16 q2[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[0]
 ; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[4]
+; CHECK-NEXT:    vmov.u8 r0, q1[3]
 ; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[7]
+; CHECK-NEXT:    vmov.u8 r0, q1[6]
 ; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[10]
+; CHECK-NEXT:    vmov.u8 r0, q1[9]
 ; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[13]
+; CHECK-NEXT:    vmov.u8 r0, q1[12]
 ; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-NEXT:    vmov.u8 r0, q1[15]
 ; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmovx.f16 s11, s1
 ; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vins.f16 s11, s3
 ; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[2]
-; CHECK-NEXT:    vadd.i16 q2, q2, q3
+; CHECK-NEXT:    vmov.u8 r0, q1[2]
+; CHECK-NEXT:    vadd.i16 q2, q3, q2
 ; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[5]
+; CHECK-NEXT:    vmov.u8 r0, q1[5]
 ; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[8]
+; CHECK-NEXT:    vmov.u8 r0, q1[8]
 ; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[11]
+; CHECK-NEXT:    vmov.u8 r0, q1[11]
 ; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[14]
+; CHECK-NEXT:    vmov.u8 r0, q1[14]
 ; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
 ; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
 ; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
 ; CHECK-NEXT:    vmov.16 q3[7], r0
 ; CHECK-NEXT:    vadd.i16 q0, q2, q3
 ; CHECK-NEXT:    vstrb.16 q0, [r1]
@@ -1229,21 +1209,13 @@ define void @vld3_v2f16(<6 x half> *%src, <2 x half> *%dst) {
 ; CHECK-NEXT:    vmov.32 q0[1], r3
 ; CHECK-NEXT:    vmov.32 q0[2], r0
 ; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r0, s4
 ; CHECK-NEXT:    vmovx.f16 s8, s1
-; CHECK-NEXT:    vmov.16 q1[0], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q1[1], r2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov.16 q2[0], r2
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vadd.f16 q1, q2, q1
+; CHECK-NEXT:    vins.f16 s4, s2
+; CHECK-NEXT:    vins.f16 s0, s8
 ; CHECK-NEXT:    vmovx.f16 s8, s2
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q0[0], r2
-; CHECK-NEXT:    vmov.16 q0[1], r0
+; CHECK-NEXT:    vadd.f16 q1, q0, q1
+; CHECK-NEXT:    vins.f16 s1, s8
+; CHECK-NEXT:    vmov.f32 s0, s1
 ; CHECK-NEXT:    vadd.f16 q0, q1, q0
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    str r0, [r1]
@@ -1262,48 +1234,35 @@ entry:
 define void @vld3_v4f16(<12 x half> *%src, <4 x half> *%dst) {
 ; CHECK-LABEL: vld3_v4f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8}
-; CHECK-NEXT:    vpush {d8}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmovx.f16 s4, s3
-; CHECK-NEXT:    vmov.16 q2[0], r3
-; CHECK-NEXT:    vmovx.f16 s12, s1
-; CHECK-NEXT:    vmov.16 q2[1], r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov.16 q2[2], r2
 ; CHECK-NEXT:    ldrd r2, r0, [r0, #16]
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov.32 q1[1], r0
-; CHECK-NEXT:    vmovx.f16 s0, s2
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmovx.f16 s16, s4
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov.16 q3[0], r2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.16 q0[1], r2
-; CHECK-NEXT:    vmovx.f16 s4, s5
-; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vadd.f16 q2, q3, q2
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vadd.f16 q0, q2, q0
+; CHECK-NEXT:    vmovx.f16 s4, s0
+; CHECK-NEXT:    vmovx.f16 s8, s1
+; CHECK-NEXT:    vins.f16 s4, s2
+; CHECK-NEXT:    vins.f16 s0, s8
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.f32 s14, s3
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmovx.f16 s5, s3
+; CHECK-NEXT:    vmovx.f16 s12, s8
+; CHECK-NEXT:    vmov q4, q0
+; CHECK-NEXT:    vins.f16 s14, s12
+; CHECK-NEXT:    vmovx.f16 s12, s2
+; CHECK-NEXT:    vins.f16 s1, s12
+; CHECK-NEXT:    vmovx.f16 s12, s9
+; CHECK-NEXT:    vins.f16 s5, s9
+; CHECK-NEXT:    vmov.f32 s17, s14
+; CHECK-NEXT:    vmov.f32 s0, s1
+; CHECK-NEXT:    vins.f16 s8, s12
+; CHECK-NEXT:    vadd.f16 q1, q4, q1
+; CHECK-NEXT:    vmov.f32 s1, s8
+; CHECK-NEXT:    vadd.f16 q0, q1, q0
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    strd r0, r2, [r1]
-; CHECK-NEXT:    vpop {d8}
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <12 x half>, <12 x half>* %src, align 4
@@ -1321,90 +1280,60 @@ define void @vld3_v8f16(<24 x half> *%src, <8 x half> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r7, lr}
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12}
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vldrw.u32 q3, [r0]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT:    vmovx.f16 s0, s10
-; CHECK-NEXT:    vmov r3, s9
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov.16 q0[0], r3
-; CHECK-NEXT:    vmov.16 q0[1], r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmovx.f16 s12, s5
-; CHECK-NEXT:    vmov.16 q0[2], r2
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmovx.f16 s12, s19
-; CHECK-NEXT:    vmov.16 q0[3], r2
-; CHECK-NEXT:    vmov r3, s18
+; CHECK-NEXT:    vmovx.f16 s4, s14
+; CHECK-NEXT:    vmov.f32 s0, s13
+; CHECK-NEXT:    vins.f16 s0, s4
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vmovx.f16 s20, s19
+; CHECK-NEXT:    vmovx.f16 s24, s13
+; CHECK-NEXT:    vmovx.f16 s8, s5
+; CHECK-NEXT:    vmov.f32 s1, s4
+; CHECK-NEXT:    vins.f16 s1, s8
+; CHECK-NEXT:    vmov.f32 s11, s18
+; CHECK-NEXT:    vins.f16 s11, s20
 ; CHECK-NEXT:    vmov.f32 s2, s7
 ; CHECK-NEXT:    vmovx.f16 s20, s16
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov.16 q3[6], r3
-; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vmovx.f16 s24, s11
-; CHECK-NEXT:    vmov r5, s8
-; CHECK-NEXT:    vmov.f32 s14, s16
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov.16 q5[4], r2
-; CHECK-NEXT:    vmov.16 q5[5], r0
-; CHECK-NEXT:    vmov r0, s19
-; CHECK-NEXT:    vmov r12, s22
-; CHECK-NEXT:    vmovx.f16 s20, s17
-; CHECK-NEXT:    vmov r2, s20
-; CHECK-NEXT:    vmov.16 q5[6], r2
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vmov.16 q5[7], r0
-; CHECK-NEXT:    vmov lr, s23
-; CHECK-NEXT:    vmovx.f16 s20, s6
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vmov.16 q5[4], r0
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov.16 q5[5], r2
-; CHECK-NEXT:    vmov r3, s22
-; CHECK-NEXT:    vmovx.f16 s20, s8
-; CHECK-NEXT:    vmov r2, s20
-; CHECK-NEXT:    vmov.16 q5[0], r2
-; CHECK-NEXT:    vmov.16 q5[1], r0
-; CHECK-NEXT:    vmov r0, s24
-; CHECK-NEXT:    vmov.16 q5[2], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.16 q5[3], r0
-; CHECK-NEXT:    vmov r2, s20
-; CHECK-NEXT:    vmovx.f16 s20, s9
-; CHECK-NEXT:    vmov r0, s21
-; CHECK-NEXT:    vmov r4, s20
-; CHECK-NEXT:    vmov.16 q5[0], r5
-; CHECK-NEXT:    vmov.16 q5[1], r4
-; CHECK-NEXT:    vmov r4, s11
-; CHECK-NEXT:    vmovx.f16 s8, s4
-; CHECK-NEXT:    vmov.16 q5[2], r4
-; CHECK-NEXT:    vmov r4, s8
-; CHECK-NEXT:    vmovx.f16 s8, s18
-; CHECK-NEXT:    vmov.16 q5[3], r4
-; CHECK-NEXT:    vmov r4, s6
-; CHECK-NEXT:    vmov.16 q5[4], r4
-; CHECK-NEXT:    vmov r4, s17
-; CHECK-NEXT:    vmovx.f16 s4, s7
-; CHECK-NEXT:    vmov r5, s8
-; CHECK-NEXT:    vmov.16 q2[6], r4
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov.16 q5[5], r4
+; CHECK-NEXT:    vmov.f32 s10, s16
+; CHECK-NEXT:    vins.f16 s2, s20
+; CHECK-NEXT:    vmov.f64 d10, d6
+; CHECK-NEXT:    vins.f16 s20, s24
+; CHECK-NEXT:    vmovx.f16 s24, s4
+; CHECK-NEXT:    vmov.f32 s21, s15
+; CHECK-NEXT:    vins.f16 s21, s24
+; CHECK-NEXT:    vmovx.f16 s24, s7
+; CHECK-NEXT:    vmov.f32 s22, s6
+; CHECK-NEXT:    vins.f16 s22, s24
+; CHECK-NEXT:    vmovx.f16 s24, s12
+; CHECK-NEXT:    vins.f16 s24, s14
+; CHECK-NEXT:    vmov r12, s2
+; CHECK-NEXT:    vmovx.f16 s25, s15
+; CHECK-NEXT:    vmovx.f16 s15, s17
+; CHECK-NEXT:    vins.f16 s15, s19
+; CHECK-NEXT:    vins.f16 s25, s5
+; CHECK-NEXT:    vmov r0, s15
+; CHECK-NEXT:    vmovx.f16 s12, s18
+; CHECK-NEXT:    vmovx.f16 s6, s6
+; CHECK-NEXT:    vins.f16 s17, s12
+; CHECK-NEXT:    vins.f16 s6, s16
+; CHECK-NEXT:    vmov r5, s24
+; CHECK-NEXT:    vmov r2, s6
 ; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmov.16 q2[7], r5
 ; CHECK-NEXT:    vmov q1[2], q1[0], r4, r12
 ; CHECK-NEXT:    vmov r4, s1
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
-; CHECK-NEXT:    vmov r5, s15
-; CHECK-NEXT:    vmov q0[3], q0[1], r0, lr
-; CHECK-NEXT:    vmov.f32 s23, s11
-; CHECK-NEXT:    vmov q1[3], q1[1], r4, r5
+; CHECK-NEXT:    vmov lr, s25
+; CHECK-NEXT:    vmov q0[2], q0[0], r5, r2
+; CHECK-NEXT:    vmov r3, s11
+; CHECK-NEXT:    vmov q0[3], q0[1], lr, r0
+; CHECK-NEXT:    vmov.f32 s23, s17
+; CHECK-NEXT:    vmov q1[3], q1[1], r4, r3
 ; CHECK-NEXT:    vadd.f16 q0, q5, q0
 ; CHECK-NEXT:    vadd.f16 q0, q0, q1
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %l1 = load <24 x half>, <24 x half>* %src, align 4
@@ -1422,171 +1351,111 @@ define void @vld3_v16f16(<48 x half> *%src, <16 x half> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, lr}
 ; CHECK-NEXT:    push {r4, r5, r6, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #64]
-; CHECK-NEXT:    vmovx.f16 s0, s11
-; CHECK-NEXT:    vmovx.f16 s4, s8
-; CHECK-NEXT:    vmov r2, s10
-; CHECK-NEXT:    vmovx.f16 s20, s17
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov.16 q0[6], r2
-; CHECK-NEXT:    vmov r12, s4
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
 ; CHECK-NEXT:    vmovx.f16 s4, s14
-; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    vmov.16 q0[7], r3
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmov.16 q1[0], r2
-; CHECK-NEXT:    vmov.16 q1[1], r3
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vmov.16 q1[2], r2
-; CHECK-NEXT:    vmov r2, s20
-; CHECK-NEXT:    vmov.16 q1[3], r2
+; CHECK-NEXT:    vmov.f32 s0, s13
+; CHECK-NEXT:    vins.f16 s0, s4
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
+; CHECK-NEXT:    vmovx.f16 s20, s19
 ; CHECK-NEXT:    vmovx.f16 s24, s13
-; CHECK-NEXT:    vmov.f32 s6, s19
-; CHECK-NEXT:    vmovx.f16 s28, s16
-; CHECK-NEXT:    vmov.f32 s2, s8
-; CHECK-NEXT:    vmov r4, s14
-; CHECK-NEXT:    vmov r5, s4
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov.16 q5[4], r2
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    vmov.16 q5[5], r12
-; CHECK-NEXT:    vmov r12, s22
-; CHECK-NEXT:    vmovx.f16 s20, s10
-; CHECK-NEXT:    vmov r3, s20
-; CHECK-NEXT:    vmov.16 q5[6], r2
-; CHECK-NEXT:    vmov.16 q5[7], r3
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    vmov r2, s24
-; CHECK-NEXT:    vmov.16 q6[0], r3
-; CHECK-NEXT:    vmov.16 q6[1], r2
-; CHECK-NEXT:    vmov r2, s15
-; CHECK-NEXT:    vmov.16 q6[2], r2
-; CHECK-NEXT:    vmov r2, s28
-; CHECK-NEXT:    vmov.16 q6[3], r2
-; CHECK-NEXT:    vmov r2, s18
-; CHECK-NEXT:    vmovx.f16 s28, s19
-; CHECK-NEXT:    vmov.16 q6[4], r2
-; CHECK-NEXT:    vmov r2, s28
-; CHECK-NEXT:    vmovx.f16 s28, s18
-; CHECK-NEXT:    vmov.16 q6[5], r2
-; CHECK-NEXT:    vmov r2, s28
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov.16 q7[4], r2
-; CHECK-NEXT:    vmovx.f16 s8, s9
-; CHECK-NEXT:    vmov.16 q7[5], r3
-; CHECK-NEXT:    vmov r3, s8
+; CHECK-NEXT:    vmovx.f16 s8, s5
+; CHECK-NEXT:    vmov.f32 s1, s4
+; CHECK-NEXT:    vins.f16 s1, s8
+; CHECK-NEXT:    vmov.f32 s11, s18
+; CHECK-NEXT:    vins.f16 s11, s20
+; CHECK-NEXT:    vmov.f32 s2, s7
+; CHECK-NEXT:    vmovx.f16 s20, s16
+; CHECK-NEXT:    vmov.f32 s10, s16
+; CHECK-NEXT:    vins.f16 s2, s20
+; CHECK-NEXT:    vmov.f64 d10, d6
+; CHECK-NEXT:    vins.f16 s20, s24
+; CHECK-NEXT:    vmovx.f16 s24, s4
+; CHECK-NEXT:    vmov.f32 s21, s15
+; CHECK-NEXT:    vins.f16 s21, s24
+; CHECK-NEXT:    vmovx.f16 s24, s7
+; CHECK-NEXT:    vmov.f32 s22, s6
+; CHECK-NEXT:    vins.f16 s22, s24
+; CHECK-NEXT:    vmovx.f16 s24, s12
+; CHECK-NEXT:    vins.f16 s24, s14
+; CHECK-NEXT:    vmov r12, s2
+; CHECK-NEXT:    vmovx.f16 s25, s15
+; CHECK-NEXT:    vmovx.f16 s15, s17
+; CHECK-NEXT:    vins.f16 s25, s5
+; CHECK-NEXT:    vmov r6, s24
+; CHECK-NEXT:    vmov lr, s25
+; CHECK-NEXT:    vmovx.f16 s24, s18
+; CHECK-NEXT:    vmovx.f16 s6, s6
+; CHECK-NEXT:    vins.f16 s17, s24
+; CHECK-NEXT:    vins.f16 s6, s16
+; CHECK-NEXT:    vins.f16 s15, s19
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    vmov r5, s0
+; CHECK-NEXT:    vmov q1[2], q1[0], r5, r12
+; CHECK-NEXT:    vmov r5, s1
+; CHECK-NEXT:    vmov r4, s15
+; CHECK-NEXT:    vmov q0[2], q0[0], r6, r3
 ; CHECK-NEXT:    vmov r2, s11
-; CHECK-NEXT:    vmov.16 q2[6], r3
-; CHECK-NEXT:    vmov.16 q2[7], r2
-; CHECK-NEXT:    vmov lr, s30
-; CHECK-NEXT:    vmov r6, s11
-; CHECK-NEXT:    vmovx.f16 s8, s12
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmovx.f16 s12, s15
-; CHECK-NEXT:    vmov.16 q2[0], r3
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    vmov.16 q2[1], r4
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    vmov.16 q2[2], r3
-; CHECK-NEXT:    vmov r3, s17
-; CHECK-NEXT:    vmov.16 q2[3], r3
-; CHECK-NEXT:    vmov.f32 s27, s23
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT:    vmov r4, s9
-; CHECK-NEXT:    vmov q2[2], q2[0], r5, r12
-; CHECK-NEXT:    vmov r5, s3
-; CHECK-NEXT:    vmov q0[2], q0[0], r3, lr
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r6
-; CHECK-NEXT:    vmov q2[3], q2[1], r2, r5
-; CHECK-NEXT:    vadd.f16 q0, q6, q0
-; CHECK-NEXT:    vmovx.f16 s12, s16
-; CHECK-NEXT:    vadd.f16 q1, q0, q2
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vmov q0[3], q0[1], lr, r4
+; CHECK-NEXT:    vmov.f32 s23, s17
+; CHECK-NEXT:    vldrw.u32 q4, [r0]
+; CHECK-NEXT:    vmov q1[3], q1[1], r5, r2
+; CHECK-NEXT:    vadd.f16 q0, q5, q0
+; CHECK-NEXT:    vadd.f16 q1, q0, q1
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
 ; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
-; CHECK-NEXT:    vmovx.f16 s4, s19
-; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov.16 q1[6], r0
-; CHECK-NEXT:    vmov.16 q1[7], r2
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmovx.f16 s12, s10
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    vmov r3, s12
-; CHECK-NEXT:    vmov.16 q3[0], r2
-; CHECK-NEXT:    vmov.16 q3[1], r3
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmovx.f16 s20, s1
-; CHECK-NEXT:    vmov.16 q3[2], r2
-; CHECK-NEXT:    vmov r2, s20
-; CHECK-NEXT:    vmovx.f16 s24, s9
-; CHECK-NEXT:    vmov.16 q3[3], r2
-; CHECK-NEXT:    vmovx.f16 s28, s0
-; CHECK-NEXT:    vmov.f32 s14, s3
-; CHECK-NEXT:    vmov.f32 s6, s16
-; CHECK-NEXT:    vmov r5, s10
-; CHECK-NEXT:    vmov r4, s12
-; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    vmov.16 q5[4], r2
-; CHECK-NEXT:    vmov r2, s17
-; CHECK-NEXT:    vmov.16 q5[5], r0
-; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    vmovx.f16 s20, s18
-; CHECK-NEXT:    vmov r3, s20
-; CHECK-NEXT:    vmov.16 q5[6], r2
-; CHECK-NEXT:    vmov.16 q5[7], r3
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmov r2, s24
-; CHECK-NEXT:    vmov.16 q6[0], r3
-; CHECK-NEXT:    vmov.16 q6[1], r2
-; CHECK-NEXT:    vmov r2, s11
-; CHECK-NEXT:    vmov.16 q6[2], r2
-; CHECK-NEXT:    vmov r2, s28
-; CHECK-NEXT:    vmov.16 q6[3], r2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmovx.f16 s28, s3
-; CHECK-NEXT:    vmov.16 q6[4], r2
-; CHECK-NEXT:    vmov r2, s28
-; CHECK-NEXT:    vmovx.f16 s28, s2
-; CHECK-NEXT:    vmov.16 q6[5], r2
-; CHECK-NEXT:    vmov r2, s28
-; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    vmovx.f16 s16, s17
-; CHECK-NEXT:    vmov.16 q7[4], r2
-; CHECK-NEXT:    vmov r6, s16
-; CHECK-NEXT:    vmov.16 q7[5], r3
-; CHECK-NEXT:    vmov r3, s19
-; CHECK-NEXT:    vmov.16 q4[6], r6
-; CHECK-NEXT:    vmov r2, s30
-; CHECK-NEXT:    vmov.16 q4[7], r3
-; CHECK-NEXT:    vmov.f32 s27, s23
-; CHECK-NEXT:    vmov r3, s19
-; CHECK-NEXT:    vmovx.f16 s16, s8
-; CHECK-NEXT:    vmov r6, s16
-; CHECK-NEXT:    vmovx.f16 s8, s11
-; CHECK-NEXT:    vmov.16 q4[0], r6
-; CHECK-NEXT:    vmov r6, s8
-; CHECK-NEXT:    vmov.16 q4[1], r5
-; CHECK-NEXT:    vmov.16 q4[2], r6
-; CHECK-NEXT:    vmov r6, s1
-; CHECK-NEXT:    vmov.16 q4[3], r6
+; CHECK-NEXT:    vmovx.f16 s12, s18
+; CHECK-NEXT:    vmov.f32 s4, s17
+; CHECK-NEXT:    vins.f16 s4, s12
+; CHECK-NEXT:    vmovx.f16 s12, s9
+; CHECK-NEXT:    vmov.f32 s5, s8
+; CHECK-NEXT:    vmovx.f16 s20, s3
+; CHECK-NEXT:    vins.f16 s5, s12
+; CHECK-NEXT:    vmov.f32 s15, s2
+; CHECK-NEXT:    vins.f16 s15, s20
+; CHECK-NEXT:    vmov.f32 s6, s11
+; CHECK-NEXT:    vmovx.f16 s20, s0
+; CHECK-NEXT:    vmovx.f16 s24, s17
+; CHECK-NEXT:    vins.f16 s6, s20
+; CHECK-NEXT:    vmov.f64 d10, d8
+; CHECK-NEXT:    vins.f16 s20, s24
+; CHECK-NEXT:    vmovx.f16 s24, s8
+; CHECK-NEXT:    vmov.f32 s21, s19
+; CHECK-NEXT:    vins.f16 s21, s24
+; CHECK-NEXT:    vmovx.f16 s24, s11
+; CHECK-NEXT:    vmov.f32 s22, s10
+; CHECK-NEXT:    vins.f16 s22, s24
+; CHECK-NEXT:    vmovx.f16 s24, s16
+; CHECK-NEXT:    vins.f16 s24, s18
+; CHECK-NEXT:    vmov.f32 s14, s0
+; CHECK-NEXT:    vmovx.f16 s25, s19
+; CHECK-NEXT:    vmovx.f16 s19, s1
+; CHECK-NEXT:    vins.f16 s25, s9
+; CHECK-NEXT:    vmov r3, s24
+; CHECK-NEXT:    vmov r2, s25
+; CHECK-NEXT:    vmovx.f16 s24, s2
+; CHECK-NEXT:    vmovx.f16 s10, s10
+; CHECK-NEXT:    vins.f16 s1, s24
+; CHECK-NEXT:    vins.f16 s10, s0
+; CHECK-NEXT:    vins.f16 s19, s3
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov r6, s10
+; CHECK-NEXT:    vmov r4, s4
+; CHECK-NEXT:    vmov.f32 s23, s1
 ; CHECK-NEXT:    vmov q0[2], q0[0], r4, r0
-; CHECK-NEXT:    vmov r6, s16
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov q1[2], q1[0], r6, r2
-; CHECK-NEXT:    vmov r5, s17
-; CHECK-NEXT:    vmov q1[3], q1[1], r5, r3
-; CHECK-NEXT:    vmov r4, s13
-; CHECK-NEXT:    vmov q0[3], q0[1], r4, r0
-; CHECK-NEXT:    vadd.f16 q1, q6, q1
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vmov q1[2], q1[0], r3, r6
+; CHECK-NEXT:    vmov r5, s19
+; CHECK-NEXT:    vmov q1[3], q1[1], r2, r5
+; CHECK-NEXT:    vmov r4, s15
+; CHECK-NEXT:    vmov q0[3], q0[1], r0, r4
+; CHECK-NEXT:    vadd.f16 q1, q5, q1
 ; CHECK-NEXT:    vadd.f16 q0, q1, q0
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %l1 = load <48 x half>, <48 x half>* %src, align 4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
index 20b46e1a13b1..66f942ad981a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
@@ -390,84 +390,64 @@ define void @vld4_v8i16_align1(<32 x i16> *%src, <8 x i16> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vldrb.u8 q1, [r0]
-; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
-; CHECK-NEXT:    vldrb.u8 q2, [r0, #32]
-; CHECK-NEXT:    vldrb.u8 q3, [r0, #48]
-; CHECK-NEXT:    vmov.u16 r2, q1[3]
-; CHECK-NEXT:    vmov.16 q4[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[7]
-; CHECK-NEXT:    vmov.16 q4[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[3]
-; CHECK-NEXT:    vmov.16 q4[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[3]
-; CHECK-NEXT:    vmov.16 q5[4], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[7]
-; CHECK-NEXT:    vmov.16 q5[5], r2
+; CHECK-NEXT:    vldrb.u8 q3, [r0]
+; CHECK-NEXT:    vldrb.u8 q0, [r0, #32]
+; CHECK-NEXT:    vldrb.u8 q1, [r0, #48]
+; CHECK-NEXT:    vldrb.u8 q2, [r0, #16]
 ; CHECK-NEXT:    vmov.u16 r0, q3[3]
-; CHECK-NEXT:    vmov.16 q5[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[7]
-; CHECK-NEXT:    vmov.16 q5[7], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.16 q4[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.f32 s18, s22
-; CHECK-NEXT:    vmov.f32 s19, s23
+; CHECK-NEXT:    vmov.f32 s18, s1
 ; CHECK-NEXT:    vmov.16 q5[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.u16 r0, q3[7]
 ; CHECK-NEXT:    vmov.16 q5[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.u16 r0, q2[3]
 ; CHECK-NEXT:    vmov.16 q5[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vins.f16 s18, s3
 ; CHECK-NEXT:    vmov.16 q6[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[6]
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.f32 s19, s5
 ; CHECK-NEXT:    vmov.16 q6[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[2]
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vins.f16 s19, s7
 ; CHECK-NEXT:    vmov.16 q6[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[6]
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.f32 s16, s13
 ; CHECK-NEXT:    vmov.16 q6[7], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.u16 r0, q2[7]
 ; CHECK-NEXT:    vmov.16 q5[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.f32 s22, s26
-; CHECK-NEXT:    vmov.f32 s23, s27
-; CHECK-NEXT:    vadd.i16 q4, q5, q4
-; CHECK-NEXT:    vmov.16 q5[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.16 q5[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.16 q5[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[1]
-; CHECK-NEXT:    vmov.16 q6[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[5]
-; CHECK-NEXT:    vmov.16 q6[5], r0
+; CHECK-NEXT:    vins.f16 s16, s15
+; CHECK-NEXT:    vmov.f32 s17, s9
 ; CHECK-NEXT:    vmov.u16 r0, q3[1]
-; CHECK-NEXT:    vmov.16 q6[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[5]
-; CHECK-NEXT:    vmov.16 q6[7], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.16 q5[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
 ; CHECK-NEXT:    vmov.f32 s22, s26
+; CHECK-NEXT:    vins.f16 s17, s11
 ; CHECK-NEXT:    vmov.f32 s23, s27
 ; CHECK-NEXT:    vmov.16 q6[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vadd.i16 q4, q4, q5
+; CHECK-NEXT:    vmov.f64 d11, d0
+; CHECK-NEXT:    vmov.u16 r0, q3[5]
 ; CHECK-NEXT:    vmov.16 q6[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vmov.u16 r0, q2[1]
 ; CHECK-NEXT:    vmov.16 q6[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[0]
-; CHECK-NEXT:    vmov.16 q1[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[4]
-; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[0]
-; CHECK-NEXT:    vmov.16 q1[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[4]
-; CHECK-NEXT:    vmov.16 q1[7], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vins.f16 s22, s2
+; CHECK-NEXT:    vmov.f32 s23, s4
+; CHECK-NEXT:    vins.f16 s23, s6
+; CHECK-NEXT:    vmov.f32 s20, s12
+; CHECK-NEXT:    vins.f16 s20, s14
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vmov.f32 s21, s8
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
 ; CHECK-NEXT:    vmov.16 q6[3], r0
-; CHECK-NEXT:    vmov.f32 s26, s6
-; CHECK-NEXT:    vmov.f32 s27, s7
-; CHECK-NEXT:    vadd.i16 q0, q6, q5
+; CHECK-NEXT:    vins.f16 s21, s10
+; CHECK-NEXT:    vmov.f32 s26, s14
+; CHECK-NEXT:    vmov.f32 s27, s15
+; CHECK-NEXT:    vadd.i16 q0, q5, q6
 ; CHECK-NEXT:    vadd.i16 q0, q0, q4
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
@@ -1099,27 +1079,16 @@ define void @vld4_v2f16(<8 x half> *%src, <2 x half> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.u16 q0, [r0]
 ; CHECK-NEXT:    vmovx.f16 s4, s3
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmovx.f16 s4, s1
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov.16 q1[0], r2
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov.16 q1[1], r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov.16 q2[0], r2
-; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vadd.f16 q1, q2, q1
-; CHECK-NEXT:    vmovx.f16 s8, s0
-; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmovx.f16 s8, s1
+; CHECK-NEXT:    vins.f16 s1, s3
+; CHECK-NEXT:    vins.f16 s8, s4
+; CHECK-NEXT:    vmov.f32 s4, s1
+; CHECK-NEXT:    vmovx.f16 s12, s0
+; CHECK-NEXT:    vadd.f16 q1, q1, q2
 ; CHECK-NEXT:    vmovx.f16 s8, s2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q2[1], r2
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vmov.16 q0[1], r2
-; CHECK-NEXT:    vadd.f16 q0, q0, q2
+; CHECK-NEXT:    vins.f16 s12, s8
+; CHECK-NEXT:    vins.f16 s0, s2
+; CHECK-NEXT:    vadd.f16 q0, q0, q3
 ; CHECK-NEXT:    vadd.f16 q0, q0, q1
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    str r0, [r1]
@@ -1142,50 +1111,29 @@ define void @vld4_v4f16(<16 x half> *%src, <4 x half> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8}
 ; CHECK-NEXT:    vpush {d8}
-; CHECK-NEXT:    vldrh.u16 q1, [r0]
-; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    vmovx.f16 s12, s5
-; CHECK-NEXT:    vmov r3, s7
-; CHECK-NEXT:    vmov.16 q2[0], r2
-; CHECK-NEXT:    vmov.16 q2[1], r3
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmovx.f16 s12, s7
-; CHECK-NEXT:    vmovx.f16 s16, s1
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmov.16 q3[1], r2
-; CHECK-NEXT:    vmovx.f16 s16, s3
-; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmovx.f16 s16, s0
-; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vadd.f16 q2, q2, q3
-; CHECK-NEXT:    vmovx.f16 s12, s4
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmovx.f16 s12, s6
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.16 q3[1], r2
-; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    vldrh.u16 q0, [r0]
+; CHECK-NEXT:    vmovx.f16 s8, s1
+; CHECK-NEXT:    vins.f16 s1, s3
+; CHECK-NEXT:    vmovx.f16 s4, s3
+; CHECK-NEXT:    vmov.f32 s12, s1
+; CHECK-NEXT:    vins.f16 s8, s4
+; CHECK-NEXT:    vldrh.u16 q1, [r0, #16]
+; CHECK-NEXT:    vmovx.f16 s16, s7
+; CHECK-NEXT:    vmovx.f16 s9, s5
+; CHECK-NEXT:    vins.f16 s5, s7
+; CHECK-NEXT:    vins.f16 s9, s16
+; CHECK-NEXT:    vmov.f32 s13, s5
 ; CHECK-NEXT:    vmovx.f16 s16, s2
-; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov.16 q1[0], r0
-; CHECK-NEXT:    vmov.16 q1[1], r2
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q1[2], r0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.16 q1[3], r0
-; CHECK-NEXT:    vadd.f16 q0, q1, q3
+; CHECK-NEXT:    vadd.f16 q2, q3, q2
+; CHECK-NEXT:    vmovx.f16 s12, s0
+; CHECK-NEXT:    vins.f16 s12, s16
+; CHECK-NEXT:    vins.f16 s0, s2
+; CHECK-NEXT:    vmovx.f16 s16, s6
+; CHECK-NEXT:    vmovx.f16 s13, s4
+; CHECK-NEXT:    vins.f16 s4, s6
+; CHECK-NEXT:    vins.f16 s13, s16
+; CHECK-NEXT:    vmov.f32 s1, s4
+; CHECK-NEXT:    vadd.f16 q0, q0, q3
 ; CHECK-NEXT:    vadd.f16 q0, q0, q2
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    vmov r0, s0
@@ -1285,117 +1233,58 @@ entry:
 define void @vld4_v8f16_align1(<32 x half> *%src, <8 x half> *%dst) {
 ; CHECK-LABEL: vld4_v8f16_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #64
-; CHECK-NEXT:    sub sp, #64
-; CHECK-NEXT:    vldrb.u8 q6, [r0]
-; CHECK-NEXT:    vldrb.u8 q2, [r0, #16]
-; CHECK-NEXT:    vldrb.u8 q4, [r0, #32]
-; CHECK-NEXT:    vldrb.u8 q5, [r0, #48]
-; CHECK-NEXT:    vmov r2, s24
-; CHECK-NEXT:    vstrw.32 q2, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vmov.16 q0[0], r2
-; CHECK-NEXT:    vmov r3, s26
-; CHECK-NEXT:    vmov.16 q0[1], r3
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov.16 q0[2], r2
-; CHECK-NEXT:    vstrw.32 q0, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT:    vmovx.f16 s0, s19
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmovx.f16 s0, s17
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmovx.f16 s0, s21
-; CHECK-NEXT:    vmov.16 q1[4], r3
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q1[5], r2
-; CHECK-NEXT:    vmovx.f16 s0, s23
-; CHECK-NEXT:    vmov.16 q1[6], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmovx.f16 s0, s25
-; CHECK-NEXT:    vmov.16 q1[7], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmovx.f16 s0, s27
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmovx.f16 s0, s9
-; CHECK-NEXT:    vmov.16 q3[1], r2
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmovx.f16 s0, s11
-; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov r0, s17
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov r2, s19
-; CHECK-NEXT:    vmov.16 q0[5], r2
-; CHECK-NEXT:    vmov r0, s21
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov r0, s23
-; CHECK-NEXT:    vmov.16 q0[7], r0
-; CHECK-NEXT:    vmov r0, s25
-; CHECK-NEXT:    vmov.16 q7[0], r0
-; CHECK-NEXT:    vmov r2, s27
-; CHECK-NEXT:    vmov.16 q7[1], r2
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vmov.16 q7[2], r0
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmovx.f16 s0, s16
-; CHECK-NEXT:    vmov.16 q7[3], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmovx.f16 s0, s18
-; CHECK-NEXT:    vstrw.32 q1, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov.16 q1[4], r0
-; CHECK-NEXT:    vmovx.f16 s8, s20
-; CHECK-NEXT:    vmov.16 q1[5], r2
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmovx.f16 s8, s22
-; CHECK-NEXT:    vmov.16 q1[6], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmovx.f16 s8, s24
-; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov.16 q1[7], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmovx.f16 s8, s26
-; CHECK-NEXT:    vmov.16 q6[0], r0
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmovx.f16 s8, s0
-; CHECK-NEXT:    vmov.16 q6[1], r2
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmovx.f16 s8, s2
-; CHECK-NEXT:    vmov.16 q6[2], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vmov.16 q6[3], r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmov r2, s18
-; CHECK-NEXT:    vmov.16 q4[4], r0
-; CHECK-NEXT:    vmov.16 q4[5], r2
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vmov.16 q4[6], r0
-; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    vmov.16 q4[7], r0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q5, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.f32 s26, s6
-; CHECK-NEXT:    vmov.f32 s30, s2
-; CHECK-NEXT:    vmov.f32 s31, s3
-; CHECK-NEXT:    vmov q0, q2
-; CHECK-NEXT:    vmov.f32 s14, s22
-; CHECK-NEXT:    vmov.f32 s2, s18
-; CHECK-NEXT:    vmov.f32 s15, s23
-; CHECK-NEXT:    vmov.f32 s27, s7
-; CHECK-NEXT:    vadd.f16 q3, q7, q3
-; CHECK-NEXT:    vmov.f32 s3, s19
-; CHECK-NEXT:    vadd.f16 q0, q0, q6
-; CHECK-NEXT:    vadd.f16 q0, q0, q3
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14}
+; CHECK-NEXT:    vldrb.u8 q2, [r0, #32]
+; CHECK-NEXT:    vldrb.u8 q3, [r0, #48]
+; CHECK-NEXT:    vmovx.f16 s4, s11
+; CHECK-NEXT:    vmovx.f16 s2, s9
+; CHECK-NEXT:    vins.f16 s2, s4
+; CHECK-NEXT:    vmovx.f16 s4, s15
+; CHECK-NEXT:    vmovx.f16 s3, s13
+; CHECK-NEXT:    vins.f16 s9, s11
+; CHECK-NEXT:    vins.f16 s3, s4
+; CHECK-NEXT:    vldrb.u8 q1, [r0]
+; CHECK-NEXT:    vmovx.f16 s28, s10
+; CHECK-NEXT:    vmovx.f16 s26, s8
+; CHECK-NEXT:    vmovx.f16 s16, s7
+; CHECK-NEXT:    vmovx.f16 s0, s5
+; CHECK-NEXT:    vins.f16 s0, s16
+; CHECK-NEXT:    vldrb.u8 q4, [r0, #16]
+; CHECK-NEXT:    vins.f16 s13, s15
+; CHECK-NEXT:    vins.f16 s26, s28
+; CHECK-NEXT:    vmovx.f16 s20, s19
+; CHECK-NEXT:    vmovx.f16 s1, s17
+; CHECK-NEXT:    vins.f16 s1, s20
+; CHECK-NEXT:    vmov.f32 s22, s9
+; CHECK-NEXT:    vins.f16 s8, s10
+; CHECK-NEXT:    vmov.f32 s23, s13
+; CHECK-NEXT:    vmovx.f16 s28, s14
+; CHECK-NEXT:    vmovx.f16 s27, s12
+; CHECK-NEXT:    vmov.f32 s10, s8
+; CHECK-NEXT:    vins.f16 s12, s14
+; CHECK-NEXT:    vmov.f32 s11, s12
+; CHECK-NEXT:    vins.f16 s27, s28
+; CHECK-NEXT:    vins.f16 s5, s7
+; CHECK-NEXT:    vmovx.f16 s28, s6
+; CHECK-NEXT:    vmovx.f16 s24, s4
+; CHECK-NEXT:    vmov.f32 s20, s5
+; CHECK-NEXT:    vins.f16 s17, s19
+; CHECK-NEXT:    vins.f16 s24, s28
+; CHECK-NEXT:    vmov.f32 s21, s17
+; CHECK-NEXT:    vmovx.f16 s28, s18
+; CHECK-NEXT:    vmovx.f16 s25, s16
+; CHECK-NEXT:    vins.f16 s4, s6
+; CHECK-NEXT:    vins.f16 s16, s18
+; CHECK-NEXT:    vins.f16 s25, s28
+; CHECK-NEXT:    vmov.f32 s5, s16
+; CHECK-NEXT:    vadd.f16 q0, q5, q0
+; CHECK-NEXT:    vmov.f32 s6, s10
+; CHECK-NEXT:    vmov.f32 s7, s11
+; CHECK-NEXT:    vadd.f16 q1, q1, q6
+; CHECK-NEXT:    vadd.f16 q0, q1, q0
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    add sp, #64
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <32 x half>, <32 x half>* %src, align 1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll
index 7366284a7332..350b3b01472a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll
@@ -4,234 +4,129 @@
 define void @vldst4(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %numRows, i32 %numCols, i32 %scale.coerce) #0 {
 ; CHECK-LABEL: vldst4:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #80
-; CHECK-NEXT:    sub sp, #80
+; CHECK-NEXT:    .pad #24
+; CHECK-NEXT:    sub sp, #24
 ; CHECK-NEXT:    mul r12, r3, r2
 ; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:    cmp.w r2, r12, lsr #2
 ; CHECK-NEXT:    beq.w .LBB0_3
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
 ; CHECK-NEXT:    mvn r3, #7
-; CHECK-NEXT:    ldr r5, [sp, #160]
+; CHECK-NEXT:    ldr r2, [sp, #96]
 ; CHECK-NEXT:    and.w r3, r3, r12, lsr #2
 ; CHECK-NEXT:    sub.w r12, r3, #8
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    add.w lr, r3, r12, lsr #3
 ; CHECK-NEXT:  .LBB0_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.u16 q5, [r0, #32]
-; CHECK-NEXT:    vldrh.u16 q3, [r0, #48]
-; CHECK-NEXT:    vldrh.u16 q7, [r0], #64
-; CHECK-NEXT:    vmov r2, s20
-; CHECK-NEXT:    vmovx.f16 s8, s12
-; CHECK-NEXT:    vmov.16 q0[4], r2
-; CHECK-NEXT:    vmov r3, s22
-; CHECK-NEXT:    vmov.16 q0[5], r3
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov.16 q0[6], r2
-; CHECK-NEXT:    vmov r2, s28
-; CHECK-NEXT:    vldrh.u16 q6, [r0, #-48]
-; CHECK-NEXT:    vmov.16 q1[0], r2
-; CHECK-NEXT:    vmov r3, s30
-; CHECK-NEXT:    vmov.16 q1[1], r3
-; CHECK-NEXT:    vmov r2, s24
-; CHECK-NEXT:    vmov.16 q1[2], r2
-; CHECK-NEXT:    vmov r2, s14
-; CHECK-NEXT:    vmov.16 q0[7], r2
-; CHECK-NEXT:    vmov r2, s26
-; CHECK-NEXT:    vmov.16 q1[3], r2
-; CHECK-NEXT:    vmov.f32 s6, s2
-; CHECK-NEXT:    vmov.f32 s7, s3
-; CHECK-NEXT:    vmul.f16 q0, q1, r5
-; CHECK-NEXT:    vmovx.f16 s4, s24
-; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    vstrw.32 q0, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT:    vmovx.f16 s0, s30
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmovx.f16 s0, s28
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmov.16 q0[0], r4
-; CHECK-NEXT:    vmov.16 q0[1], r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmovx.f16 s4, s22
-; CHECK-NEXT:    vmov.16 q0[2], r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmovx.f16 s4, s20
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vmov.16 q1[4], r4
-; CHECK-NEXT:    vmov.16 q1[5], r2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmovx.f16 s8, s14
-; CHECK-NEXT:    vmov.16 q1[6], r2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmovx.f16 s8, s26
-; CHECK-NEXT:    vmov.16 q1[7], r2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov.16 q0[3], r2
-; CHECK-NEXT:    vmovx.f16 s8, s13
-; CHECK-NEXT:    vmov.f32 s2, s6
-; CHECK-NEXT:    vmov.f32 s3, s7
-; CHECK-NEXT:    vmov.16 q1[0], r3
-; CHECK-NEXT:    vmul.f16 q0, q0, r5
-; CHECK-NEXT:    vmov r3, s23
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vstrw.32 q0, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT:    vmovx.f16 s0, s19
-; CHECK-NEXT:    vmov.16 q1[1], r2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov.16 q1[4], r2
-; CHECK-NEXT:    vmov r2, s21
-; CHECK-NEXT:    vmov.16 q0[4], r2
-; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    vmov.16 q0[5], r3
-; CHECK-NEXT:    vmov r3, s29
-; CHECK-NEXT:    vstrw.32 q1, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vmov.16 q0[6], r2
-; CHECK-NEXT:    vmov r2, s31
-; CHECK-NEXT:    vmov.16 q1[0], r3
-; CHECK-NEXT:    vmov.16 q1[1], r2
-; CHECK-NEXT:    vmov r2, s25
-; CHECK-NEXT:    vmov.16 q1[2], r2
-; CHECK-NEXT:    vmov r2, s15
-; CHECK-NEXT:    vmov.16 q0[7], r2
-; CHECK-NEXT:    vmov r2, s27
-; CHECK-NEXT:    vmov.16 q1[3], r2
-; CHECK-NEXT:    vmov.f32 s6, s2
-; CHECK-NEXT:    vmov.f32 s7, s3
-; CHECK-NEXT:    vmovx.f16 s0, s31
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmovx.f16 s0, s29
-; CHECK-NEXT:    vmov r4, s0
-; CHECK-NEXT:    vmul.f16 q4, q1, r5
-; CHECK-NEXT:    vmov.16 q0[0], r4
-; CHECK-NEXT:    vmovx.f16 s4, s25
-; CHECK-NEXT:    vmov.16 q0[1], r2
-; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vldrh.u16 q3, [r0, #32]
+; CHECK-NEXT:    vldrh.u16 q5, [r0, #48]
+; CHECK-NEXT:    vldrh.u16 q4, [r0], #64
+; CHECK-NEXT:    vmovx.f16 s4, s15
+; CHECK-NEXT:    vmovx.f16 s2, s13
+; CHECK-NEXT:    vins.f16 s2, s4
 ; CHECK-NEXT:    vmovx.f16 s4, s23
-; CHECK-NEXT:    vmov.16 q0[2], r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmovx.f16 s4, s21
-; CHECK-NEXT:    vmov r4, s4
-; CHECK-NEXT:    vstrw.32 q4, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vmov.16 q1[4], r4
-; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    vmov.16 q1[5], r2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmovx.f16 s8, s15
-; CHECK-NEXT:    vmov.16 q1[6], r2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmovx.f16 s8, s27
-; CHECK-NEXT:    vmov.16 q1[7], r2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov.16 q0[3], r2
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s2, s6
-; CHECK-NEXT:    vmov.f32 s3, s7
-; CHECK-NEXT:    vmov.16 q1[2], r3
-; CHECK-NEXT:    vmul.f16 q6, q0, r5
-; CHECK-NEXT:    vmovx.f16 s0, s16
-; CHECK-NEXT:    vmov r2, s24
-; CHECK-NEXT:    vmov.16 q1[3], r2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmovx.f16 s0, s24
-; CHECK-NEXT:    vmov.16 q1[6], r2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmovx.f16 s0, s8
-; CHECK-NEXT:    vmov.16 q1[7], r2
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    vmov.16 q5[0], r2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmovx.f16 s0, s4
-; CHECK-NEXT:    vmov.16 q5[1], r3
-; CHECK-NEXT:    vmov r3, s25
-; CHECK-NEXT:    vmov.16 q5[4], r2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov.16 q5[5], r2
-; CHECK-NEXT:    vmov r2, s17
-; CHECK-NEXT:    vmov.16 q3[2], r2
+; CHECK-NEXT:    vmovx.f16 s3, s21
+; CHECK-NEXT:    vldrh.u16 q7, [r0, #-48]
+; CHECK-NEXT:    vins.f16 s3, s4
+; CHECK-NEXT:    vmovx.f16 s4, s19
 ; CHECK-NEXT:    vmovx.f16 s0, s17
-; CHECK-NEXT:    vmov.16 q3[3], r3
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmovx.f16 s0, s25
-; CHECK-NEXT:    vmov.16 q3[6], r2
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vins.f16 s13, s15
+; CHECK-NEXT:    vins.f16 s0, s4
+; CHECK-NEXT:    vmovx.f16 s4, s31
+; CHECK-NEXT:    vmovx.f16 s1, s29
+; CHECK-NEXT:    vins.f16 s21, s23
+; CHECK-NEXT:    vins.f16 s1, s4
+; CHECK-NEXT:    vins.f16 s17, s19
+; CHECK-NEXT:    vmul.f16 q1, q0, r2
+; CHECK-NEXT:    vmov.f32 s2, s13
+; CHECK-NEXT:    vmov.f32 s3, s21
+; CHECK-NEXT:    vins.f16 s29, s31
+; CHECK-NEXT:    vmov.f32 s0, s17
+; CHECK-NEXT:    vmov.f32 s1, s29
+; CHECK-NEXT:    vmul.f16 q2, q0, r2
+; CHECK-NEXT:    vmovx.f16 s0, s4
+; CHECK-NEXT:    vmovx.f16 s27, s8
+; CHECK-NEXT:    vins.f16 s8, s4
+; CHECK-NEXT:    vmov.f32 s25, s8
+; CHECK-NEXT:    vmovx.f16 s4, s7
+; CHECK-NEXT:    vins.f16 s27, s0
+; CHECK-NEXT:    vmovx.f16 s2, s12
+; CHECK-NEXT:    vstrw.32 q6, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmovx.f16 s24, s14
+; CHECK-NEXT:    vins.f16 s2, s24
+; CHECK-NEXT:    vmovx.f16 s24, s22
+; CHECK-NEXT:    vmovx.f16 s3, s20
+; CHECK-NEXT:    vins.f16 s12, s14
+; CHECK-NEXT:    vins.f16 s3, s24
+; CHECK-NEXT:    vmovx.f16 s24, s18
+; CHECK-NEXT:    vmovx.f16 s0, s16
+; CHECK-NEXT:    vins.f16 s20, s22
+; CHECK-NEXT:    vins.f16 s0, s24
+; CHECK-NEXT:    vmovx.f16 s24, s30
+; CHECK-NEXT:    vmovx.f16 s1, s28
+; CHECK-NEXT:    vins.f16 s16, s18
+; CHECK-NEXT:    vins.f16 s1, s24
+; CHECK-NEXT:    vins.f16 s28, s30
+; CHECK-NEXT:    vmul.f16 q6, q0, r2
+; CHECK-NEXT:    vmov.f32 s2, s12
+; CHECK-NEXT:    vmov.f32 s3, s20
+; CHECK-NEXT:    vmov.f32 s17, s28
+; CHECK-NEXT:    vmov.f32 s18, s2
+; CHECK-NEXT:    vmov.f32 s19, s3
+; CHECK-NEXT:    vmovx.f16 s2, s24
+; CHECK-NEXT:    vmul.f16 q5, q4, r2
+; CHECK-NEXT:    vmovx.f16 s0, s20
+; CHECK-NEXT:    vins.f16 s20, s24
+; CHECK-NEXT:    vins.f16 s0, s2
+; CHECK-NEXT:    vmov q3, q5
+; CHECK-NEXT:    vmov.f32 s14, s0
 ; CHECK-NEXT:    vmovx.f16 s0, s9
-; CHECK-NEXT:    vmov.16 q3[7], r2
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    vmov.16 q7[0], r2
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    vmov.16 q7[1], r3
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmovx.f16 s0, s5
-; CHECK-NEXT:    vmov.16 q7[4], r2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vmov.16 q7[5], r2
-; CHECK-NEXT:    vmov r3, s26
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmovx.f16 s0, s2
-; CHECK-NEXT:    vmov.16 q2[2], r2
-; CHECK-NEXT:    vmov q4, q1
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vmov.16 q2[3], r3
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vins.f16 s9, s5
+; CHECK-NEXT:    vmovx.f16 s2, s5
+; CHECK-NEXT:    vins.f16 s0, s2
+; CHECK-NEXT:    vmov q4, q2
+; CHECK-NEXT:    vmov.f32 s19, s0
+; CHECK-NEXT:    vmovx.f16 s31, s10
+; CHECK-NEXT:    vmovx.f16 s18, s21
+; CHECK-NEXT:    vins.f16 s21, s25
+; CHECK-NEXT:    vins.f16 s10, s6
+; CHECK-NEXT:    vmov.f32 s16, s21
+; CHECK-NEXT:    vmovx.f16 s0, s25
+; CHECK-NEXT:    vmov.f32 s29, s10
+; CHECK-NEXT:    vins.f16 s18, s0
+; CHECK-NEXT:    vmovx.f16 s0, s6
+; CHECK-NEXT:    vins.f16 s31, s0
 ; CHECK-NEXT:    vmovx.f16 s0, s26
-; CHECK-NEXT:    vmov.16 q2[6], r2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov.16 q2[7], r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov r3, s18
-; CHECK-NEXT:    vmov.16 q0[0], r2
-; CHECK-NEXT:    vmovx.f16 s4, s6
-; CHECK-NEXT:    vmov.16 q0[1], r3
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmovx.f16 s4, s18
-; CHECK-NEXT:    vldrw.u32 q4, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vmov.16 q0[4], r2
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov.16 q0[5], r2
-; CHECK-NEXT:    vmov r2, s19
-; CHECK-NEXT:    vmov r3, s27
-; CHECK-NEXT:    vmov.16 q1[2], r2
-; CHECK-NEXT:    vmovx.f16 s16, s19
-; CHECK-NEXT:    vmov.16 q1[3], r3
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vmovx.f16 s16, s27
-; CHECK-NEXT:    vmov.16 q1[6], r2
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vldrw.u32 q4, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    vmov.16 q1[7], r2
-; CHECK-NEXT:    vmov.f32 s1, s9
-; CHECK-NEXT:    vldrw.u32 q6, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmovx.f16 s16, s19
-; CHECK-NEXT:    vmov.f32 s3, s11
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vldrw.u32 q4, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s21, s25
-; CHECK-NEXT:    vstrh.16 q0, [r1, #32]
-; CHECK-NEXT:    vmov.16 q4[5], r2
-; CHECK-NEXT:    vmov.f32 s29, s13
-; CHECK-NEXT:    vmov q2, q4
-; CHECK-NEXT:    vmov.f32 s23, s27
-; CHECK-NEXT:    vmov.f32 s9, s5
-; CHECK-NEXT:    vmov.f32 s11, s7
-; CHECK-NEXT:    vstrh.16 q2, [r1, #48]
-; CHECK-NEXT:    vstrh.16 q5, [r1], #64
-; CHECK-NEXT:    vmov.f32 s31, s15
-; CHECK-NEXT:    vstrh.16 q7, [r1, #-48]
+; CHECK-NEXT:    vmovx.f16 s30, s22
+; CHECK-NEXT:    vins.f16 s22, s26
+; CHECK-NEXT:    vmov.f32 s28, s22
+; CHECK-NEXT:    vins.f16 s30, s0
+; CHECK-NEXT:    vmovx.f16 s3, s11
+; CHECK-NEXT:    vins.f16 s11, s7
+; CHECK-NEXT:    vstrh.16 q7, [r1, #32]
+; CHECK-NEXT:    vmov.f32 s1, s11
+; CHECK-NEXT:    vins.f16 s3, s4
+; CHECK-NEXT:    vmovx.f16 s4, s27
+; CHECK-NEXT:    vmovx.f16 s2, s23
+; CHECK-NEXT:    vins.f16 s23, s27
+; CHECK-NEXT:    vmov.f32 s0, s23
+; CHECK-NEXT:    vins.f16 s2, s4
+; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vstrh.16 q0, [r1, #48]
+; CHECK-NEXT:    vmov.f32 s13, s5
+; CHECK-NEXT:    vmov.f32 s15, s7
+; CHECK-NEXT:    vstrh.16 q3, [r1], #64
+; CHECK-NEXT:    vstrh.16 q4, [r1, #-48]
 ; CHECK-NEXT:    le lr, .LBB0_2
 ; CHECK-NEXT:  .LBB0_3: @ %while.end
-; CHECK-NEXT:    add sp, #80
+; CHECK-NEXT:    add sp, #24
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %tmp.0.extract.trunc = trunc i32 %scale.coerce to i16
   %l0 = bitcast i16 %tmp.0.extract.trunc to half

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
index bed6db9d54ff..1713798e62ea 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
@@ -393,45 +393,29 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @vmovn16_b2(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: vmovn16_b2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[0]
-; CHECK-NEXT:    vmov.16 q0[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[2]
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[4]
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[6]
-; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmovx.f16 s9, s5
+; CHECK-NEXT:    vins.f16 s9, s1
+; CHECK-NEXT:    vmovx.f16 s8, s4
+; CHECK-NEXT:    vins.f16 s8, s0
+; CHECK-NEXT:    vmovx.f16 s10, s6
+; CHECK-NEXT:    vins.f16 s10, s2
+; CHECK-NEXT:    vmovx.f16 s11, s7
+; CHECK-NEXT:    vins.f16 s11, s3
+; CHECK-NEXT:    vmov q0, q2
 ; CHECK-NEXT:    bx lr
 ;
 ; CHECKBE-LABEL: vmovn16_b2:
 ; CHECKBE:       @ %bb.0: @ %entry
-; CHECKBE-NEXT:    vrev64.16 q2, q1
-; CHECKBE-NEXT:    vrev64.16 q3, q0
-; CHECKBE-NEXT:    vmov.u16 r0, q2[1]
-; CHECKBE-NEXT:    vmov.16 q1[0], r0
-; CHECKBE-NEXT:    vmov.u16 r0, q3[0]
-; CHECKBE-NEXT:    vmov.16 q1[1], r0
-; CHECKBE-NEXT:    vmov.u16 r0, q2[3]
-; CHECKBE-NEXT:    vmov.16 q1[2], r0
-; CHECKBE-NEXT:    vmov.u16 r0, q3[2]
-; CHECKBE-NEXT:    vmov.16 q1[3], r0
-; CHECKBE-NEXT:    vmov.u16 r0, q2[5]
-; CHECKBE-NEXT:    vmov.16 q1[4], r0
-; CHECKBE-NEXT:    vmov.u16 r0, q3[4]
-; CHECKBE-NEXT:    vmov.16 q1[5], r0
-; CHECKBE-NEXT:    vmov.u16 r0, q2[7]
-; CHECKBE-NEXT:    vmov.16 q1[6], r0
-; CHECKBE-NEXT:    vmov.u16 r0, q3[6]
-; CHECKBE-NEXT:    vmov.16 q1[7], r0
+; CHECKBE-NEXT:    vrev64.16 q2, q0
+; CHECKBE-NEXT:    vrev64.16 q0, q1
+; CHECKBE-NEXT:    vmovx.f16 s5, s1
+; CHECKBE-NEXT:    vins.f16 s5, s9
+; CHECKBE-NEXT:    vmovx.f16 s4, s0
+; CHECKBE-NEXT:    vins.f16 s4, s8
+; CHECKBE-NEXT:    vmovx.f16 s6, s2
+; CHECKBE-NEXT:    vins.f16 s6, s10
+; CHECKBE-NEXT:    vmovx.f16 s7, s3
+; CHECKBE-NEXT:    vins.f16 s7, s11
 ; CHECKBE-NEXT:    vrev64.16 q0, q1
 ; CHECKBE-NEXT:    bx lr
 entry:
@@ -442,46 +426,30 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @vmovn16_b3(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: vmovn16_b3:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
 ; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.16 q0[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[3]
-; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[5]
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[7]
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmovx.f16 s1, s9
+; CHECK-NEXT:    vins.f16 s1, s5
+; CHECK-NEXT:    vmovx.f16 s0, s8
+; CHECK-NEXT:    vins.f16 s0, s4
+; CHECK-NEXT:    vmovx.f16 s2, s10
+; CHECK-NEXT:    vins.f16 s2, s6
+; CHECK-NEXT:    vmovx.f16 s3, s11
+; CHECK-NEXT:    vins.f16 s3, s7
 ; CHECK-NEXT:    bx lr
 ;
 ; CHECKBE-LABEL: vmovn16_b3:
 ; CHECKBE:       @ %bb.0: @ %entry
 ; CHECKBE-NEXT:    vrev64.16 q3, q0
+; CHECKBE-NEXT:    vrev64.16 q2, q1
+; CHECKBE-NEXT:    vmovx.f16 s5, s13
+; CHECKBE-NEXT:    vins.f16 s5, s9
+; CHECKBE-NEXT:    vmovx.f16 s4, s12
+; CHECKBE-NEXT:    vins.f16 s4, s8
+; CHECKBE-NEXT:    vmovx.f16 s6, s14
+; CHECKBE-NEXT:    vins.f16 s6, s10
+; CHECKBE-NEXT:    vmovx.f16 s7, s15
+; CHECKBE-NEXT:    vins.f16 s7, s11
 ; CHECKBE-NEXT:    vrev64.16 q0, q1
-; CHECKBE-NEXT:    vmov.u16 r0, q3[1]
-; CHECKBE-NEXT:    vmov.16 q2[0], r0
-; CHECKBE-NEXT:    vmov.u16 r0, q0[0]
-; CHECKBE-NEXT:    vmov.16 q2[1], r0
-; CHECKBE-NEXT:    vmov.u16 r0, q3[3]
-; CHECKBE-NEXT:    vmov.16 q2[2], r0
-; CHECKBE-NEXT:    vmov.u16 r0, q0[2]
-; CHECKBE-NEXT:    vmov.16 q2[3], r0
-; CHECKBE-NEXT:    vmov.u16 r0, q3[5]
-; CHECKBE-NEXT:    vmov.16 q2[4], r0
-; CHECKBE-NEXT:    vmov.u16 r0, q0[4]
-; CHECKBE-NEXT:    vmov.16 q2[5], r0
-; CHECKBE-NEXT:    vmov.u16 r0, q3[7]
-; CHECKBE-NEXT:    vmov.16 q2[6], r0
-; CHECKBE-NEXT:    vmov.u16 r0, q0[6]
-; CHECKBE-NEXT:    vmov.16 q2[7], r0
-; CHECKBE-NEXT:    vrev64.16 q0, q2
 ; CHECKBE-NEXT:    bx lr
 entry:
   %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> <i32 1, i32 8, i32 3, i32 10, i32 5, i32 12, i32 7, i32 14>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll b/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll
index aba29b4e5e48..e4ae6e5dff3a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll
@@ -314,22 +314,14 @@ entry:
 define arm_aapcs_vfpcc void @vmovn16_b2(<8 x i16> %src1, <8 x i16> %src2, <8 x i16> *%dest) {
 ; CHECK-LABEL: vmovn16_b2:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r1, q1[1]
-; CHECK-NEXT:    vmov.16 q2[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmov.16 q2[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[3]
-; CHECK-NEXT:    vmov.16 q2[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    vmov.16 q2[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[5]
-; CHECK-NEXT:    vmov.16 q2[4], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    vmov.16 q2[5], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[7]
-; CHECK-NEXT:    vmov.16 q2[6], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    vmov.16 q2[7], r1
+; CHECK-NEXT:    vmovx.f16 s9, s5
+; CHECK-NEXT:    vins.f16 s9, s1
+; CHECK-NEXT:    vmovx.f16 s8, s4
+; CHECK-NEXT:    vins.f16 s8, s0
+; CHECK-NEXT:    vmovx.f16 s10, s6
+; CHECK-NEXT:    vins.f16 s10, s2
+; CHECK-NEXT:    vmovx.f16 s11, s7
+; CHECK-NEXT:    vins.f16 s11, s3
 ; CHECK-NEXT:    vstrw.32 q2, [r0]
 ; CHECK-NEXT:    bx lr
 entry:
@@ -341,22 +333,14 @@ entry:
 define arm_aapcs_vfpcc void @vmovn16_b3(<8 x i16> %src1, <8 x i16> %src2, <8 x i16> *%dest) {
 ; CHECK-LABEL: vmovn16_b3:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov.16 q2[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[0]
-; CHECK-NEXT:    vmov.16 q2[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    vmov.16 q2[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[2]
-; CHECK-NEXT:    vmov.16 q2[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    vmov.16 q2[4], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[4]
-; CHECK-NEXT:    vmov.16 q2[5], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
-; CHECK-NEXT:    vmov.16 q2[6], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[6]
-; CHECK-NEXT:    vmov.16 q2[7], r1
+; CHECK-NEXT:    vmovx.f16 s9, s1
+; CHECK-NEXT:    vins.f16 s9, s5
+; CHECK-NEXT:    vmovx.f16 s8, s0
+; CHECK-NEXT:    vins.f16 s8, s4
+; CHECK-NEXT:    vmovx.f16 s10, s2
+; CHECK-NEXT:    vins.f16 s10, s6
+; CHECK-NEXT:    vmovx.f16 s11, s3
+; CHECK-NEXT:    vins.f16 s11, s7
 ; CHECK-NEXT:    vstrw.32 q2, [r0]
 ; CHECK-NEXT:    bx lr
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst2.ll b/llvm/test/CodeGen/Thumb2/mve-vst2.ll
index 60e3c6c10279..53cf18ce04f1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst2.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst2.ll
@@ -208,40 +208,32 @@ define void @vst2_v8i16_align1(<8 x i16> *%src, <16 x i16> *%dst) {
 ; CHECK-LABEL: vst2_v8i16_align1:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT:    vmov.u16 r2, q1[4]
-; CHECK-NEXT:    vmov.u16 r0, q2[4]
-; CHECK-NEXT:    vmov.16 q0[0], r2
-; CHECK-NEXT:    vmov.16 q0[1], r0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmov.f64 d4, d3
 ; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[5]
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[6]
-; CHECK-NEXT:    vmov.16 q0[5], r0
+; CHECK-NEXT:    vins.f16 s8, s2
+; CHECK-NEXT:    vmov.f64 d6, d2
+; CHECK-NEXT:    vmov.16 q2[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vmov.16 q2[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[7]
-; CHECK-NEXT:    vmov.16 q0[7], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[0]
-; CHECK-NEXT:    vmov.16 q3[1], r0
+; CHECK-NEXT:    vmov.f32 s10, s7
+; CHECK-NEXT:    vins.f16 s12, s0
+; CHECK-NEXT:    vins.f16 s10, s3
+; CHECK-NEXT:    vmov.16 q2[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.16 q2[7], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[1]
 ; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[1]
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
 ; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[2]
-; CHECK-NEXT:    vmov.16 q3[5], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.f32 s14, s5
+; CHECK-NEXT:    vstrb.8 q2, [r1, #16]
+; CHECK-NEXT:    vins.f16 s14, s1
 ; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[3]
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
 ; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vstrb.8 q0, [r1, #16]
 ; CHECK-NEXT:    vstrb.8 q3, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
@@ -522,20 +514,14 @@ define void @vst2_v2f16(<2 x half> *%src, <4 x half> *%dst) {
 ; CHECK-LABEL: vst2_v2f16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    ldrd r2, r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmov.32 q1[0], r2
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmovx.f16 s5, s4
+; CHECK-NEXT:    vins.f16 s4, s0
 ; CHECK-NEXT:    vmovx.f16 s0, s0
+; CHECK-NEXT:    vins.f16 s5, s0
 ; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.16 q2[0], r2
-; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmovx.f16 s0, s4
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov r2, s9
+; CHECK-NEXT:    vmov r2, s5
 ; CHECK-NEXT:    str r0, [r1]
 ; CHECK-NEXT:    str r2, [r1, #4]
 ; CHECK-NEXT:    bx lr
@@ -552,33 +538,24 @@ entry:
 define void @vst2_v4f16(<4 x half> *%src, <8 x half> *%dst) {
 ; CHECK-LABEL: vst2_v4f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    ldm.w r0, {r2, r3, r12}
-; CHECK-NEXT:    vmov.32 q0[0], r12
-; CHECK-NEXT:    ldr r0, [r0, #12]
-; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov.32 q2[1], r3
+; CHECK-NEXT:    ldrd r2, r12, [r0]
+; CHECK-NEXT:    ldrd r3, r0, [r0, #8]
+; CHECK-NEXT:    vmov.32 q1[0], r2
+; CHECK-NEXT:    vmov.32 q0[0], r3
+; CHECK-NEXT:    vmov.32 q1[1], r12
 ; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmovx.f16 s12, s8
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q1[0], r2
-; CHECK-NEXT:    vmov.16 q1[1], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmovx.f16 s12, s0
-; CHECK-NEXT:    vmov.16 q1[2], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmovx.f16 s8, s9
-; CHECK-NEXT:    vmov.16 q1[3], r0
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov.16 q1[4], r0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmovx.f16 s12, s4
+; CHECK-NEXT:    vmovx.f16 s8, s0
+; CHECK-NEXT:    vins.f16 s4, s0
+; CHECK-NEXT:    vins.f16 s12, s8
+; CHECK-NEXT:    vmov q2, q1
+; CHECK-NEXT:    vmov.f32 s9, s12
 ; CHECK-NEXT:    vmovx.f16 s0, s1
-; CHECK-NEXT:    vmov.16 q1[6], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q1[7], r0
-; CHECK-NEXT:    vstrh.16 q1, [r1]
+; CHECK-NEXT:    vmovx.f16 s11, s5
+; CHECK-NEXT:    vins.f16 s5, s1
+; CHECK-NEXT:    vmov.f32 s10, s5
+; CHECK-NEXT:    vins.f16 s11, s0
+; CHECK-NEXT:    vstrh.16 q2, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
@@ -633,49 +610,30 @@ entry:
 define void @vst2_v8f16_align1(<8 x half> *%src, <16 x half> *%dst) {
 ; CHECK-LABEL: vst2_v8f16_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmovx.f16 s12, s6
-; CHECK-NEXT:    vmov.16 q2[0], r2
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmovx.f16 s9, s6
+; CHECK-NEXT:    vins.f16 s6, s2
+; CHECK-NEXT:    vmov.f32 s8, s6
 ; CHECK-NEXT:    vmovx.f16 s12, s2
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmovx.f16 s12, s7
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vins.f16 s9, s12
 ; CHECK-NEXT:    vmovx.f16 s12, s3
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s11, s7
+; CHECK-NEXT:    vins.f16 s7, s3
+; CHECK-NEXT:    vmov.f32 s10, s7
+; CHECK-NEXT:    vins.f16 s11, s12
 ; CHECK-NEXT:    vmovx.f16 s12, s4
-; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov r2, s4
 ; CHECK-NEXT:    vstrb.8 q2, [r1, #16]
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q2[0], r2
-; CHECK-NEXT:    vmovx.f16 s4, s5
-; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmovx.f16 s12, s0
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s8, s0
+; CHECK-NEXT:    vins.f16 s4, s0
+; CHECK-NEXT:    vins.f16 s12, s8
+; CHECK-NEXT:    vmov q2, q1
 ; CHECK-NEXT:    vmovx.f16 s0, s1
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vmov.f32 s9, s12
+; CHECK-NEXT:    vmovx.f16 s11, s5
+; CHECK-NEXT:    vins.f16 s5, s1
+; CHECK-NEXT:    vmov.f32 s10, s5
+; CHECK-NEXT:    vins.f16 s11, s0
 ; CHECK-NEXT:    vstrb.8 q2, [r1]
 ; CHECK-NEXT:    bx lr
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
index 9fcdc6385453..855f199d4a68 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -385,78 +385,74 @@ define void @vst3_v8i16(<8 x i16> *%src, <24 x i16> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #16]
-; CHECK-NEXT:    vmov.u16 r2, q1[2]
-; CHECK-NEXT:    vmov.16 q0[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[3]
-; CHECK-NEXT:    vmov.16 q0[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[4]
-; CHECK-NEXT:    vmov.16 q0[6], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[5]
-; CHECK-NEXT:    vmov.16 q0[7], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[0]
-; CHECK-NEXT:    vmov.16 q3[0], r2
-; CHECK-NEXT:    vmov.u16 r0, q4[0]
-; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[1]
-; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[2]
-; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[2]
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vmov.f64 d8, d2
+; CHECK-NEXT:    vmov.u16 r0, q3[2]
+; CHECK-NEXT:    vmov.16 q0[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-NEXT:    vmov.16 q0[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[4]
+; CHECK-NEXT:    vmov.16 q0[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vins.f16 s16, s8
+; CHECK-NEXT:    vmov.u16 r0, q2[1]
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov.16 q4[4], r0
+; CHECK-NEXT:    vins.f16 s5, s9
+; CHECK-NEXT:    vmov.f32 s19, s5
 ; CHECK-NEXT:    vdup.32 q5, r2
-; CHECK-NEXT:    vmov.f32 s13, s8
+; CHECK-NEXT:    vmov.f32 s17, s4
 ; CHECK-NEXT:    vmov.u16 r2, q5[2]
-; CHECK-NEXT:    vmov.u16 r0, q3[3]
 ; CHECK-NEXT:    vmov.16 q6[2], r2
-; CHECK-NEXT:    vmov r2, s11
+; CHECK-NEXT:    vmov.u16 r0, q4[3]
 ; CHECK-NEXT:    vmov.16 q6[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[4]
+; CHECK-NEXT:    vmov.u16 r0, q4[4]
 ; CHECK-NEXT:    vmov.16 q6[4], r0
 ; CHECK-NEXT:    vmov.u16 r0, q5[5]
 ; CHECK-NEXT:    vmov.16 q6[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[5]
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
 ; CHECK-NEXT:    vmov.16 q5[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.u16 r0, q3[5]
 ; CHECK-NEXT:    vmov.16 q5[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[6]
+; CHECK-NEXT:    vmov.u16 r0, q2[6]
 ; CHECK-NEXT:    vmov.16 q5[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[7]
-; CHECK-NEXT:    vmov.f32 s13, s25
+; CHECK-NEXT:    vmov.u16 r0, q2[7]
+; CHECK-NEXT:    vmov.f32 s17, s25
 ; CHECK-NEXT:    vmov.16 q5[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.f32 s14, s26
+; CHECK-NEXT:    vmov.u16 r0, q3[7]
+; CHECK-NEXT:    vmov r2, s7
+; CHECK-NEXT:    vmov.f32 s18, s26
 ; CHECK-NEXT:    vmov.16 q5[7], r0
 ; CHECK-NEXT:    vdup.32 q6, r2
-; CHECK-NEXT:    vmov.f32 s1, s5
+; CHECK-NEXT:    vmov.f32 s22, s15
 ; CHECK-NEXT:    vmov.u16 r2, q6[2]
-; CHECK-NEXT:    vmov.f32 s22, s7
-; CHECK-NEXT:    vrev32.16 q4, q4
-; CHECK-NEXT:    vmov.16 q7[2], r2
 ; CHECK-NEXT:    vmov.u16 r0, q5[3]
-; CHECK-NEXT:    vmov.u16 r2, q4[2]
-; CHECK-NEXT:    vmov.f32 s2, s10
+; CHECK-NEXT:    vmov.16 q7[2], r2
+; CHECK-NEXT:    vmov.f32 s1, s13
 ; CHECK-NEXT:    vmov.16 q7[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q5[4]
-; CHECK-NEXT:    vstrw.32 q3, [r1]
-; CHECK-NEXT:    vmov.16 q3[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    vrev32.16 q2, q2
 ; CHECK-NEXT:    vmov.16 q7[4], r0
 ; CHECK-NEXT:    vmov.u16 r0, q6[5]
-; CHECK-NEXT:    vmov.16 q3[3], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[4]
+; CHECK-NEXT:    vmov.u16 r2, q2[2]
+; CHECK-NEXT:    vmov.f32 s2, s6
 ; CHECK-NEXT:    vmov.16 q7[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[5]
-; CHECK-NEXT:    vmov.16 q3[4], r2
-; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmov.16 q2[2], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[3]
 ; CHECK-NEXT:    vmov.f32 s21, s29
-; CHECK-NEXT:    vmov.f32 s1, s13
+; CHECK-NEXT:    vmov.16 q2[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[4]
+; CHECK-NEXT:    vmov.16 q2[4], r2
 ; CHECK-NEXT:    vmov.f32 s22, s30
-; CHECK-NEXT:    vmov.f32 s2, s14
+; CHECK-NEXT:    vmov.16 q2[5], r0
 ; CHECK-NEXT:    vstrw.32 q5, [r1, #32]
+; CHECK-NEXT:    vmov.f32 s1, s9
+; CHECK-NEXT:    vstrw.32 q4, [r1]
+; CHECK-NEXT:    vmov.f32 s2, s10
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
@@ -482,181 +478,173 @@ define void @vst3_v16i16(<16 x i16> *%src, <48 x i16> *%dst) {
 ; CHECK-NEXT:    .pad #160
 ; CHECK-NEXT:    sub sp, #160
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
-; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT:    vmov.f64 d4, d0
+; CHECK-NEXT:    vmov.u16 r2, q7[1]
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #128] @ 16-byte Spill
-; CHECK-NEXT:    vmov.16 q2[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[0]
-; CHECK-NEXT:    vmov.16 q2[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[1]
+; CHECK-NEXT:    vstrw.32 q1, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vins.f16 s8, s28
 ; CHECK-NEXT:    vmov.16 q2[4], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    vmov.16 q2[6], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[2]
-; CHECK-NEXT:    vmov.16 q2[7], r2
-; CHECK-NEXT:    vmov q7, q1
-; CHECK-NEXT:    vmov.f32 s9, s0
+; CHECK-NEXT:    vmov.f32 s11, s1
+; CHECK-NEXT:    vins.f16 s11, s29
+; CHECK-NEXT:    vmov q4, q2
+; CHECK-NEXT:    vmov.f32 s17, s0
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
-; CHECK-NEXT:    vmov.u16 r2, q2[3]
-; CHECK-NEXT:    vmov q3, q2
+; CHECK-NEXT:    vmov.u16 r2, q4[3]
+; CHECK-NEXT:    vstrw.32 q4, [sp, #80] @ 16-byte Spill
 ; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vmov q2, q0
 ; CHECK-NEXT:    vdup.32 q0, r3
-; CHECK-NEXT:    vstrw.32 q2, [sp, #96] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.u16 r3, q0[2]
-; CHECK-NEXT:    vstrw.32 q7, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vmov.16 q2[2], r3
-; CHECK-NEXT:    vmov.16 q2[3], r2
-; CHECK-NEXT:    vmov.u16 r2, q3[4]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT:    vmov.16 q2[4], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[5]
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
-; CHECK-NEXT:    vmov.16 q2[5], r2
-; CHECK-NEXT:    vmov.u16 r2, q3[5]
-; CHECK-NEXT:    vmov.16 q5[0], r2
+; CHECK-NEXT:    vmov.16 q3[2], r3
+; CHECK-NEXT:    vmov.16 q3[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q4[4]
+; CHECK-NEXT:    vmov.16 q3[4], r2
 ; CHECK-NEXT:    vmov.u16 r2, q0[5]
-; CHECK-NEXT:    vmov.16 q5[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q3[6]
-; CHECK-NEXT:    vmov.16 q5[3], r2
-; CHECK-NEXT:    vmov.u16 r2, q3[7]
-; CHECK-NEXT:    vmov.16 q5[6], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[7]
-; CHECK-NEXT:    vmov.16 q5[7], r2
-; CHECK-NEXT:    vstrw.32 q0, [sp, #144] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s22, s3
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vstrw.32 q2, [sp, #112] @ 16-byte Spill
-; CHECK-NEXT:    vmov.u16 r2, q5[3]
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov q2, q0
+; CHECK-NEXT:    vmov.16 q3[5], r2
+; CHECK-NEXT:    vmov.u16 r2, q1[1]
+; CHECK-NEXT:    vstrw.32 q3, [sp, #96] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f64 d12, d0
+; CHECK-NEXT:    vstrw.32 q0, [sp, #144] @ 16-byte Spill
+; CHECK-NEXT:    vins.f16 s24, s4
+; CHECK-NEXT:    vmov.16 q6[4], r2
+; CHECK-NEXT:    vmov.f32 s27, s1
+; CHECK-NEXT:    vins.f16 s27, s5
+; CHECK-NEXT:    vmov.f32 s25, s0
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT:    vmov.u16 r2, q6[3]
+; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov q5, q0
 ; CHECK-NEXT:    vdup.32 q0, r0
-; CHECK-NEXT:    vstrw.32 q2, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q5, [sp, #112] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.16 q4[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q5[4]
-; CHECK-NEXT:    vmov.16 q4[3], r2
-; CHECK-NEXT:    vmov.16 q4[4], r0
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q6[4]
+; CHECK-NEXT:    vmov.16 q3[3], r2
+; CHECK-NEXT:    vmov.16 q3[4], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.16 q4[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[0]
-; CHECK-NEXT:    vmov.16 q6[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[0]
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.16 q4[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[5]
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #144] @ 16-byte Reload
-; CHECK-NEXT:    vmov.16 q6[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[1]
-; CHECK-NEXT:    vstrw.32 q4, [sp, #80] @ 16-byte Spill
-; CHECK-NEXT:    vmov.16 q6[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[2]
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov.16 q6[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[2]
+; CHECK-NEXT:    vmov.16 q4[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vstrw.32 q3, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT:    vmov.16 q4[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov.16 q4[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[7]
 ; CHECK-NEXT:    vdup.32 q0, r2
-; CHECK-NEXT:    vmov.16 q6[7], r0
+; CHECK-NEXT:    vmov.16 q4[7], r0
 ; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    vmov.f32 s25, s8
-; CHECK-NEXT:    vmov.16 q2[2], r2
-; CHECK-NEXT:    vmov.u16 r0, q6[3]
-; CHECK-NEXT:    vmov q4, q1
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q6[4]
-; CHECK-NEXT:    vmov.16 q2[4], r0
+; CHECK-NEXT:    vmov.f32 s18, s23
+; CHECK-NEXT:    vmov.16 q1[2], r2
+; CHECK-NEXT:    vmov.u16 r0, q4[3]
+; CHECK-NEXT:    vmov q3, q2
+; CHECK-NEXT:    vmov.16 q1[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[4]
+; CHECK-NEXT:    vmov.16 q1[4], r0
 ; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.16 q2[5], r0
+; CHECK-NEXT:    vmov.16 q1[5], r0
 ; CHECK-NEXT:    vmov.u16 r0, q7[5]
-; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.16 q0[1], r0
+; CHECK-NEXT:    vstrw.32 q1, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vmov.16 q1[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vstrw.32 q3, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.16 q1[1], r0
 ; CHECK-NEXT:    vmov.u16 r0, q7[6]
-; CHECK-NEXT:    vmov.16 q0[3], r0
+; CHECK-NEXT:    vmov.16 q1[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q7[7]
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #128] @ 16-byte Reload
-; CHECK-NEXT:    vmov.16 q0[7], r0
-; CHECK-NEXT:    vmov.f32 s2, s19
-; CHECK-NEXT:    vstrw.32 q2, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vdup.32 q7, r2
-; CHECK-NEXT:    vrev32.16 q3, q3
-; CHECK-NEXT:    vmov.u16 r2, q7[2]
-; CHECK-NEXT:    vstrw.32 q3, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vmov.16 q2[2], r2
-; CHECK-NEXT:    vstrw.32 q4, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q7[5]
-; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[2]
-; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov.16 q1[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[7]
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #128] @ 16-byte Reload
+; CHECK-NEXT:    vmov.16 q1[7], r0
+; CHECK-NEXT:    vmov.f32 s6, s15
+; CHECK-NEXT:    vmov r2, s11
 ; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[4]
-; CHECK-NEXT:    vldrw.u32 q7, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #144] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q4, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vrev32.16 q7, q7
-; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vstrw.32 q7, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vdup.32 q0, r2
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    vmov.16 q5[2], r2
+; CHECK-NEXT:    vmov.16 q5[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vmov.16 q5[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vmov.16 q5[5], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[2]
+; CHECK-NEXT:    vrev32.16 q0, q7
 ; CHECK-NEXT:    vmov.16 q7[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[3]
-; CHECK-NEXT:    vmov.f32 s1, s9
+; CHECK-NEXT:    vmov.u16 r0, q2[3]
+; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.16 q7[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s2, s10
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vmov.u16 r0, q3[4]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.16 q7[6], r0
-; CHECK-NEXT:    vmov.f32 s25, s5
-; CHECK-NEXT:    vmov.u16 r0, q4[5]
-; CHECK-NEXT:    vmov.f32 s26, s6
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q0, [r1, #80]
-; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s21, s5
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #144] @ 16-byte Reload
+; CHECK-NEXT:    vrev32.16 q0, q0
 ; CHECK-NEXT:    vmov.16 q7[7], r0
-; CHECK-NEXT:    vmov.f32 s22, s6
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.16 q0[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[3]
+; CHECK-NEXT:    vmov.f32 s5, s21
+; CHECK-NEXT:    vmov.16 q0[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[4]
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    vmov.16 q0[6], r0
+; CHECK-NEXT:    vmov.u16 r0, q3[5]
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s25, s9
+; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmov.f32 s26, s10
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s6, s22
+; CHECK-NEXT:    vstrw.32 q6, [r1]
+; CHECK-NEXT:    vmov.f32 s13, s9
+; CHECK-NEXT:    vstrw.32 q1, [r1, #80]
+; CHECK-NEXT:    vmov.f32 s14, s10
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q3, [r1, #48]
+; CHECK-NEXT:    vmov.f32 s17, s9
+; CHECK-NEXT:    vmov.f32 s18, s10
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT:    vmov.u16 r2, q1[2]
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.f32 s1, s9
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #144] @ 16-byte Reload
+; CHECK-NEXT:    vmov.16 q1[2], r2
+; CHECK-NEXT:    vstrw.32 q4, [r1, #32]
+; CHECK-NEXT:    vmov.f32 s2, s10
+; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    vmov.16 q1[3], r2
+; CHECK-NEXT:    vmov.u16 r2, q0[4]
+; CHECK-NEXT:    vmov.16 q1[4], r2
+; CHECK-NEXT:    vmov.16 q1[5], r0
+; CHECK-NEXT:    vmov.f32 s1, s5
+; CHECK-NEXT:    vmov.f32 s2, s6
+; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s29, s5
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #128] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.u16 r2, q0[2]
 ; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.f32 s9, s5
+; CHECK-NEXT:    vmov.f32 s30, s6
 ; CHECK-NEXT:    vmov.16 q0[2], r2
-; CHECK-NEXT:    vmov.f32 s10, s6
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #144] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q5, [r1, #32]
-; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
-; CHECK-NEXT:    vmov.f32 s29, s5
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s30, s18
-; CHECK-NEXT:    vstrw.32 q6, [r1]
 ; CHECK-NEXT:    vmov.u16 r2, q7[3]
-; CHECK-NEXT:    vmov.f32 s13, s5
 ; CHECK-NEXT:    vmov.16 q0[3], r2
 ; CHECK-NEXT:    vmov.u16 r2, q7[4]
 ; CHECK-NEXT:    vmov.16 q0[4], r2
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #128] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.16 q0[5], r0
 ; CHECK-NEXT:    vmov.f32 s29, s1
 ; CHECK-NEXT:    vmov.f32 s30, s2
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s14, s6
-; CHECK-NEXT:    vstrw.32 q7, [r1, #16]
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.16 q0[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q3[3]
-; CHECK-NEXT:    vmov.16 q0[3], r2
-; CHECK-NEXT:    vmov.u16 r2, q3[4]
-; CHECK-NEXT:    vmov.16 q0[4], r2
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov.f32 s13, s1
-; CHECK-NEXT:    vmov.f32 s14, s2
-; CHECK-NEXT:    vstrw.32 q3, [r1, #64]
+; CHECK-NEXT:    vstrw.32 q7, [r1, #64]
 ; CHECK-NEXT:    add sp, #160
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
@@ -1417,27 +1405,20 @@ entry:
 define void @vst3_v2f16(<2 x half> *%src, <6 x half> *%dst) {
 ; CHECK-LABEL: vst3_v2f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldmia r0, {s4, s5}
-; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vldmia r0, {s0, s1}
 ; CHECK-NEXT:    ldr r0, [r0, #8]
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmovx.f16 s12, s4
-; CHECK-NEXT:    vmov.16 q0[0], r3
+; CHECK-NEXT:    vmovx.f16 s4, s0
+; CHECK-NEXT:    vins.f16 s0, s1
 ; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.16 q0[1], r2
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov q3, q0
+; CHECK-NEXT:    vmovx.f16 s6, s8
+; CHECK-NEXT:    vins.f16 s8, s4
+; CHECK-NEXT:    vmov.f32 s13, s8
+; CHECK-NEXT:    vmovx.f16 s14, s1
+; CHECK-NEXT:    vins.f16 s14, s6
 ; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmovx.f16 s4, s5
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmovx.f16 s4, s8
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    vmov r2, s13
+; CHECK-NEXT:    vmov r3, s14
 ; CHECK-NEXT:    stm r1!, {r0, r2, r3}
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1463,42 +1444,28 @@ define void @vst3_v4f16(<4 x half> *%src, <12 x half> *%dst) {
 ; CHECK-NEXT:    ldrd r3, r2, [r0, #8]
 ; CHECK-NEXT:    ldrd r4, r0, [r0, #16]
 ; CHECK-NEXT:    vmov q0[2], q0[0], lr, r3
-; CHECK-NEXT:    vmov q0[3], q0[1], r12, r2
 ; CHECK-NEXT:    vmov.32 q1[0], r4
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov q0[3], q0[1], r12, r2
+; CHECK-NEXT:    vmovx.f16 s8, s0
 ; CHECK-NEXT:    vmov.32 q1[1], r0
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov.16 q2[0], r3
-; CHECK-NEXT:    vmov.16 q2[1], r2
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmovx.f16 s12, s0
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmovx.f16 s12, s2
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vins.f16 s0, s2
 ; CHECK-NEXT:    vmovx.f16 s12, s4
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmovx.f16 s0, s3
-; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vins.f16 s4, s8
+; CHECK-NEXT:    vmov q2, q0
+; CHECK-NEXT:    vmov.f32 s9, s4
+; CHECK-NEXT:    vmovx.f16 s10, s2
+; CHECK-NEXT:    vins.f16 s10, s12
+; CHECK-NEXT:    vmovx.f16 s12, s1
+; CHECK-NEXT:    vins.f16 s1, s3
+; CHECK-NEXT:    vmov.f32 s11, s1
 ; CHECK-NEXT:    vstrw.32 q2, [r1]
-; CHECK-NEXT:    vmovx.f16 s8, s1
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q2[0], r2
-; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmovx.f16 s0, s5
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmovx.f16 s8, s5
+; CHECK-NEXT:    vins.f16 s5, s12
+; CHECK-NEXT:    vmov.f32 s4, s5
+; CHECK-NEXT:    vmovx.f16 s5, s3
+; CHECK-NEXT:    vins.f16 s5, s8
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov r0, s5
 ; CHECK-NEXT:    strd r2, r0, [r1, #16]
 ; CHECK-NEXT:    pop {r4, pc}
 entry:
@@ -1518,100 +1485,65 @@ entry:
 define void @vst3_v8f16(<8 x half> *%src, <24 x half> *%dst) {
 ; CHECK-LABEL: vst3_v8f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #16
-; CHECK-NEXT:    sub sp, #16
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #32]
-; CHECK-NEXT:    vmov r3, s8
-; CHECK-NEXT:    vmovx.f16 s12, s4
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov.16 q0[0], r3
-; CHECK-NEXT:    vmov.16 q0[1], r2
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov r0, s20
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14}
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q3, [r0]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT:    vmovx.f16 s0, s8
+; CHECK-NEXT:    vmovx.f16 s20, s12
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vmovx.f16 s24, s6
+; CHECK-NEXT:    vmov.f64 d0, d6
+; CHECK-NEXT:    vins.f16 s0, s8
+; CHECK-NEXT:    vmov r0, s4
 ; CHECK-NEXT:    vmov.16 q0[4], r2
 ; CHECK-NEXT:    vdup.32 q4, r0
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    vmov.16 q0[6], r2
-; CHECK-NEXT:    vmov r2, s5
-; CHECK-NEXT:    vmovx.f16 s12, s8
-; CHECK-NEXT:    vmov r0, s17
-; CHECK-NEXT:    vmov.16 q0[7], r2
-; CHECK-NEXT:    vmov r2, s12
-; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.f32 s1, s8
-; CHECK-NEXT:    vmov.16 q3[3], r2
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmovx.f16 s16, s18
-; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmovx.f16 s16, s22
-; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov r0, s16
-; CHECK-NEXT:    vmovx.f16 s16, s6
+; CHECK-NEXT:    vmov.f32 s3, s13
+; CHECK-NEXT:    vins.f16 s17, s20
+; CHECK-NEXT:    vins.f16 s3, s9
+; CHECK-NEXT:    vmovx.f16 s20, s18
+; CHECK-NEXT:    vmov.f32 s1, s12
+; CHECK-NEXT:    vins.f16 s2, s20
+; CHECK-NEXT:    vmovx.f16 s20, s10
+; CHECK-NEXT:    vins.f16 s20, s24
+; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmov.16 q5[3], r0
 ; CHECK-NEXT:    vmovx.f16 s24, s7
-; CHECK-NEXT:    vmov r2, s16
-; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vmov.16 q4[0], r2
-; CHECK-NEXT:    vmov r2, s11
-; CHECK-NEXT:    vmov.16 q4[1], r0
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.16 q4[3], r0
-; CHECK-NEXT:    vmov r0, s24
-; CHECK-NEXT:    vmovx.f16 s24, s23
-; CHECK-NEXT:    vmov.16 q4[6], r0
-; CHECK-NEXT:    vmov r0, s24
-; CHECK-NEXT:    vdup.32 q7, r2
-; CHECK-NEXT:    vmov.16 q4[7], r0
-; CHECK-NEXT:    vmov r2, s29
-; CHECK-NEXT:    vmov.f32 s18, s23
-; CHECK-NEXT:    vmovx.f16 s24, s17
-; CHECK-NEXT:    vmov r0, s24
-; CHECK-NEXT:    vmov.16 q6[2], r2
-; CHECK-NEXT:    vmov.16 q6[3], r0
-; CHECK-NEXT:    vmovx.f16 s28, s30
-; CHECK-NEXT:    vmovx.f16 s4, s10
-; CHECK-NEXT:    vmov.f32 s1, s13
-; CHECK-NEXT:    vmov.f32 s2, s14
+; CHECK-NEXT:    vmovx.f16 s23, s11
+; CHECK-NEXT:    vmov r0, s15
+; CHECK-NEXT:    vins.f16 s23, s24
+; CHECK-NEXT:    vdup.32 q6, r0
+; CHECK-NEXT:    vmov.f32 s22, s7
+; CHECK-NEXT:    vmovx.f16 s28, s21
+; CHECK-NEXT:    vins.f16 s25, s28
+; CHECK-NEXT:    vmovx.f16 s28, s26
+; CHECK-NEXT:    vins.f16 s22, s28
+; CHECK-NEXT:    vmovx.f16 s28, s13
+; CHECK-NEXT:    vmov.f32 s4, s5
+; CHECK-NEXT:    vrev32.16 q2, q2
+; CHECK-NEXT:    vins.f16 s4, s28
+; CHECK-NEXT:    vmovx.f16 s28, s14
+; CHECK-NEXT:    vins.f16 s6, s28
+; CHECK-NEXT:    vmov.f32 s18, s2
+; CHECK-NEXT:    vmov.f32 s7, s6
+; CHECK-NEXT:    vmov.f32 s6, s14
+; CHECK-NEXT:    vmovx.f16 s12, s5
+; CHECK-NEXT:    vins.f16 s9, s12
+; CHECK-NEXT:    vmovx.f16 s12, s10
+; CHECK-NEXT:    vins.f16 s6, s12
+; CHECK-NEXT:    vmov.f32 s26, s22
+; CHECK-NEXT:    vmov.f32 s10, s6
+; CHECK-NEXT:    vmov.f32 s1, s17
+; CHECK-NEXT:    vmov.f32 s21, s25
+; CHECK-NEXT:    vmov.f32 s5, s9
+; CHECK-NEXT:    vmov.f32 s2, s18
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vmov.16 q6[4], r0
-; CHECK-NEXT:    vmov r0, s28
-; CHECK-NEXT:    vmov.16 q6[5], r0
-; CHECK-NEXT:    vmovx.f16 s28, s9
-; CHECK-NEXT:    vmov r0, s21
-; CHECK-NEXT:    vmov r2, s28
-; CHECK-NEXT:    vmov.16 q7[0], r0
-; CHECK-NEXT:    vmov.16 q7[1], r2
-; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    vmov.16 q7[6], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.16 q7[7], r0
-; CHECK-NEXT:    vmov.f32 s17, s25
-; CHECK-NEXT:    vmov.f32 s29, s21
-; CHECK-NEXT:    vmov.f32 s30, s10
-; CHECK-NEXT:    vmovx.f16 s4, s29
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s18, s26
-; CHECK-NEXT:    vrev32.16 q2, q1
-; CHECK-NEXT:    vstrw.32 q4, [r1, #32]
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    vmovx.f16 s8, s10
-; CHECK-NEXT:    vmov.16 q1[2], r2
-; CHECK-NEXT:    vmov.16 q1[3], r0
-; CHECK-NEXT:    vmov r0, s30
-; CHECK-NEXT:    vmov.16 q1[4], r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmov.f32 s29, s5
-; CHECK-NEXT:    vmov.f32 s30, s6
-; CHECK-NEXT:    vstrw.32 q7, [r1, #16]
-; CHECK-NEXT:    add sp, #16
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vmov.f32 s22, s26
+; CHECK-NEXT:    vmov.f32 s6, s10
+; CHECK-NEXT:    vstrw.32 q5, [r1, #32]
+; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14}
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0
@@ -1634,200 +1566,141 @@ define void @vst3_v16f16(<16 x half> *%src, <48 x half> *%dst) {
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    .pad #144
 ; CHECK-NEXT:    sub sp, #144
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov q3, q0
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmov.16 q1[0], r3
-; CHECK-NEXT:    vmovx.f16 s0, s8
-; CHECK-NEXT:    vmov.16 q1[1], r2
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
 ; CHECK-NEXT:    vmovx.f16 s0, s12
-; CHECK-NEXT:    vmov.16 q1[4], r2
-; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    vmov.16 q1[6], r2
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    vmov.16 q1[7], r2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
-; CHECK-NEXT:    vmov.f32 s5, s12
-; CHECK-NEXT:    vmov q5, q3
-; CHECK-NEXT:    vstrw.32 q3, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    vdup.32 q0, r3
-; CHECK-NEXT:    vstrw.32 q1, [sp, #128] @ 16-byte Spill
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmovx.f16 s0, s2
-; CHECK-NEXT:    vmov.16 q3[2], r3
-; CHECK-NEXT:    vstrw.32 q2, [sp, #80] @ 16-byte Spill
-; CHECK-NEXT:    vmov.16 q3[3], r2
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov.16 q3[4], r2
+; CHECK-NEXT:    vmov q6, q2
 ; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vstrw.32 q2, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f64 d0, d4
+; CHECK-NEXT:    vmovx.f16 s20, s14
+; CHECK-NEXT:    vstrw.32 q3, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT:    vins.f16 s0, s12
+; CHECK-NEXT:    vmov.16 q0[4], r2
+; CHECK-NEXT:    vmov.f32 s3, s9
+; CHECK-NEXT:    vins.f16 s3, s13
+; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vmovx.f16 s0, s8
+; CHECK-NEXT:    vmov.f32 s5, s8
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vdup.32 q4, r2
+; CHECK-NEXT:    vmov r2, s15
+; CHECK-NEXT:    vins.f16 s17, s0
 ; CHECK-NEXT:    vmovx.f16 s0, s18
-; CHECK-NEXT:    vmov.16 q3[5], r2
-; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vins.f16 s6, s0
 ; CHECK-NEXT:    vmovx.f16 s0, s10
-; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vins.f16 s20, s0
 ; CHECK-NEXT:    vmovx.f16 s0, s11
-; CHECK-NEXT:    vmov.16 q1[0], r3
-; CHECK-NEXT:    vmov r3, s23
-; CHECK-NEXT:    vmov.16 q1[1], r2
-; CHECK-NEXT:    vmov r2, s11
-; CHECK-NEXT:    vmov.16 q1[3], r2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmovx.f16 s0, s19
-; CHECK-NEXT:    vmov.16 q1[6], r2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #16]
-; CHECK-NEXT:    vmov.16 q1[7], r2
-; CHECK-NEXT:    vstrw.32 q3, [sp, #112] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s6, s19
-; CHECK-NEXT:    vmovx.f16 s0, s5
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vdup.32 q0, r3
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmovx.f16 s0, s2
-; CHECK-NEXT:    vmov.16 q2[2], r3
-; CHECK-NEXT:    vmov r3, s20
-; CHECK-NEXT:    vmov.16 q2[3], r2
-; CHECK-NEXT:    vmov.16 q7[0], r3
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
+; CHECK-NEXT:    vmov.16 q5[3], r2
+; CHECK-NEXT:    vmov r2, s27
+; CHECK-NEXT:    vmovx.f16 s23, s15
+; CHECK-NEXT:    vmov.f32 s18, s6
+; CHECK-NEXT:    vins.f16 s23, s0
+; CHECK-NEXT:    vstrw.32 q1, [sp, #112] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s22, s11
+; CHECK-NEXT:    vmovx.f16 s0, s21
+; CHECK-NEXT:    vdup.32 q1, r2
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
+; CHECK-NEXT:    vins.f16 s5, s0
+; CHECK-NEXT:    vmovx.f16 s0, s6
+; CHECK-NEXT:    vins.f16 s22, s0
+; CHECK-NEXT:    vmovx.f16 s0, s12
+; CHECK-NEXT:    vmov.f32 s6, s22
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #80]
 ; CHECK-NEXT:    vstrw.32 q1, [sp, #96] @ 16-byte Spill
-; CHECK-NEXT:    vstrw.32 q4, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vmov.16 q2[4], r2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov.16 q2[5], r2
-; CHECK-NEXT:    vstrw.32 q2, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov r2, s8
-; CHECK-NEXT:    vmovx.f16 s0, s8
-; CHECK-NEXT:    vmov.16 q7[1], r2
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
 ; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    vstrw.32 q4, [sp, #128] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f64 d14, d2
+; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmovx.f16 s0, s4
+; CHECK-NEXT:    vins.f16 s28, s12
+; CHECK-NEXT:    vmov r0, s24
 ; CHECK-NEXT:    vmov.16 q7[4], r2
-; CHECK-NEXT:    vmov r2, s21
-; CHECK-NEXT:    vmov.16 q7[6], r2
-; CHECK-NEXT:    vmov r2, s9
-; CHECK-NEXT:    vmovx.f16 s0, s20
-; CHECK-NEXT:    vmov.16 q7[7], r2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vdup.32 q0, r0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmovx.f16 s0, s2
-; CHECK-NEXT:    vmov.16 q1[2], r0
-; CHECK-NEXT:    vmov.f32 s29, s20
-; CHECK-NEXT:    vmov.16 q1[3], r2
-; CHECK-NEXT:    vmov r0, s30
-; CHECK-NEXT:    vmov.16 q1[4], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmovx.f16 s0, s14
-; CHECK-NEXT:    vstrw.32 q1, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vmov q1, q2
-; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vmov.f32 s8, s9
+; CHECK-NEXT:    vmov.f32 s31, s5
+; CHECK-NEXT:    vins.f16 s31, s13
+; CHECK-NEXT:    vmov.f32 s29, s4
+; CHECK-NEXT:    vdup.32 q1, r0
+; CHECK-NEXT:    vins.f16 s5, s0
 ; CHECK-NEXT:    vmovx.f16 s0, s6
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmovx.f16 s0, s7
-; CHECK-NEXT:    vmov.16 q2[0], r2
-; CHECK-NEXT:    vmov r2, s23
-; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmovx.f16 s0, s15
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov.f32 s10, s15
-; CHECK-NEXT:    vmovx.f16 s0, s9
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vdup.32 q0, r2
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmovx.f16 s0, s2
-; CHECK-NEXT:    vmov.16 q6[2], r2
-; CHECK-NEXT:    vmov.16 q6[3], r0
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov.16 q6[4], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q6[5], r0
-; CHECK-NEXT:    vmov r0, s13
-; CHECK-NEXT:    vmovx.f16 s0, s21
-; CHECK-NEXT:    vmov.16 q4[0], r0
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmovx.f16 s0, s22
-; CHECK-NEXT:    vmov.16 q4[1], r2
-; CHECK-NEXT:    vmov r0, s14
-; CHECK-NEXT:    vmov.16 q4[6], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q4[7], r0
-; CHECK-NEXT:    vmov.f32 s9, s25
-; CHECK-NEXT:    vmov.f32 s17, s13
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s18, s22
-; CHECK-NEXT:    vmovx.f16 s0, s17
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vrev32.16 q0, q1
-; CHECK-NEXT:    vmov r2, s1
-; CHECK-NEXT:    vmovx.f16 s0, s2
-; CHECK-NEXT:    vmov.16 q1[2], r2
-; CHECK-NEXT:    vmov.f32 s10, s26
+; CHECK-NEXT:    vins.f16 s30, s0
+; CHECK-NEXT:    vmov q0, q3
+; CHECK-NEXT:    vmov.f32 s6, s30
+; CHECK-NEXT:    vmovx.f16 s12, s26
+; CHECK-NEXT:    vstrw.32 q1, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT:    vmovx.f16 s4, s2
+; CHECK-NEXT:    vins.f16 s4, s12
+; CHECK-NEXT:    vmov r0, s3
 ; CHECK-NEXT:    vmov.16 q1[3], r0
-; CHECK-NEXT:    vldrw.u32 q6, [sp, #112] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q2, [r1, #80]
-; CHECK-NEXT:    vmov r0, s18
-; CHECK-NEXT:    vmov.16 q1[4], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmovx.f16 s0, s13
-; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmovx.f16 s0, s14
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.16 q5[0], r0
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.16 q5[1], r2
-; CHECK-NEXT:    vmov.16 q5[6], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q5[7], r0
-; CHECK-NEXT:    vmov.f32 s21, s5
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s22, s14
-; CHECK-NEXT:    vmovx.f16 s0, s21
-; CHECK-NEXT:    vmov r0, s0
+; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmovx.f16 s7, s3
+; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmovx.f16 s12, s27
+; CHECK-NEXT:    vins.f16 s7, s12
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    vmov.f32 s6, s27
+; CHECK-NEXT:    vmovx.f16 s16, s5
+; CHECK-NEXT:    vdup.32 q3, r0
+; CHECK-NEXT:    vmov.f32 s24, s25
+; CHECK-NEXT:    vins.f16 s13, s16
+; CHECK-NEXT:    vmovx.f16 s16, s14
+; CHECK-NEXT:    vins.f16 s6, s16
+; CHECK-NEXT:    vmovx.f16 s16, s1
+; CHECK-NEXT:    vins.f16 s24, s16
+; CHECK-NEXT:    vmovx.f16 s16, s2
+; CHECK-NEXT:    vins.f16 s26, s16
+; CHECK-NEXT:    vmov.f32 s14, s6
+; CHECK-NEXT:    vmov.f32 s27, s26
+; CHECK-NEXT:    vstrw.32 q3, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s26, s2
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmovx.f16 s16, s25
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vrev32.16 q0, q0
+; CHECK-NEXT:    vins.f16 s1, s16
+; CHECK-NEXT:    vmovx.f16 s16, s2
+; CHECK-NEXT:    vins.f16 s26, s16
+; CHECK-NEXT:    vmovx.f16 s16, s13
+; CHECK-NEXT:    vmov.f32 s2, s26
+; CHECK-NEXT:    vins.f16 s8, s16
+; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT:    vrev32.16 q3, q0
-; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    vmovx.f16 s12, s14
-; CHECK-NEXT:    vmov.16 q0[2], r2
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #128] @ 16-byte Reload
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov.f32 s13, s25
-; CHECK-NEXT:    vmov.f32 s14, s26
-; CHECK-NEXT:    vldrw.u32 q6, [sp, #96] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s25, s5
+; CHECK-NEXT:    vmovx.f16 s16, s14
+; CHECK-NEXT:    vins.f16 s10, s16
+; CHECK-NEXT:    vrev32.16 q4, q0
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s11, s10
+; CHECK-NEXT:    vmov.f32 s10, s14
+; CHECK-NEXT:    vmovx.f16 s12, s9
+; CHECK-NEXT:    vmov.f32 s5, s1
+; CHECK-NEXT:    vins.f16 s17, s12
+; CHECK-NEXT:    vmov.f32 s6, s2
+; CHECK-NEXT:    vmovx.f16 s12, s18
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #128] @ 16-byte Reload
+; CHECK-NEXT:    vins.f16 s10, s12
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s18, s10
+; CHECK-NEXT:    vmov.f32 s13, s1
+; CHECK-NEXT:    vstrw.32 q1, [r1, #80]
+; CHECK-NEXT:    vmov.f32 s14, s2
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s9, s17
 ; CHECK-NEXT:    vstrw.32 q3, [r1]
 ; CHECK-NEXT:    vmov.f32 s21, s1
-; CHECK-NEXT:    vmov.f32 s26, s6
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s22, s2
-; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s29, s5
-; CHECK-NEXT:    vstrw.32 q6, [r1, #32]
-; CHECK-NEXT:    vmov.f32 s17, s1
-; CHECK-NEXT:    vstrw.32 q5, [r1, #16]
-; CHECK-NEXT:    vmov.f32 s30, s6
-; CHECK-NEXT:    vmov.f32 s18, s2
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q5, [r1, #32]
+; CHECK-NEXT:    vmov.f32 s29, s1
+; CHECK-NEXT:    vmov.f32 s30, s2
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s10, s18
 ; CHECK-NEXT:    vstrw.32 q7, [r1, #48]
-; CHECK-NEXT:    vstrw.32 q4, [r1, #64]
+; CHECK-NEXT:    vmov.f32 s25, s1
+; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s26, s2
+; CHECK-NEXT:    vstrw.32 q6, [r1, #64]
 ; CHECK-NEXT:    add sp, #144
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
index 3a2dc53feb2e..9ead8139f3e4 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
@@ -422,76 +422,64 @@ define void @vst4_v8i16_align1(<8 x i16> *%src, <32 x i16> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov.u16 r0, q3[2]
-; CHECK-NEXT:    vmov.u16 r2, q2[2]
-; CHECK-NEXT:    vmov.16 q4[2], r0
-; CHECK-NEXT:    vmov.16 q0[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[2]
-; CHECK-NEXT:    vmov.16 q4[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[3]
-; CHECK-NEXT:    vmov.16 q0[1], r2
-; CHECK-NEXT:    vmov.16 q4[6], r0
-; CHECK-NEXT:    vmov.u16 r2, q2[3]
-; CHECK-NEXT:    vmov.16 q4[7], r0
-; CHECK-NEXT:    vmov.16 q0[4], r2
+; CHECK-NEXT:    vldrw.u32 q4, [r0]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vmov.f64 d0, d8
+; CHECK-NEXT:    vmov.u16 r0, q4[1]
+; CHECK-NEXT:    vmov.f32 s21, s5
+; CHECK-NEXT:    vins.f16 s0, s8
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vins.f16 s21, s21
 ; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[0]
-; CHECK-NEXT:    vmov.f32 s1, s17
-; CHECK-NEXT:    vmov.f32 s3, s19
-; CHECK-NEXT:    vmov.16 q4[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vstrb.8 q0, [r1, #16]
-; CHECK-NEXT:    vmov.16 q4[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[1]
-; CHECK-NEXT:    vmov.16 q4[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[0]
-; CHECK-NEXT:    vmov.16 q5[2], r0
-; CHECK-NEXT:    vmov.16 q5[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[1]
+; CHECK-NEXT:    vmov.f32 s12, s17
 ; CHECK-NEXT:    vmov.16 q5[6], r0
+; CHECK-NEXT:    vins.f16 s12, s9
 ; CHECK-NEXT:    vmov.16 q5[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q4[3]
+; CHECK-NEXT:    vmov.f32 s25, s4
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[3]
+; CHECK-NEXT:    vins.f16 s25, s25
+; CHECK-NEXT:    vmov.16 q3[5], r0
 ; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.16 q4[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[6]
-; CHECK-NEXT:    vmov.f32 s17, s21
-; CHECK-NEXT:    vmov.f32 s19, s23
-; CHECK-NEXT:    vmov.16 q5[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-NEXT:    vstrb.8 q4, [r1]
-; CHECK-NEXT:    vmov.16 q5[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[7]
-; CHECK-NEXT:    vmov.16 q5[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[6]
-; CHECK-NEXT:    vmov.16 q6[2], r0
-; CHECK-NEXT:    vmov.16 q6[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[7]
 ; CHECK-NEXT:    vmov.16 q6[6], r0
+; CHECK-NEXT:    vmov.f32 s13, s21
 ; CHECK-NEXT:    vmov.16 q6[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[1]
+; CHECK-NEXT:    vmov.16 q0[5], r0
+; CHECK-NEXT:    vmov.f32 s15, s23
+; CHECK-NEXT:    vmov.f32 s20, s19
+; CHECK-NEXT:    vmov.u16 r0, q4[7]
+; CHECK-NEXT:    vmov.f32 s1, s25
+; CHECK-NEXT:    vins.f16 s20, s11
+; CHECK-NEXT:    vmov.f32 s3, s27
+; CHECK-NEXT:    vmov.16 q5[4], r0
+; CHECK-NEXT:    vmov.f32 s25, s7
 ; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vins.f16 s25, s25
+; CHECK-NEXT:    vstrb.8 q3, [r1, #16]
+; CHECK-NEXT:    vmov.16 q6[6], r0
+; CHECK-NEXT:    vstrb.8 q0, [r1]
+; CHECK-NEXT:    vmov.16 q6[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[7]
 ; CHECK-NEXT:    vmov.16 q5[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[4]
+; CHECK-NEXT:    vmov.u16 r0, q4[5]
 ; CHECK-NEXT:    vmov.f32 s21, s25
 ; CHECK-NEXT:    vmov.f32 s23, s27
-; CHECK-NEXT:    vmov.16 q6[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vmov.f64 d12, d9
 ; CHECK-NEXT:    vstrb.8 q5, [r1, #48]
-; CHECK-NEXT:    vmov.16 q6[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmov.f32 s17, s6
+; CHECK-NEXT:    vins.f16 s24, s10
 ; CHECK-NEXT:    vmov.16 q6[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[4]
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[5]
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vins.f16 s17, s17
 ; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.16 q4[6], r0
+; CHECK-NEXT:    vmov.16 q4[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
 ; CHECK-NEXT:    vmov.16 q6[5], r0
-; CHECK-NEXT:    vmov.f32 s25, s9
-; CHECK-NEXT:    vmov.f32 s27, s11
+; CHECK-NEXT:    vmov.f32 s25, s17
+; CHECK-NEXT:    vmov.f32 s27, s19
 ; CHECK-NEXT:    vstrb.8 q6, [r1, #32]
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    bx lr
@@ -1057,28 +1045,18 @@ define void @vst4_v2f16(<2 x half> *%src, <8 x half> *%dst) {
 ; CHECK-LABEL: vst4_v2f16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldmia r0, {s4, s5}
-; CHECK-NEXT:    vmov r2, s5
+; CHECK-NEXT:    vmovx.f16 s12, s5
 ; CHECK-NEXT:    ldr r0, [r0, #8]
-; CHECK-NEXT:    vmov r3, s4
-; CHECK-NEXT:    vmovx.f16 s12, s4
-; CHECK-NEXT:    vmov.16 q0[0], r3
+; CHECK-NEXT:    vmov.f64 d0, d2
 ; CHECK-NEXT:    vdup.32 q2, r0
-; CHECK-NEXT:    vmov.16 q0[1], r2
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmovx.f16 s4, s5
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmovx.f16 s4, s8
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vins.f16 s0, s5
+; CHECK-NEXT:    vmov.f32 s1, s8
+; CHECK-NEXT:    vins.f16 s1, s9
+; CHECK-NEXT:    vmovx.f16 s2, s4
 ; CHECK-NEXT:    vmovx.f16 s4, s9
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vins.f16 s2, s12
+; CHECK-NEXT:    vmovx.f16 s3, s8
+; CHECK-NEXT:    vins.f16 s3, s4
 ; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1105,52 +1083,32 @@ define void @vst4_v4f16(<4 x half> *%src, <16 x half> *%dst) {
 ; CHECK-NEXT:    ldrd lr, r12, [r0]
 ; CHECK-NEXT:    ldrd r3, r2, [r0, #8]
 ; CHECK-NEXT:    ldrd r4, r0, [r0, #16]
-; CHECK-NEXT:    vmov q1[2], q1[0], lr, r3
-; CHECK-NEXT:    vmov q1[3], q1[1], r12, r2
-; CHECK-NEXT:    vmov q0[2], q0[0], r4, r4
-; CHECK-NEXT:    vmov r3, s5
-; CHECK-NEXT:    vmov q0[3], q0[1], r0, r0
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    vmov.16 q2[0], r3
-; CHECK-NEXT:    vmov.16 q2[1], r2
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    vmovx.f16 s12, s5
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmovx.f16 s12, s7
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmovx.f16 s12, s1
-; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov q0[2], q0[0], lr, r3
+; CHECK-NEXT:    vmov q0[3], q0[1], r12, r2
+; CHECK-NEXT:    vmov q1[2], q1[0], r4, r4
+; CHECK-NEXT:    vmov.f32 s8, s1
+; CHECK-NEXT:    vmov q1[3], q1[1], r0, r0
+; CHECK-NEXT:    vins.f16 s8, s3
 ; CHECK-NEXT:    vmovx.f16 s12, s3
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov.f32 s9, s5
+; CHECK-NEXT:    vins.f16 s9, s7
+; CHECK-NEXT:    vmovx.f16 s10, s1
+; CHECK-NEXT:    vins.f16 s10, s12
+; CHECK-NEXT:    vmovx.f16 s12, s7
+; CHECK-NEXT:    vmovx.f16 s11, s5
+; CHECK-NEXT:    vins.f16 s11, s12
 ; CHECK-NEXT:    vstrh.16 q2, [r1, #16]
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.16 q2[0], r2
-; CHECK-NEXT:    vmovx.f16 s12, s4
-; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov r0, s2
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmovx.f16 s8, s0
+; CHECK-NEXT:    vins.f16 s0, s2
+; CHECK-NEXT:    vmovx.f16 s3, s4
+; CHECK-NEXT:    vins.f16 s4, s6
+; CHECK-NEXT:    vmov.f32 s1, s4
+; CHECK-NEXT:    vmovx.f16 s10, s2
+; CHECK-NEXT:    vins.f16 s8, s10
 ; CHECK-NEXT:    vmovx.f16 s4, s6
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmovx.f16 s0, s2
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vstrh.16 q2, [r1]
+; CHECK-NEXT:    vmov.f32 s2, s8
+; CHECK-NEXT:    vins.f16 s3, s4
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    pop {r4, pc}
 entry:
   %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
@@ -1238,101 +1196,56 @@ entry:
 define void @vst4_v8f16_align1(<8 x half> *%src, <32 x half> *%dst) {
 ; CHECK-LABEL: vst4_v8f16_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #32
-; CHECK-NEXT:    sub sp, #32
-; CHECK-NEXT:    vldrw.u32 q7, [r0]
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #32]
-; CHECK-NEXT:    vmov r3, s30
-; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    vmov.16 q1[0], r3
-; CHECK-NEXT:    vmovx.f16 s0, s30
-; CHECK-NEXT:    vmov.16 q1[1], r2
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmovx.f16 s0, s21
-; CHECK-NEXT:    vmov.16 q1[4], r2
-; CHECK-NEXT:    vmov r0, s21
-; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vmov.16 q1[2], r0
-; CHECK-NEXT:    vmov.16 q1[3], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q1[6], r0
-; CHECK-NEXT:    vmov r2, s29
-; CHECK-NEXT:    vmov.16 q1[7], r0
-; CHECK-NEXT:    vmov.16 q3[0], r2
-; CHECK-NEXT:    vstrw.32 q1, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vmov q1, q2
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmovx.f16 s0, s29
-; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmovx.f16 s0, s5
-; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmovx.f16 s0, s20
-; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vmov.16 q4[2], r0
-; CHECK-NEXT:    vmov r2, s4
-; CHECK-NEXT:    vmov.16 q4[3], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q4[6], r0
-; CHECK-NEXT:    vmovx.f16 s0, s28
-; CHECK-NEXT:    vmov.16 q4[7], r0
-; CHECK-NEXT:    vmov r0, s28
-; CHECK-NEXT:    vmov.16 q6[0], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q6[1], r2
-; CHECK-NEXT:    vmovx.f16 s0, s4
-; CHECK-NEXT:    vmov.16 q6[4], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q6[5], r0
-; CHECK-NEXT:    vmov r0, s23
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmovx.f16 s0, s23
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov r2, s31
-; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.16 q0[0], r2
-; CHECK-NEXT:    vmovx.f16 s28, s31
-; CHECK-NEXT:    vmov.16 q0[1], r0
-; CHECK-NEXT:    vmov r0, s28
-; CHECK-NEXT:    vmovx.f16 s28, s7
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov r0, s28
-; CHECK-NEXT:    vmovx.f16 s20, s22
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov r0, s22
-; CHECK-NEXT:    vmov.16 q7[2], r0
-; CHECK-NEXT:    vmov.f32 s25, s17
-; CHECK-NEXT:    vmov.16 q7[3], r0
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vmov.16 q7[6], r0
-; CHECK-NEXT:    vmovx.f16 s20, s6
-; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov.16 q7[7], r0
-; CHECK-NEXT:    vmov r0, s20
-; CHECK-NEXT:    vldrw.u32 q5, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmov.f32 s1, s9
-; CHECK-NEXT:    vmov.f32 s13, s21
-; CHECK-NEXT:    vmov.f32 s5, s29
-; CHECK-NEXT:    vmov.f32 s15, s23
-; CHECK-NEXT:    vmov.f32 s27, s19
-; CHECK-NEXT:    vstrb.8 q3, [r1, #16]
-; CHECK-NEXT:    vmov.f32 s3, s11
-; CHECK-NEXT:    vstrb.8 q6, [r1]
-; CHECK-NEXT:    vmov.f32 s7, s31
-; CHECK-NEXT:    vstrb.8 q0, [r1, #48]
-; CHECK-NEXT:    vstrb.8 q1, [r1, #32]
-; CHECK-NEXT:    add sp, #32
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12}
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q5, [r0]
+; CHECK-NEXT:    vmovx.f16 s2, s21
+; CHECK-NEXT:    vins.f16 s21, s17
+; CHECK-NEXT:    vmov.f32 s0, s21
+; CHECK-NEXT:    vmovx.f16 s4, s17
+; CHECK-NEXT:    vins.f16 s2, s4
+; CHECK-NEXT:    vmovx.f16 s8, s20
+; CHECK-NEXT:    vmovx.f16 s4, s16
+; CHECK-NEXT:    vins.f16 s20, s16
+; CHECK-NEXT:    vins.f16 s8, s4
+; CHECK-NEXT:    vmov q1, q5
+; CHECK-NEXT:    vmov.f32 s6, s8
+; CHECK-NEXT:    vmovx.f16 s10, s23
+; CHECK-NEXT:    vins.f16 s23, s19
+; CHECK-NEXT:    vmovx.f16 s12, s19
+; CHECK-NEXT:    vmov.f32 s8, s23
+; CHECK-NEXT:    vmovx.f16 s16, s18
+; CHECK-NEXT:    vins.f16 s10, s12
+; CHECK-NEXT:    vmovx.f16 s14, s22
+; CHECK-NEXT:    vins.f16 s22, s18
+; CHECK-NEXT:    vmov.f32 s12, s22
+; CHECK-NEXT:    vins.f16 s14, s16
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
+; CHECK-NEXT:    vmov.f32 s13, s18
+; CHECK-NEXT:    vmovx.f16 s24, s17
+; CHECK-NEXT:    vins.f16 s17, s17
+; CHECK-NEXT:    vins.f16 s24, s24
+; CHECK-NEXT:    vmov q5, q4
+; CHECK-NEXT:    vmovx.f16 s7, s16
+; CHECK-NEXT:    vmov.f32 s23, s24
+; CHECK-NEXT:    vins.f16 s16, s16
+; CHECK-NEXT:    vmov.f32 s5, s16
+; CHECK-NEXT:    vmovx.f16 s11, s19
+; CHECK-NEXT:    vins.f16 s19, s19
+; CHECK-NEXT:    vins.f16 s13, s13
+; CHECK-NEXT:    vmov.f32 s9, s19
+; CHECK-NEXT:    vmovx.f16 s15, s18
+; CHECK-NEXT:    vmov.f32 s1, s21
+; CHECK-NEXT:    vins.f16 s7, s7
+; CHECK-NEXT:    vins.f16 s11, s11
+; CHECK-NEXT:    vins.f16 s15, s15
+; CHECK-NEXT:    vstrb.8 q2, [r1, #48]
+; CHECK-NEXT:    vstrb.8 q3, [r1, #32]
+; CHECK-NEXT:    vmov.f32 s3, s23
+; CHECK-NEXT:    vstrb.8 q1, [r1]
+; CHECK-NEXT:    vstrb.8 q0, [r1, #16]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12}
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0


        


More information about the llvm-commits mailing list