[llvm] 21a4faa - [ARM] Move double vector insert patterns using vins to DAG combine

David Green via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 22 01:30:05 PST 2021


Author: David Green
Date: 2021-02-22T09:29:47Z
New Revision: 21a4faab60c34b8a8c4d09a5ffac50ded8163208

URL: https://github.com/llvm/llvm-project/commit/21a4faab60c34b8a8c4d09a5ffac50ded8163208
DIFF: https://github.com/llvm/llvm-project/commit/21a4faab60c34b8a8c4d09a5ffac50ded8163208.diff

LOG: [ARM] Move double vector insert patterns using vins to DAG combine

This removes the existing patterns for inserting two lanes into an
f16/i16 vector register using VINS, instead using a DAG combine to
pattern match the same code sequences. The tablegen patterns were
already on the large side (foreach LANE = [0, 2, 4, 6]) and were not
handling all the cases they could. Moving that to a DAG combine, whilst
not less code, allows us to better control and expand the selection of
VINSs. Additionally this allows us to remove the AddedComplexity on
VCVTT.

The extra trick that this has learned in the process is to move two
adjacent lanes using a single f32 vmov, allowing some extra
inefficiencies to be removed.

Differential Revision: https://reviews.llvm.org/D96876

Added: 
    

Modified: 
    llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
    llvm/lib/Target/ARM/ARMInstrMVE.td
    llvm/lib/Target/ARM/ARMInstrVFP.td
    llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
    llvm/test/CodeGen/Thumb2/mve-div-expand.ll
    llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
    llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
    llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
    llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
    llvm/test/CodeGen/Thumb2/mve-minmax.ll
    llvm/test/CodeGen/Thumb2/mve-shuffle.ll
    llvm/test/CodeGen/Thumb2/mve-shufflemov.ll
    llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
    llvm/test/CodeGen/Thumb2/mve-vcvt.ll
    llvm/test/CodeGen/Thumb2/mve-vld2.ll
    llvm/test/CodeGen/Thumb2/mve-vld3.ll
    llvm/test/CodeGen/Thumb2/mve-vld4.ll
    llvm/test/CodeGen/Thumb2/mve-vldst4.ll
    llvm/test/CodeGen/Thumb2/mve-vst2.ll
    llvm/test/CodeGen/Thumb2/mve-vst3.ll
    llvm/test/CodeGen/Thumb2/mve-vst4.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 2a9a31dab74f..0da62aea3fad 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -297,6 +297,8 @@ class ARMDAGToDAGISel : public SelectionDAGISel {
   /// Try to select SBFX/UBFX instructions for ARM.
   bool tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned);
 
+  bool tryInsertVectorElt(SDNode *N);
+
   // Select special operations if node forms integer ABS pattern
   bool tryABSOp(SDNode *N);
 
@@ -3022,6 +3024,107 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic,
   CurDAG->RemoveDeadNode(N);
 }
 
+bool ARMDAGToDAGISel::tryInsertVectorElt(SDNode *N) {
+  if (!Subtarget->hasMVEIntegerOps())
+    return false;
+
+  SDLoc dl(N);
+
+  // We are trying to use VMOV/VMOVX/VINS to more efficiently lower insert and
+  // extracts of v8f16 and v8i16 vectors. Check that we have two adjacent
+  // inserts of the correct type:
+  SDValue Ins1 = SDValue(N, 0);
+  SDValue Ins2 = N->getOperand(0);
+  EVT VT = Ins1.getValueType();
+  if (Ins2.getOpcode() != ISD::INSERT_VECTOR_ELT || !Ins2.hasOneUse() ||
+      !isa<ConstantSDNode>(Ins1.getOperand(2)) ||
+      !isa<ConstantSDNode>(Ins2.getOperand(2)) ||
+      (VT != MVT::v8f16 && VT != MVT::v8i16) || (Ins2.getValueType() != VT))
+    return false;
+
+  unsigned Lane1 = Ins1.getConstantOperandVal(2);
+  unsigned Lane2 = Ins2.getConstantOperandVal(2);
+  if (Lane2 % 2 != 0 || Lane1 != Lane2 + 1)
+    return false;
+
+  // If the inserted values will be able to use T/B already, leave it to the
+  // existing tablegen patterns. For example VCVTT/VCVTB.
+  SDValue Val1 = Ins1.getOperand(1);
+  SDValue Val2 = Ins2.getOperand(1);
+  if (Val1.getOpcode() == ISD::FP_ROUND || Val2.getOpcode() == ISD::FP_ROUND)
+    return false;
+
+  // Check if the inserted values are both extracts.
+  if ((Val1.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+       Val1.getOpcode() == ARMISD::VGETLANEu) &&
+      (Val2.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+       Val2.getOpcode() == ARMISD::VGETLANEu) &&
+      isa<ConstantSDNode>(Val1.getOperand(1)) &&
+      isa<ConstantSDNode>(Val2.getOperand(1)) &&
+      (Val1.getOperand(0).getValueType() == MVT::v8f16 ||
+       Val1.getOperand(0).getValueType() == MVT::v8i16) &&
+      (Val2.getOperand(0).getValueType() == MVT::v8f16 ||
+       Val2.getOperand(0).getValueType() == MVT::v8i16)) {
+    unsigned ExtractLane1 = Val1.getConstantOperandVal(1);
+    unsigned ExtractLane2 = Val2.getConstantOperandVal(1);
+
+    // If the two extracted lanes are from the same place and adjacent, this
+    // simplifies into a f32 lane move.
+    if (Val1.getOperand(0) == Val2.getOperand(0) && ExtractLane2 % 2 == 0 &&
+        ExtractLane1 == ExtractLane2 + 1) {
+      SDValue NewExt = CurDAG->getTargetExtractSubreg(
+          ARM::ssub_0 + ExtractLane2 / 2, dl, MVT::f32, Val1.getOperand(0));
+      SDValue NewIns = CurDAG->getTargetInsertSubreg(
+          ARM::ssub_0 + Lane2 / 2, dl, VT, Ins2.getOperand(0),
+          NewExt);
+      ReplaceUses(Ins1, NewIns);
+      return true;
+    }
+
+    // Else v8i16 pattern of an extract and an insert, with a optional vmovx for
+    // extracting odd lanes.
+    if (VT == MVT::v8i16) {
+      SDValue Inp1 = CurDAG->getTargetExtractSubreg(
+          ARM::ssub_0 + ExtractLane1 / 2, dl, MVT::f32, Val1.getOperand(0));
+      SDValue Inp2 = CurDAG->getTargetExtractSubreg(
+          ARM::ssub_0 + ExtractLane2 / 2, dl, MVT::f32, Val2.getOperand(0));
+      if (ExtractLane1 % 2 != 0)
+        Inp1 = SDValue(CurDAG->getMachineNode(ARM::VMOVH, dl, MVT::f32, Inp1), 0);
+      if (ExtractLane2 % 2 != 0)
+        Inp2 = SDValue(CurDAG->getMachineNode(ARM::VMOVH, dl, MVT::f32, Inp2), 0);
+      SDNode *VINS = CurDAG->getMachineNode(ARM::VINSH, dl, MVT::f32, Inp2, Inp1);
+      SDValue NewIns =
+          CurDAG->getTargetInsertSubreg(ARM::ssub_0 + Lane2 / 2, dl, MVT::v4f32,
+                                        Ins2.getOperand(0), SDValue(VINS, 0));
+      ReplaceUses(Ins1, NewIns);
+      return true;
+    }
+  }
+
+  // The inserted values are not extracted - if they are f16 then insert them
+  // directly using a VINS.
+  if (VT == MVT::v8f16) {
+    auto F32RC = CurDAG->getTargetConstant(ARM::SPRRegClassID, dl, MVT::i32);
+    SDNode *Val1Copy = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+                                              dl, MVT::f32, Val1, F32RC);
+    SDNode *Val2Copy = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+                                              dl, MVT::f32, Val2, F32RC);
+    auto MQPRRC = CurDAG->getTargetConstant(ARM::MQPRRegClassID, dl, MVT::i32);
+    SDNode *VecCopy =
+        CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, MVT::v4f32,
+                               Ins2.getOperand(0), MQPRRC);
+
+    SDNode *VINS = CurDAG->getMachineNode(ARM::VINSH, dl, MVT::f32, Val2, Val1);
+    SDValue NewIns =
+        CurDAG->getTargetInsertSubreg(ARM::ssub_0 + Lane2 / 2, dl, MVT::v4f32,
+                                      Ins2.getOperand(0), SDValue(VINS, 0));
+    ReplaceUses(Ins1, NewIns);
+    return true;
+  }
+
+  return false;
+}
+
 bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) {
   if (!Subtarget->hasV6T2Ops())
     return false;
@@ -3443,6 +3546,11 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
       return;
     }
   }
+  case ISD::INSERT_VECTOR_ELT: {
+    if (tryInsertVectorElt(N))
+      return;
+    break;
+  }
   case ISD::SRL:
     if (tryV6T2BitfieldExtractOp(N, false))
       return;

diff  --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index a728136b32bb..7d1c9017e3dc 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -1919,44 +1919,6 @@ let Predicates = [HasMVEInt] in {
             (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), (f16 HPR:$src), ssub_0)>;
   def : Pat<(v8f16 (scalar_to_vector GPR:$src)),
             (MVE_VMOV_to_lane_16 (v8f16 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
-
-  foreach LANE = [0, 2, 4, 6] in {
-    defvar SSUB = !cast<SubRegIndex>("ssub_"#!srl(LANE, 1));
-
-    // v8f16 pattern for inserting two lanes using a VINS
-    def : Pat<(insertelt (insertelt (v8f16 MQPR:$srcV), (f16 HPR:$src1), LANE),
-                         (f16 HPR:$src2), !add(LANE,1)),
-              (COPY_TO_REGCLASS (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$srcV, MQPR)),
-                                  (VINSH (COPY_TO_REGCLASS HPR:$src1, SPR),
-                                         (COPY_TO_REGCLASS HPR:$src2, SPR)),
-                                 SSUB), MQPR)>;
-
-    // v8i16 pattern for extracting 2 even lane elements and inserting them using a VINS
-    def : Pat<(ARMinsertelt (ARMinsertelt (v8i16 MQPR:$srcV),
-                                          (ARMvgetlaneu (v8i16 MQPR:$src1), imm_even:$lane1),
-                                          LANE),
-                           (ARMvgetlaneu (v8i16 MQPR:$src2), imm_even:$lane2),
-                           !add(LANE,1)),
-              (COPY_TO_REGCLASS (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$srcV, MQPR)),
-                                  (VINSH (EXTRACT_SUBREG (v8f16 (COPY_TO_REGCLASS MQPR:$src1, MQPR)),
-                                                         (SSubReg_f16_reg imm_even:$lane1)),
-                                         (EXTRACT_SUBREG (v8f16 (COPY_TO_REGCLASS MQPR:$src2, MQPR)),
-                                                         (SSubReg_f16_reg imm_even:$lane2))),
-                                 SSUB), MQPR)>;
-
-    // v8i16 pattern for extracting an element using VMOVX and inserting another using a VINS
-    def : Pat<(ARMinsertelt (ARMinsertelt (v8i16 MQPR:$srcV),
-                                          (ARMvgetlaneu (v8i16 MQPR:$src1), imm_odd:$lane1),
-                                          LANE),
-                            (ARMvgetlaneu (v8i16 MQPR:$src2), imm_even:$lane2),
-                            !add(LANE,1)),
-              (COPY_TO_REGCLASS (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$srcV, MQPR)),
-                                  (VINSH (VMOVH (EXTRACT_SUBREG (v8f16 (COPY_TO_REGCLASS MQPR:$src1, MQPR)),
-                                                                (SSubReg_f16_reg imm_odd:$lane1))),
-                                                (EXTRACT_SUBREG (v8f16 (COPY_TO_REGCLASS MQPR:$src2, MQPR)),
-                                                                (SSubReg_f16_reg imm_even:$lane2))),
-                                 SSUB), MQPR)>;
-  }
 }
 
 // end of mve_bit instructions

diff  --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td
index 146161801c56..bcd6433a579b 100644
--- a/llvm/lib/Target/ARM/ARMInstrVFP.td
+++ b/llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -798,8 +798,6 @@ def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
                  Requires<[HasFP16]>,
             Sched<[WriteFPCVT]>;
 
-// AddedComplexity to use over the dual-insert MVE pattern
-let AddedComplexity = 6 in
 def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_odd:$lane),
               (v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1), (VCVTTSH SPR:$src2),
                                     (SSubReg_f16_reg imm:$lane)))>;

diff  --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
index 8b27a9348418..303c00170b51 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll
@@ -372,8 +372,6 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h
 ; CHECK-NEXT:    bpl .LBB2_8
 ; CHECK-NEXT:  .LBB2_7: @ %cond.load12
 ; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
-; CHECK-NEXT:    vmovx.f16 s24, s20
-; CHECK-NEXT:    vins.f16 s20, s24
 ; CHECK-NEXT:    vldr.16 s24, [r0, #6]
 ; CHECK-NEXT:    vins.f16 s21, s24
 ; CHECK-NEXT:  .LBB2_8: @ %else13
@@ -418,14 +416,10 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h
 ; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
 ; CHECK-NEXT:    vldr.16 s24, [r0, #2]
 ; CHECK-NEXT:    vins.f16 s20, s24
-; CHECK-NEXT:    vmovx.f16 s24, s21
-; CHECK-NEXT:    vins.f16 s21, s24
 ; CHECK-NEXT:    lsls r4, r2, #29
 ; CHECK-NEXT:    bpl .LBB2_6
 ; CHECK-NEXT:  .LBB2_14: @ %cond.load9
 ; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
-; CHECK-NEXT:    vmovx.f16 s24, s20
-; CHECK-NEXT:    vins.f16 s20, s24
 ; CHECK-NEXT:    vmovx.f16 s24, s21
 ; CHECK-NEXT:    vldr.16 s21, [r0, #4]
 ; CHECK-NEXT:    vins.f16 s21, s24
@@ -441,14 +435,10 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h
 ; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
 ; CHECK-NEXT:    vldr.16 s28, [r1, #2]
 ; CHECK-NEXT:    vins.f16 s24, s28
-; CHECK-NEXT:    vmovx.f16 s28, s25
-; CHECK-NEXT:    vins.f16 s25, s28
 ; CHECK-NEXT:    lsls r4, r2, #29
 ; CHECK-NEXT:    bpl .LBB2_11
 ; CHECK-NEXT:  .LBB2_17: @ %cond.load22
 ; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
-; CHECK-NEXT:    vmovx.f16 s28, s24
-; CHECK-NEXT:    vins.f16 s24, s28
 ; CHECK-NEXT:    vmovx.f16 s28, s25
 ; CHECK-NEXT:    vldr.16 s25, [r1, #4]
 ; CHECK-NEXT:    vins.f16 s25, s28
@@ -456,8 +446,6 @@ define arm_aapcs_vfpcc float @fast_float_half_mac(half* nocapture readonly %b, h
 ; CHECK-NEXT:    bpl.w .LBB2_2
 ; CHECK-NEXT:  .LBB2_18: @ %cond.load25
 ; CHECK-NEXT:    @ in Loop: Header=BB2_3 Depth=1
-; CHECK-NEXT:    vmovx.f16 s28, s24
-; CHECK-NEXT:    vins.f16 s24, s28
 ; CHECK-NEXT:    vldr.16 s28, [r1, #6]
 ; CHECK-NEXT:    vins.f16 s25, s28
 ; CHECK-NEXT:    b .LBB2_2

diff  --git a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll
index aa7b8fb1065d..4fcacf945a5c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll
@@ -806,27 +806,27 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @fdiv_f16(<8 x half> %in1, <8 x half> %in2) {
 ; CHECK-LABEL: fdiv_f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s8, s4
-; CHECK-NEXT:    vmovx.f16 s10, s0
-; CHECK-NEXT:    vdiv.f16 s12, s10, s8
-; CHECK-NEXT:    vdiv.f16 s8, s0, s4
-; CHECK-NEXT:    vins.f16 s8, s12
+; CHECK-NEXT:    vmov q2, q0
+; CHECK-NEXT:    vmovx.f16 s0, s4
+; CHECK-NEXT:    vmovx.f16 s2, s8
+; CHECK-NEXT:    vmovx.f16 s14, s9
+; CHECK-NEXT:    vdiv.f16 s12, s2, s0
+; CHECK-NEXT:    vdiv.f16 s0, s8, s4
+; CHECK-NEXT:    vins.f16 s0, s12
 ; CHECK-NEXT:    vmovx.f16 s12, s5
-; CHECK-NEXT:    vmovx.f16 s14, s1
-; CHECK-NEXT:    vdiv.f16 s9, s1, s5
 ; CHECK-NEXT:    vdiv.f16 s12, s14, s12
-; CHECK-NEXT:    vmovx.f16 s14, s2
-; CHECK-NEXT:    vins.f16 s9, s12
+; CHECK-NEXT:    vdiv.f16 s1, s9, s5
+; CHECK-NEXT:    vins.f16 s1, s12
 ; CHECK-NEXT:    vmovx.f16 s12, s6
+; CHECK-NEXT:    vmovx.f16 s14, s10
+; CHECK-NEXT:    vdiv.f16 s2, s10, s6
 ; CHECK-NEXT:    vdiv.f16 s12, s14, s12
-; CHECK-NEXT:    vdiv.f16 s10, s2, s6
-; CHECK-NEXT:    vins.f16 s10, s12
+; CHECK-NEXT:    vmovx.f16 s14, s11
+; CHECK-NEXT:    vins.f16 s2, s12
 ; CHECK-NEXT:    vmovx.f16 s12, s7
-; CHECK-NEXT:    vmovx.f16 s14, s3
-; CHECK-NEXT:    vdiv.f16 s11, s3, s7
 ; CHECK-NEXT:    vdiv.f16 s12, s14, s12
-; CHECK-NEXT:    vins.f16 s11, s12
-; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vdiv.f16 s3, s11, s7
+; CHECK-NEXT:    vins.f16 s3, s12
 ; CHECK-NEXT:    bx lr
 entry:
   %out = fdiv <8 x half> %in1, %in2

diff  --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
index 82005b9b00a5..eab2090e576e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
@@ -1479,9 +1479,7 @@ define void @arm_biquad_cascade_df2T_f16(%struct.arm_biquad_cascade_df2T_instanc
 ; CHECK-NEXT:    vmov.u16 r3, q2[1]
 ; CHECK-NEXT:    vfma.f16 q2, q6, r3
 ; CHECK-NEXT:    strh r3, [r5, #2]
-; CHECK-NEXT:    vmovx.f16 s6, s9
 ; CHECK-NEXT:    vmov.f32 s8, s9
-; CHECK-NEXT:    vins.f16 s8, s6
 ; CHECK-NEXT:    strh r7, [r5], #4
 ; CHECK-NEXT:    vmov.16 q2[2], r4
 ; CHECK-NEXT:    le lr, .LBB17_5

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
index f416aac644fb..df6fc9412a10 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
@@ -98,19 +98,19 @@ define arm_aapcs_vfpcc <8 x half> @scaled_v8f16_sext(i16* %base, <8 x i16>* %off
 ; CHECK-NEXT:    vldr.16 s8, [r2]
 ; CHECK-NEXT:    vmov r2, s4
 ; CHECK-NEXT:    vldr.16 s0, [r2]
-; CHECK-NEXT:    vmov r2, s7
-; CHECK-NEXT:    vins.f16 s0, s8
-; CHECK-NEXT:    vldr.16 s8, [r2]
+; CHECK-NEXT:    vmov r3, s7
 ; CHECK-NEXT:    vmov r2, s6
-; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
+; CHECK-NEXT:    vldr.16 s4, [r3]
+; CHECK-NEXT:    vins.f16 s0, s8
 ; CHECK-NEXT:    vldr.16 s1, [r2]
+; CHECK-NEXT:    vins.f16 s1, s4
+; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
 ; CHECK-NEXT:    vshl.i32 q1, q1, #1
 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vins.f16 s1, s8
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vldr.16 s8, [r0]
 ; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r1, s5
 ; CHECK-NEXT:    vldr.16 s2, [r0]
+; CHECK-NEXT:    vldr.16 s8, [r1]
 ; CHECK-NEXT:    vmov r0, s7
 ; CHECK-NEXT:    vins.f16 s2, s8
 ; CHECK-NEXT:    vldr.16 s8, [r0]

diff  --git a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
index cbb234176fda..7b9a10571a83 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
@@ -379,17 +379,17 @@ define arm_aapcs_vfpcc <8 x half> @ptr_f16(<8 x half*>* %offptr) {
 ; CHECK-NEXT:    vldr.16 s8, [r1]
 ; CHECK-NEXT:    vmov r1, s4
 ; CHECK-NEXT:    vldr.16 s0, [r1]
-; CHECK-NEXT:    vmov r1, s7
-; CHECK-NEXT:    vins.f16 s0, s8
-; CHECK-NEXT:    vldr.16 s8, [r1]
+; CHECK-NEXT:    vmov r2, s7
 ; CHECK-NEXT:    vmov r1, s6
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vldr.16 s4, [r2]
+; CHECK-NEXT:    vins.f16 s0, s8
 ; CHECK-NEXT:    vldr.16 s1, [r1]
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vins.f16 s1, s8
-; CHECK-NEXT:    vldr.16 s8, [r0]
+; CHECK-NEXT:    vins.f16 s1, s4
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
 ; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r1, s5
 ; CHECK-NEXT:    vldr.16 s2, [r0]
+; CHECK-NEXT:    vldr.16 s8, [r1]
 ; CHECK-NEXT:    vmov r0, s7
 ; CHECK-NEXT:    vins.f16 s2, s8
 ; CHECK-NEXT:    vldr.16 s8, [r0]

diff  --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
index d5f4236a29bc..5a3cbe4bfd72 100644
--- a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
@@ -851,8 +851,6 @@ define void @foo_v4f32_v4f16(<4 x float> *%dest, <4 x i16> *%mask, <4 x half> *%
 ; CHECK-LE-NEXT:    lsls r1, r1, #28
 ; CHECK-LE-NEXT:    bpl .LBB18_5
 ; CHECK-LE-NEXT:  .LBB18_4: @ %cond.load7
-; CHECK-LE-NEXT:    vmovx.f16 s4, s0
-; CHECK-LE-NEXT:    vins.f16 s0, s4
 ; CHECK-LE-NEXT:    vldr.16 s4, [r2, #6]
 ; CHECK-LE-NEXT:    vins.f16 s1, s4
 ; CHECK-LE-NEXT:  .LBB18_5: @ %else8
@@ -899,13 +897,9 @@ define void @foo_v4f32_v4f16(<4 x float> *%dest, <4 x i16> *%mask, <4 x half> *%
 ; CHECK-LE-NEXT:  .LBB18_7: @ %cond.load1
 ; CHECK-LE-NEXT:    vldr.16 s4, [r2, #2]
 ; CHECK-LE-NEXT:    vins.f16 s0, s4
-; CHECK-LE-NEXT:    vmovx.f16 s4, s1
-; CHECK-LE-NEXT:    vins.f16 s1, s4
 ; CHECK-LE-NEXT:    lsls r3, r1, #29
 ; CHECK-LE-NEXT:    bpl .LBB18_3
 ; CHECK-LE-NEXT:  .LBB18_8: @ %cond.load4
-; CHECK-LE-NEXT:    vmovx.f16 s4, s0
-; CHECK-LE-NEXT:    vins.f16 s0, s4
 ; CHECK-LE-NEXT:    vmovx.f16 s4, s1
 ; CHECK-LE-NEXT:    vldr.16 s1, [r2, #4]
 ; CHECK-LE-NEXT:    vins.f16 s1, s4
@@ -948,8 +942,6 @@ define void @foo_v4f32_v4f16(<4 x float> *%dest, <4 x i16> *%mask, <4 x half> *%
 ; CHECK-BE-NEXT:    lsls r1, r1, #31
 ; CHECK-BE-NEXT:    beq .LBB18_5
 ; CHECK-BE-NEXT:  .LBB18_4: @ %cond.load7
-; CHECK-BE-NEXT:    vmovx.f16 s4, s0
-; CHECK-BE-NEXT:    vins.f16 s0, s4
 ; CHECK-BE-NEXT:    vldr.16 s4, [r2, #6]
 ; CHECK-BE-NEXT:    vins.f16 s1, s4
 ; CHECK-BE-NEXT:  .LBB18_5: @ %else8
@@ -996,13 +988,9 @@ define void @foo_v4f32_v4f16(<4 x float> *%dest, <4 x i16> *%mask, <4 x half> *%
 ; CHECK-BE-NEXT:  .LBB18_7: @ %cond.load1
 ; CHECK-BE-NEXT:    vldr.16 s4, [r2, #2]
 ; CHECK-BE-NEXT:    vins.f16 s0, s4
-; CHECK-BE-NEXT:    vmovx.f16 s4, s1
-; CHECK-BE-NEXT:    vins.f16 s1, s4
 ; CHECK-BE-NEXT:    lsls r3, r1, #30
 ; CHECK-BE-NEXT:    bpl .LBB18_3
 ; CHECK-BE-NEXT:  .LBB18_8: @ %cond.load4
-; CHECK-BE-NEXT:    vmovx.f16 s4, s0
-; CHECK-BE-NEXT:    vins.f16 s0, s4
 ; CHECK-BE-NEXT:    vmovx.f16 s4, s1
 ; CHECK-BE-NEXT:    vldr.16 s1, [r2, #4]
 ; CHECK-BE-NEXT:    vins.f16 s1, s4
@@ -1054,8 +1042,6 @@ define void @foo_v4f32_v4f16_unaligned(<4 x float> *%dest, <4 x i16> *%mask, <4
 ; CHECK-LE-NEXT:    lsls r1, r1, #28
 ; CHECK-LE-NEXT:    bpl .LBB19_5
 ; CHECK-LE-NEXT:  .LBB19_4: @ %cond.load7
-; CHECK-LE-NEXT:    vmovx.f16 s4, s0
-; CHECK-LE-NEXT:    vins.f16 s0, s4
 ; CHECK-LE-NEXT:    vldr.16 s4, [r2, #6]
 ; CHECK-LE-NEXT:    vins.f16 s1, s4
 ; CHECK-LE-NEXT:  .LBB19_5: @ %else8
@@ -1102,13 +1088,9 @@ define void @foo_v4f32_v4f16_unaligned(<4 x float> *%dest, <4 x i16> *%mask, <4
 ; CHECK-LE-NEXT:  .LBB19_7: @ %cond.load1
 ; CHECK-LE-NEXT:    vldr.16 s4, [r2, #2]
 ; CHECK-LE-NEXT:    vins.f16 s0, s4
-; CHECK-LE-NEXT:    vmovx.f16 s4, s1
-; CHECK-LE-NEXT:    vins.f16 s1, s4
 ; CHECK-LE-NEXT:    lsls r3, r1, #29
 ; CHECK-LE-NEXT:    bpl .LBB19_3
 ; CHECK-LE-NEXT:  .LBB19_8: @ %cond.load4
-; CHECK-LE-NEXT:    vmovx.f16 s4, s0
-; CHECK-LE-NEXT:    vins.f16 s0, s4
 ; CHECK-LE-NEXT:    vmovx.f16 s4, s1
 ; CHECK-LE-NEXT:    vldr.16 s1, [r2, #4]
 ; CHECK-LE-NEXT:    vins.f16 s1, s4
@@ -1151,8 +1133,6 @@ define void @foo_v4f32_v4f16_unaligned(<4 x float> *%dest, <4 x i16> *%mask, <4
 ; CHECK-BE-NEXT:    lsls r1, r1, #31
 ; CHECK-BE-NEXT:    beq .LBB19_5
 ; CHECK-BE-NEXT:  .LBB19_4: @ %cond.load7
-; CHECK-BE-NEXT:    vmovx.f16 s4, s0
-; CHECK-BE-NEXT:    vins.f16 s0, s4
 ; CHECK-BE-NEXT:    vldr.16 s4, [r2, #6]
 ; CHECK-BE-NEXT:    vins.f16 s1, s4
 ; CHECK-BE-NEXT:  .LBB19_5: @ %else8
@@ -1199,13 +1179,9 @@ define void @foo_v4f32_v4f16_unaligned(<4 x float> *%dest, <4 x i16> *%mask, <4
 ; CHECK-BE-NEXT:  .LBB19_7: @ %cond.load1
 ; CHECK-BE-NEXT:    vldr.16 s4, [r2, #2]
 ; CHECK-BE-NEXT:    vins.f16 s0, s4
-; CHECK-BE-NEXT:    vmovx.f16 s4, s1
-; CHECK-BE-NEXT:    vins.f16 s1, s4
 ; CHECK-BE-NEXT:    lsls r3, r1, #30
 ; CHECK-BE-NEXT:    bpl .LBB19_3
 ; CHECK-BE-NEXT:  .LBB19_8: @ %cond.load4
-; CHECK-BE-NEXT:    vmovx.f16 s4, s0
-; CHECK-BE-NEXT:    vins.f16 s0, s4
 ; CHECK-BE-NEXT:    vmovx.f16 s4, s1
 ; CHECK-BE-NEXT:    vldr.16 s1, [r2, #4]
 ; CHECK-BE-NEXT:    vins.f16 s1, s4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-minmax.ll b/llvm/test/CodeGen/Thumb2/mve-minmax.ll
index aab2f3ea729e..1b45d27317ac 100644
--- a/llvm/test/CodeGen/Thumb2/mve-minmax.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-minmax.ll
@@ -315,27 +315,27 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @minnm_float16_t(<8 x half> %src1, <8 x half> %src2) {
 ; CHECK-MVE-LABEL: minnm_float16_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s0
-; CHECK-MVE-NEXT:    vmovx.f16 s10, s4
-; CHECK-MVE-NEXT:    vminnm.f16 s12, s10, s8
-; CHECK-MVE-NEXT:    vminnm.f16 s8, s4, s0
-; CHECK-MVE-NEXT:    vins.f16 s8, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s1
+; CHECK-MVE-NEXT:    vmov q2, q0
+; CHECK-MVE-NEXT:    vmovx.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s0, s8
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s5
-; CHECK-MVE-NEXT:    vminnm.f16 s9, s5, s1
+; CHECK-MVE-NEXT:    vminnm.f16 s12, s2, s0
+; CHECK-MVE-NEXT:    vminnm.f16 s0, s4, s8
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s9
 ; CHECK-MVE-NEXT:    vminnm.f16 s12, s14, s12
+; CHECK-MVE-NEXT:    vminnm.f16 s1, s5, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s10
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s6
-; CHECK-MVE-NEXT:    vins.f16 s9, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s2
+; CHECK-MVE-NEXT:    vminnm.f16 s2, s6, s10
 ; CHECK-MVE-NEXT:    vminnm.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vminnm.f16 s10, s6, s2
-; CHECK-MVE-NEXT:    vins.f16 s10, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s3
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s7
-; CHECK-MVE-NEXT:    vminnm.f16 s11, s7, s3
+; CHECK-MVE-NEXT:    vins.f16 s2, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s11
 ; CHECK-MVE-NEXT:    vminnm.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vins.f16 s11, s12
-; CHECK-MVE-NEXT:    vmov q0, q2
+; CHECK-MVE-NEXT:    vminnm.f16 s3, s7, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s12
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: minnm_float16_t:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
index 9dfc9e528eb8..128cfd0c2e2b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
@@ -201,15 +201,15 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @shuffle1_i16(<8 x i16> %src) {
 ; CHECK-LABEL: shuffle1_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s4, s3
-; CHECK-NEXT:    vins.f16 s4, s3
-; CHECK-NEXT:    vmovx.f16 s5, s2
-; CHECK-NEXT:    vins.f16 s5, s2
-; CHECK-NEXT:    vmovx.f16 s6, s1
-; CHECK-NEXT:    vins.f16 s6, s1
-; CHECK-NEXT:    vmovx.f16 s7, s0
-; CHECK-NEXT:    vins.f16 s7, s0
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vmovx.f16 s0, s7
+; CHECK-NEXT:    vins.f16 s0, s7
+; CHECK-NEXT:    vmovx.f16 s1, s6
+; CHECK-NEXT:    vins.f16 s1, s6
+; CHECK-NEXT:    vmovx.f16 s2, s5
+; CHECK-NEXT:    vins.f16 s2, s5
+; CHECK-NEXT:    vmovx.f16 s3, s4
+; CHECK-NEXT:    vins.f16 s3, s4
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -228,16 +228,15 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @shuffle3_i16(<8 x i16> %src) {
 ; CHECK-LABEL: shuffle3_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s5, s3
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vins.f16 s5, s3
-; CHECK-NEXT:    vmov.16 q1[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vins.f16 s1, s0
-; CHECK-NEXT:    vmov.f32 s7, s1
-; CHECK-NEXT:    vmov.f32 s4, s2
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vmovx.f16 s1, s7
+; CHECK-NEXT:    vmovx.f16 s8, s4
+; CHECK-NEXT:    vins.f16 s1, s7
+; CHECK-NEXT:    vmovx.f16 s2, s5
+; CHECK-NEXT:    vins.f16 s5, s4
+; CHECK-NEXT:    vins.f16 s2, s8
+; CHECK-NEXT:    vmov.f32 s3, s5
+; CHECK-NEXT:    vmov.f32 s0, s6
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 7, i32 6, i32 3, i32 1, i32 2, i32 0>
@@ -325,31 +324,30 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @shuffle2step_i16(<16 x i16> %src) {
 ; CHECK-LABEL: shuffle2step_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.f32 s8, s0
-; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vins.f16 s8, s1
-; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.f32 s9, s2
-; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vins.f16 s9, s3
-; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.f32 s10, s4
-; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vins.f16 s10, s5
-; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.f32 s11, s6
-; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vins.f16 s11, s7
-; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vadd.i16 q0, q2, q3
+; CHECK-NEXT:    .vsave {d8}
+; CHECK-NEXT:    vpush {d8}
+; CHECK-NEXT:    vmovx.f16 s8, s0
+; CHECK-NEXT:    vins.f16 s0, s1
+; CHECK-NEXT:    vmovx.f16 s9, s2
+; CHECK-NEXT:    vins.f16 s2, s3
+; CHECK-NEXT:    vmovx.f16 s10, s4
+; CHECK-NEXT:    vmovx.f16 s16, s1
+; CHECK-NEXT:    vmov q3, q0
+; CHECK-NEXT:    vins.f16 s8, s16
+; CHECK-NEXT:    vmovx.f16 s0, s3
+; CHECK-NEXT:    vins.f16 s4, s5
+; CHECK-NEXT:    vins.f16 s9, s0
+; CHECK-NEXT:    vmovx.f16 s0, s5
+; CHECK-NEXT:    vins.f16 s10, s0
+; CHECK-NEXT:    vmov.f32 s13, s2
+; CHECK-NEXT:    vmov.f32 s14, s4
+; CHECK-NEXT:    vmovx.f16 s0, s7
+; CHECK-NEXT:    vmovx.f16 s11, s6
+; CHECK-NEXT:    vins.f16 s6, s7
+; CHECK-NEXT:    vins.f16 s11, s0
+; CHECK-NEXT:    vmov.f32 s15, s6
+; CHECK-NEXT:    vadd.i16 q0, q3, q2
+; CHECK-NEXT:    vpop {d8}
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = shufflevector <16 x i16> %src, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -361,59 +359,51 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @shuffle3step_i16(<32 x i16> %src) {
 ; CHECK-LABEL: shuffle3step_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
-; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[2]
-; CHECK-NEXT:    vmov.16 q4[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[5]
-; CHECK-NEXT:    vmov.16 q4[7], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[4]
-; CHECK-NEXT:    vmov.16 q5[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmov.f32 s15, s19
-; CHECK-NEXT:    vmov.16 q4[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.16 q4[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.16 q4[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.16 q4[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[7]
-; CHECK-NEXT:    vmov.16 q5[7], r0
-; CHECK-NEXT:    vmov.f32 s18, s7
-; CHECK-NEXT:    vmov.f32 s22, s8
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vmovx.f16 s16, s2
+; CHECK-NEXT:    vmov.f32 s12, s1
+; CHECK-NEXT:    vins.f16 s12, s16
+; CHECK-NEXT:    vmovx.f16 s16, s5
+; CHECK-NEXT:    vmov.f32 s13, s4
+; CHECK-NEXT:    vmovx.f16 s20, s11
+; CHECK-NEXT:    vins.f16 s13, s16
+; CHECK-NEXT:    vmov.f32 s19, s10
+; CHECK-NEXT:    vins.f16 s19, s20
+; CHECK-NEXT:    vmov.f32 s14, s7
+; CHECK-NEXT:    vmov.f32 s18, s8
 ; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov q6, q5
-; CHECK-NEXT:    vmovnb.i32 q6, q4
-; CHECK-NEXT:    vmov.f32 s18, s26
-; CHECK-NEXT:    vmov.f32 s19, s23
-; CHECK-NEXT:    vins.f16 s22, s8
-; CHECK-NEXT:    vmovx.f16 s23, s9
-; CHECK-NEXT:    vins.f16 s23, s11
-; CHECK-NEXT:    vmovx.f16 s8, s0
-; CHECK-NEXT:    vins.f16 s8, s2
-; CHECK-NEXT:    vmovx.f16 s9, s3
-; CHECK-NEXT:    vmov q0, q5
-; CHECK-NEXT:    vins.f16 s9, s5
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmovnb.i32 q0, q2
-; CHECK-NEXT:    vmov.f32 s10, s2
-; CHECK-NEXT:    vmov.f32 s11, s23
-; CHECK-NEXT:    vadd.i16 q0, q3, q2
-; CHECK-NEXT:    vadd.i16 q0, q0, q4
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vmov q5, q4
+; CHECK-NEXT:    vmovnb.i32 q5, q3
+; CHECK-NEXT:    vmov.f32 s14, s22
+; CHECK-NEXT:    vmovx.f16 s20, s1
+; CHECK-NEXT:    vmov.f32 s15, s19
+; CHECK-NEXT:    vmov.f32 s16, s0
+; CHECK-NEXT:    vins.f16 s16, s20
+; CHECK-NEXT:    vmovx.f16 s20, s4
+; CHECK-NEXT:    vmov.f32 s17, s3
+; CHECK-NEXT:    vins.f16 s17, s20
+; CHECK-NEXT:    vmovx.f16 s20, s7
+; CHECK-NEXT:    vmov.f32 s18, s6
+; CHECK-NEXT:    vins.f16 s18, s20
+; CHECK-NEXT:    vmovx.f16 s20, s10
+; CHECK-NEXT:    vmov.f32 s19, s9
+; CHECK-NEXT:    vins.f16 s19, s20
+; CHECK-NEXT:    vmovx.f16 s20, s0
+; CHECK-NEXT:    vins.f16 s20, s2
+; CHECK-NEXT:    vmovx.f16 s21, s3
+; CHECK-NEXT:    vins.f16 s2, s8
+; CHECK-NEXT:    vmovx.f16 s3, s9
+; CHECK-NEXT:    vins.f16 s21, s5
+; CHECK-NEXT:    vins.f16 s3, s11
+; CHECK-NEXT:    vmov.16 q5[4], r0
+; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vmovnb.i32 q1, q5
+; CHECK-NEXT:    vmov.f32 s22, s6
+; CHECK-NEXT:    vmov.f32 s23, s3
+; CHECK-NEXT:    vadd.i16 q0, q4, q5
+; CHECK-NEXT:    vadd.i16 q0, q0, q3
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
@@ -427,64 +417,53 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @shuffle4step_i16(<32 x i16> %src) {
 ; CHECK-LABEL: shuffle4step_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12}
+; CHECK-NEXT:    vmovx.f16 s20, s11
+; CHECK-NEXT:    vmovx.f16 s18, s9
+; CHECK-NEXT:    vins.f16 s18, s20
+; CHECK-NEXT:    vmovx.f16 s20, s15
+; CHECK-NEXT:    vmovx.f16 s19, s13
+; CHECK-NEXT:    vins.f16 s9, s11
+; CHECK-NEXT:    vins.f16 s19, s20
+; CHECK-NEXT:    vmovx.f16 s20, s3
+; CHECK-NEXT:    vmovx.f16 s16, s1
+; CHECK-NEXT:    vins.f16 s13, s15
+; CHECK-NEXT:    vins.f16 s16, s20
+; CHECK-NEXT:    vmovx.f16 s20, s7
+; CHECK-NEXT:    vmovx.f16 s17, s5
+; CHECK-NEXT:    vins.f16 s1, s3
+; CHECK-NEXT:    vins.f16 s17, s20
 ; CHECK-NEXT:    vmov.f32 s22, s9
-; CHECK-NEXT:    vmov.16 q6[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.16 q6[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.16 q6[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[3]
-; CHECK-NEXT:    vins.f16 s22, s11
-; CHECK-NEXT:    vmov.16 q7[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[7]
 ; CHECK-NEXT:    vmov.f32 s23, s13
-; CHECK-NEXT:    vmov.16 q7[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[3]
-; CHECK-NEXT:    vins.f16 s23, s15
-; CHECK-NEXT:    vmov.16 q7[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[7]
+; CHECK-NEXT:    vins.f16 s5, s7
 ; CHECK-NEXT:    vmov.f32 s20, s1
-; CHECK-NEXT:    vmov.16 q7[7], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.16 q6[3], r0
-; CHECK-NEXT:    vins.f16 s20, s3
-; CHECK-NEXT:    vmov.f32 s18, s8
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmovx.f16 s24, s10
 ; CHECK-NEXT:    vmov.f32 s21, s5
-; CHECK-NEXT:    vins.f16 s18, s10
-; CHECK-NEXT:    vmov.f32 s26, s30
-; CHECK-NEXT:    vins.f16 s21, s7
-; CHECK-NEXT:    vmov.f32 s27, s31
-; CHECK-NEXT:    vmov.f32 s19, s12
-; CHECK-NEXT:    vadd.i16 q5, q5, q6
-; CHECK-NEXT:    vmov.16 q6[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vins.f16 s19, s14
-; CHECK-NEXT:    vmov.16 q6[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.f32 s16, s0
-; CHECK-NEXT:    vmov.16 q6[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[1]
-; CHECK-NEXT:    vins.f16 s16, s2
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[5]
-; CHECK-NEXT:    vmov.f32 s17, s4
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[1]
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[5]
-; CHECK-NEXT:    vmov.16 q0[7], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.16 q6[3], r0
-; CHECK-NEXT:    vins.f16 s17, s6
-; CHECK-NEXT:    vmov.f32 s26, s2
-; CHECK-NEXT:    vmov.f32 s27, s3
-; CHECK-NEXT:    vadd.i16 q0, q4, q6
+; CHECK-NEXT:    vadd.i16 q4, q5, q4
+; CHECK-NEXT:    vmovx.f16 s22, s8
+; CHECK-NEXT:    vins.f16 s22, s24
+; CHECK-NEXT:    vins.f16 s8, s10
+; CHECK-NEXT:    vmovx.f16 s24, s14
+; CHECK-NEXT:    vmovx.f16 s23, s12
+; CHECK-NEXT:    vins.f16 s12, s14
+; CHECK-NEXT:    vmov.f32 s10, s8
+; CHECK-NEXT:    vmov.f32 s11, s12
+; CHECK-NEXT:    vins.f16 s23, s24
+; CHECK-NEXT:    vmovx.f16 s24, s2
+; CHECK-NEXT:    vmovx.f16 s20, s0
+; CHECK-NEXT:    vins.f16 s20, s24
+; CHECK-NEXT:    vmovx.f16 s24, s6
+; CHECK-NEXT:    vmovx.f16 s21, s4
+; CHECK-NEXT:    vins.f16 s0, s2
+; CHECK-NEXT:    vins.f16 s4, s6
+; CHECK-NEXT:    vins.f16 s21, s24
+; CHECK-NEXT:    vmov.f32 s1, s4
+; CHECK-NEXT:    vmov.f32 s2, s10
+; CHECK-NEXT:    vmov.f32 s3, s11
 ; CHECK-NEXT:    vadd.i16 q0, q0, q5
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vadd.i16 q0, q0, q4
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12}
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = shufflevector <32 x i16> %src, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
@@ -1364,30 +1343,26 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @shuffle2step_f16(<16 x half> %src) {
 ; CHECKFP-LABEL: shuffle2step_f16:
 ; CHECKFP:       @ %bb.0: @ %entry
-; CHECKFP-NEXT:    .vsave {d8}
-; CHECKFP-NEXT:    vpush {d8}
-; CHECKFP-NEXT:    vmovx.f16 s16, s1
-; CHECKFP-NEXT:    vmovx.f16 s12, s0
-; CHECKFP-NEXT:    vmov.f32 s8, s0
-; CHECKFP-NEXT:    vins.f16 s12, s16
-; CHECKFP-NEXT:    vins.f16 s8, s1
-; CHECKFP-NEXT:    vmovx.f16 s13, s2
-; CHECKFP-NEXT:    vmovx.f16 s16, s3
-; CHECKFP-NEXT:    vmov.f32 s9, s2
-; CHECKFP-NEXT:    vins.f16 s13, s16
-; CHECKFP-NEXT:    vins.f16 s9, s3
-; CHECKFP-NEXT:    vmovx.f16 s0, s5
-; CHECKFP-NEXT:    vmovx.f16 s14, s4
-; CHECKFP-NEXT:    vmov.f32 s10, s4
-; CHECKFP-NEXT:    vins.f16 s14, s0
-; CHECKFP-NEXT:    vins.f16 s10, s5
-; CHECKFP-NEXT:    vmovx.f16 s0, s7
-; CHECKFP-NEXT:    vmovx.f16 s15, s6
+; CHECKFP-NEXT:    vmovx.f16 s12, s1
+; CHECKFP-NEXT:    vmovx.f16 s8, s0
+; CHECKFP-NEXT:    vins.f16 s8, s12
+; CHECKFP-NEXT:    vmovx.f16 s12, s3
+; CHECKFP-NEXT:    vmovx.f16 s9, s2
+; CHECKFP-NEXT:    vins.f16 s0, s1
+; CHECKFP-NEXT:    vins.f16 s9, s12
+; CHECKFP-NEXT:    vins.f16 s2, s3
+; CHECKFP-NEXT:    vmovx.f16 s12, s5
+; CHECKFP-NEXT:    vmovx.f16 s10, s4
+; CHECKFP-NEXT:    vins.f16 s10, s12
+; CHECKFP-NEXT:    vins.f16 s4, s5
+; CHECKFP-NEXT:    vmov.f32 s1, s2
+; CHECKFP-NEXT:    vmovx.f16 s12, s7
+; CHECKFP-NEXT:    vmovx.f16 s11, s6
 ; CHECKFP-NEXT:    vins.f16 s6, s7
-; CHECKFP-NEXT:    vins.f16 s15, s0
-; CHECKFP-NEXT:    vmov.f32 s11, s6
-; CHECKFP-NEXT:    vadd.f16 q0, q2, q3
-; CHECKFP-NEXT:    vpop {d8}
+; CHECKFP-NEXT:    vmov.f32 s2, s4
+; CHECKFP-NEXT:    vins.f16 s11, s12
+; CHECKFP-NEXT:    vmov.f32 s3, s6
+; CHECKFP-NEXT:    vadd.f16 q0, q0, q2
 ; CHECKFP-NEXT:    bx lr
 entry:
   %s1 = shufflevector <16 x half> %src, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -1399,8 +1374,8 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @shuffle3step_f16(<32 x half> %src) {
 ; CHECKFP-LABEL: shuffle3step_f16:
 ; CHECKFP:       @ %bb.0: @ %entry
-; CHECKFP-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECKFP-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECKFP-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECKFP-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECKFP-NEXT:    vmovx.f16 s16, s2
 ; CHECKFP-NEXT:    vmov.f32 s12, s1
 ; CHECKFP-NEXT:    vins.f16 s12, s16
@@ -1412,32 +1387,32 @@ define arm_aapcs_vfpcc <8 x half> @shuffle3step_f16(<32 x half> %src) {
 ; CHECKFP-NEXT:    vins.f16 s19, s20
 ; CHECKFP-NEXT:    vmov.f32 s14, s7
 ; CHECKFP-NEXT:    vmovx.f16 s20, s8
-; CHECKFP-NEXT:    vmovx.f16 s24, s1
+; CHECKFP-NEXT:    vmov.f32 s28, s6
 ; CHECKFP-NEXT:    vins.f16 s14, s20
-; CHECKFP-NEXT:    vmov.f32 s20, s0
-; CHECKFP-NEXT:    vins.f16 s20, s24
-; CHECKFP-NEXT:    vmovx.f16 s24, s4
-; CHECKFP-NEXT:    vmov.f32 s21, s3
-; CHECKFP-NEXT:    vins.f16 s21, s24
-; CHECKFP-NEXT:    vmovx.f16 s24, s7
-; CHECKFP-NEXT:    vmov.f32 s22, s6
-; CHECKFP-NEXT:    vins.f16 s22, s24
-; CHECKFP-NEXT:    vmovx.f16 s24, s0
-; CHECKFP-NEXT:    vins.f16 s24, s2
+; CHECKFP-NEXT:    vmovx.f16 s20, s7
+; CHECKFP-NEXT:    vins.f16 s28, s20
+; CHECKFP-NEXT:    vmovx.f16 s24, s1
+; CHECKFP-NEXT:    vmovx.f16 s20, s0
+; CHECKFP-NEXT:    vins.f16 s0, s24
+; CHECKFP-NEXT:    vins.f16 s20, s2
+; CHECKFP-NEXT:    vmovx.f16 s26, s4
+; CHECKFP-NEXT:    vmovx.f16 s21, s3
+; CHECKFP-NEXT:    vins.f16 s3, s26
+; CHECKFP-NEXT:    vins.f16 s21, s5
+; CHECKFP-NEXT:    vmovx.f16 s30, s10
+; CHECKFP-NEXT:    vmovx.f16 s23, s9
 ; CHECKFP-NEXT:    vmov.f32 s18, s8
-; CHECKFP-NEXT:    vmovx.f16 s25, s3
-; CHECKFP-NEXT:    vmovx.f16 s0, s10
-; CHECKFP-NEXT:    vins.f16 s25, s5
+; CHECKFP-NEXT:    vmov.f32 s1, s3
+; CHECKFP-NEXT:    vins.f16 s9, s30
+; CHECKFP-NEXT:    vins.f16 s23, s11
+; CHECKFP-NEXT:    vmov.f32 s2, s28
+; CHECKFP-NEXT:    vmovx.f16 s22, s6
+; CHECKFP-NEXT:    vmov.f32 s3, s9
+; CHECKFP-NEXT:    vins.f16 s22, s8
 ; CHECKFP-NEXT:    vmov.f32 s15, s19
-; CHECKFP-NEXT:    vmovx.f16 s27, s9
-; CHECKFP-NEXT:    vins.f16 s9, s0
-; CHECKFP-NEXT:    vins.f16 s27, s11
-; CHECKFP-NEXT:    vmov.f32 s23, s9
-; CHECKFP-NEXT:    vmovx.f16 s26, s6
-; CHECKFP-NEXT:    vins.f16 s26, s8
-; CHECKFP-NEXT:    vadd.f16 q0, q5, q6
+; CHECKFP-NEXT:    vadd.f16 q0, q0, q5
 ; CHECKFP-NEXT:    vadd.f16 q0, q0, q3
-; CHECKFP-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECKFP-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECKFP-NEXT:    bx lr
 entry:
   %s1 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
@@ -1451,8 +1426,8 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @shuffle4step_f16(<32 x half> %src) {
 ; CHECKFP-LABEL: shuffle4step_f16:
 ; CHECKFP:       @ %bb.0: @ %entry
-; CHECKFP-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14}
-; CHECKFP-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14}
+; CHECKFP-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECKFP-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
 ; CHECKFP-NEXT:    vmovx.f16 s20, s11
 ; CHECKFP-NEXT:    vmovx.f16 s18, s9
 ; CHECKFP-NEXT:    vins.f16 s18, s20
@@ -1462,42 +1437,40 @@ define arm_aapcs_vfpcc <8 x half> @shuffle4step_f16(<32 x half> %src) {
 ; CHECKFP-NEXT:    vins.f16 s19, s20
 ; CHECKFP-NEXT:    vmovx.f16 s20, s3
 ; CHECKFP-NEXT:    vmovx.f16 s16, s1
-; CHECKFP-NEXT:    vmovx.f16 s28, s10
+; CHECKFP-NEXT:    vmovx.f16 s24, s10
 ; CHECKFP-NEXT:    vins.f16 s16, s20
-; CHECKFP-NEXT:    vmovx.f16 s26, s8
 ; CHECKFP-NEXT:    vmovx.f16 s20, s7
 ; CHECKFP-NEXT:    vmovx.f16 s17, s5
-; CHECKFP-NEXT:    vins.f16 s17, s20
-; CHECKFP-NEXT:    vmov.f32 s22, s9
-; CHECKFP-NEXT:    vins.f16 s8, s10
 ; CHECKFP-NEXT:    vins.f16 s13, s15
-; CHECKFP-NEXT:    vins.f16 s26, s28
-; CHECKFP-NEXT:    vmov.f32 s23, s13
-; CHECKFP-NEXT:    vmovx.f16 s28, s14
-; CHECKFP-NEXT:    vmovx.f16 s27, s12
-; CHECKFP-NEXT:    vmov.f32 s10, s8
-; CHECKFP-NEXT:    vins.f16 s12, s14
-; CHECKFP-NEXT:    vmov.f32 s11, s12
-; CHECKFP-NEXT:    vins.f16 s27, s28
+; CHECKFP-NEXT:    vins.f16 s17, s20
+; CHECKFP-NEXT:    vmovx.f16 s22, s8
+; CHECKFP-NEXT:    vins.f16 s22, s24
+; CHECKFP-NEXT:    vmovx.f16 s24, s14
+; CHECKFP-NEXT:    vmovx.f16 s23, s12
 ; CHECKFP-NEXT:    vins.f16 s1, s3
-; CHECKFP-NEXT:    vmovx.f16 s28, s2
-; CHECKFP-NEXT:    vmovx.f16 s24, s0
-; CHECKFP-NEXT:    vmov.f32 s20, s1
+; CHECKFP-NEXT:    vins.f16 s23, s24
+; CHECKFP-NEXT:    vmovx.f16 s24, s2
+; CHECKFP-NEXT:    vmovx.f16 s20, s0
 ; CHECKFP-NEXT:    vins.f16 s5, s7
-; CHECKFP-NEXT:    vins.f16 s24, s28
-; CHECKFP-NEXT:    vmov.f32 s21, s5
-; CHECKFP-NEXT:    vmovx.f16 s28, s6
-; CHECKFP-NEXT:    vmovx.f16 s25, s4
+; CHECKFP-NEXT:    vins.f16 s20, s24
+; CHECKFP-NEXT:    vmovx.f16 s24, s6
+; CHECKFP-NEXT:    vmovx.f16 s21, s4
+; CHECKFP-NEXT:    vins.f16 s8, s10
+; CHECKFP-NEXT:    vins.f16 s21, s24
+; CHECKFP-NEXT:    vmov.f32 s26, s9
+; CHECKFP-NEXT:    vins.f16 s12, s14
 ; CHECKFP-NEXT:    vins.f16 s0, s2
+; CHECKFP-NEXT:    vmov.f32 s27, s13
 ; CHECKFP-NEXT:    vins.f16 s4, s6
-; CHECKFP-NEXT:    vins.f16 s25, s28
+; CHECKFP-NEXT:    vmov.f32 s24, s1
+; CHECKFP-NEXT:    vmov.f32 s2, s8
+; CHECKFP-NEXT:    vmov.f32 s3, s12
 ; CHECKFP-NEXT:    vmov.f32 s1, s4
-; CHECKFP-NEXT:    vadd.f16 q1, q5, q4
-; CHECKFP-NEXT:    vmov.f32 s2, s10
-; CHECKFP-NEXT:    vmov.f32 s3, s11
-; CHECKFP-NEXT:    vadd.f16 q0, q0, q6
-; CHECKFP-NEXT:    vadd.f16 q0, q0, q1
-; CHECKFP-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14}
+; CHECKFP-NEXT:    vmov.f32 s25, s5
+; CHECKFP-NEXT:    vadd.f16 q0, q0, q5
+; CHECKFP-NEXT:    vadd.f16 q4, q6, q4
+; CHECKFP-NEXT:    vadd.f16 q0, q0, q4
+; CHECKFP-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECKFP-NEXT:    bx lr
 entry:
   %s1 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll b/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll
index ea079811c4bf..ce08e69b6816 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll
@@ -35,15 +35,15 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_76543210(<8 x i16> %s1, <8 x i16> %s2) {
 ; CHECK-LABEL: shuffle_i16_76543210:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s4, s3
-; CHECK-NEXT:    vins.f16 s4, s3
-; CHECK-NEXT:    vmovx.f16 s5, s2
-; CHECK-NEXT:    vins.f16 s5, s2
-; CHECK-NEXT:    vmovx.f16 s6, s1
-; CHECK-NEXT:    vins.f16 s6, s1
-; CHECK-NEXT:    vmovx.f16 s7, s0
-; CHECK-NEXT:    vins.f16 s7, s0
-; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vmovx.f16 s0, s7
+; CHECK-NEXT:    vins.f16 s0, s7
+; CHECK-NEXT:    vmovx.f16 s1, s6
+; CHECK-NEXT:    vins.f16 s1, s6
+; CHECK-NEXT:    vmovx.f16 s2, s5
+; CHECK-NEXT:    vins.f16 s2, s5
+; CHECK-NEXT:    vmovx.f16 s3, s4
+; CHECK-NEXT:    vins.f16 s3, s4
 ; CHECK-NEXT:    bx lr
 entry:
   %out = shufflevector <8 x i16> %s1, <8 x i16> %s2, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>

diff  --git a/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll b/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
index 7e985eb3b205..484a431a1ae1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll
@@ -79,27 +79,27 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @add_float16_t(<8 x half> %src1, <8 x half> %src2) {
 ; CHECK-MVE-LABEL: add_float16_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s0
-; CHECK-MVE-NEXT:    vmovx.f16 s10, s4
-; CHECK-MVE-NEXT:    vadd.f16 s12, s10, s8
-; CHECK-MVE-NEXT:    vadd.f16 s8, s4, s0
-; CHECK-MVE-NEXT:    vins.f16 s8, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s1
+; CHECK-MVE-NEXT:    vmov q2, q0
+; CHECK-MVE-NEXT:    vmovx.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s0, s8
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s5
-; CHECK-MVE-NEXT:    vadd.f16 s9, s5, s1
+; CHECK-MVE-NEXT:    vadd.f16 s12, s2, s0
+; CHECK-MVE-NEXT:    vadd.f16 s0, s4, s8
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s9
 ; CHECK-MVE-NEXT:    vadd.f16 s12, s14, s12
+; CHECK-MVE-NEXT:    vadd.f16 s1, s5, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s10
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s6
-; CHECK-MVE-NEXT:    vins.f16 s9, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s2
+; CHECK-MVE-NEXT:    vadd.f16 s2, s6, s10
 ; CHECK-MVE-NEXT:    vadd.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vadd.f16 s10, s6, s2
-; CHECK-MVE-NEXT:    vins.f16 s10, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s3
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s7
-; CHECK-MVE-NEXT:    vadd.f16 s11, s7, s3
+; CHECK-MVE-NEXT:    vins.f16 s2, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s11
 ; CHECK-MVE-NEXT:    vadd.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vins.f16 s11, s12
-; CHECK-MVE-NEXT:    vmov q0, q2
+; CHECK-MVE-NEXT:    vadd.f16 s3, s7, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s12
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: add_float16_t:
@@ -216,27 +216,27 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @sub_float16_t(<8 x half> %src1, <8 x half> %src2) {
 ; CHECK-MVE-LABEL: sub_float16_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s0
-; CHECK-MVE-NEXT:    vmovx.f16 s10, s4
-; CHECK-MVE-NEXT:    vsub.f16 s12, s10, s8
-; CHECK-MVE-NEXT:    vsub.f16 s8, s4, s0
-; CHECK-MVE-NEXT:    vins.f16 s8, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s1
+; CHECK-MVE-NEXT:    vmov q2, q0
+; CHECK-MVE-NEXT:    vmovx.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s0, s8
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s5
-; CHECK-MVE-NEXT:    vsub.f16 s9, s5, s1
+; CHECK-MVE-NEXT:    vsub.f16 s12, s2, s0
+; CHECK-MVE-NEXT:    vsub.f16 s0, s4, s8
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s9
 ; CHECK-MVE-NEXT:    vsub.f16 s12, s14, s12
+; CHECK-MVE-NEXT:    vsub.f16 s1, s5, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s10
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s6
-; CHECK-MVE-NEXT:    vins.f16 s9, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s2
+; CHECK-MVE-NEXT:    vsub.f16 s2, s6, s10
 ; CHECK-MVE-NEXT:    vsub.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vsub.f16 s10, s6, s2
-; CHECK-MVE-NEXT:    vins.f16 s10, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s3
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s7
-; CHECK-MVE-NEXT:    vsub.f16 s11, s7, s3
+; CHECK-MVE-NEXT:    vins.f16 s2, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s11
 ; CHECK-MVE-NEXT:    vsub.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vins.f16 s11, s12
-; CHECK-MVE-NEXT:    vmov q0, q2
+; CHECK-MVE-NEXT:    vsub.f16 s3, s7, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s12
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: sub_float16_t:
@@ -336,27 +336,27 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @mul_float16_t(<8 x half> %src1, <8 x half> %src2) {
 ; CHECK-MVE-LABEL: mul_float16_t:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    vmovx.f16 s8, s0
-; CHECK-MVE-NEXT:    vmovx.f16 s10, s4
-; CHECK-MVE-NEXT:    vmul.f16 s12, s10, s8
-; CHECK-MVE-NEXT:    vmul.f16 s8, s4, s0
-; CHECK-MVE-NEXT:    vins.f16 s8, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s1
+; CHECK-MVE-NEXT:    vmov q2, q0
+; CHECK-MVE-NEXT:    vmovx.f16 s2, s4
+; CHECK-MVE-NEXT:    vmovx.f16 s0, s8
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s5
-; CHECK-MVE-NEXT:    vmul.f16 s9, s5, s1
+; CHECK-MVE-NEXT:    vmul.f16 s12, s2, s0
+; CHECK-MVE-NEXT:    vmul.f16 s0, s4, s8
+; CHECK-MVE-NEXT:    vins.f16 s0, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s9
 ; CHECK-MVE-NEXT:    vmul.f16 s12, s14, s12
+; CHECK-MVE-NEXT:    vmul.f16 s1, s5, s9
+; CHECK-MVE-NEXT:    vins.f16 s1, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s10
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s6
-; CHECK-MVE-NEXT:    vins.f16 s9, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s2
+; CHECK-MVE-NEXT:    vmul.f16 s2, s6, s10
 ; CHECK-MVE-NEXT:    vmul.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vmul.f16 s10, s6, s2
-; CHECK-MVE-NEXT:    vins.f16 s10, s12
-; CHECK-MVE-NEXT:    vmovx.f16 s12, s3
 ; CHECK-MVE-NEXT:    vmovx.f16 s14, s7
-; CHECK-MVE-NEXT:    vmul.f16 s11, s7, s3
+; CHECK-MVE-NEXT:    vins.f16 s2, s12
+; CHECK-MVE-NEXT:    vmovx.f16 s12, s11
 ; CHECK-MVE-NEXT:    vmul.f16 s12, s14, s12
-; CHECK-MVE-NEXT:    vins.f16 s11, s12
-; CHECK-MVE-NEXT:    vmov q0, q2
+; CHECK-MVE-NEXT:    vmul.f16 s3, s7, s11
+; CHECK-MVE-NEXT:    vins.f16 s3, s12
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-MVEFP-LABEL: mul_float16_t:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vcvt.ll b/llvm/test/CodeGen/Thumb2/mve-vcvt.ll
index 4e73779bec72..42a0fbc56c35 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcvt.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcvt.ll
@@ -92,32 +92,32 @@ define arm_aapcs_vfpcc <8 x half> @foo_half_int16(<8 x i16> %src) {
 ; CHECK-MVE-LABEL: foo_half_int16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
 ; CHECK-MVE-NEXT:    vmov q1, q0
-; CHECK-MVE-NEXT:    vmov.s16 r0, q0[1]
+; CHECK-MVE-NEXT:    vmov.s16 r0, q0[0]
 ; CHECK-MVE-NEXT:    vmov s0, r0
-; CHECK-MVE-NEXT:    vmov.s16 r0, q1[0]
-; CHECK-MVE-NEXT:    vcvt.f16.s32 s8, s0
-; CHECK-MVE-NEXT:    vmov s0, r0
-; CHECK-MVE-NEXT:    vcvt.f16.s32 s0, s0
+; CHECK-MVE-NEXT:    vmov.s16 r0, q1[1]
+; CHECK-MVE-NEXT:    vmov s2, r0
 ; CHECK-MVE-NEXT:    vmov.s16 r0, q1[3]
+; CHECK-MVE-NEXT:    vcvt.f16.s32 s8, s2
+; CHECK-MVE-NEXT:    vcvt.f16.s32 s0, s0
 ; CHECK-MVE-NEXT:    vins.f16 s0, s8
 ; CHECK-MVE-NEXT:    vmov s8, r0
 ; CHECK-MVE-NEXT:    vmov.s16 r0, q1[2]
 ; CHECK-MVE-NEXT:    vcvt.f16.s32 s8, s8
 ; CHECK-MVE-NEXT:    vmov s10, r0
-; CHECK-MVE-NEXT:    vmov.s16 r0, q1[5]
+; CHECK-MVE-NEXT:    vmov.s16 r0, q1[4]
 ; CHECK-MVE-NEXT:    vcvt.f16.s32 s1, s10
 ; CHECK-MVE-NEXT:    vins.f16 s1, s8
 ; CHECK-MVE-NEXT:    vmov s8, r0
-; CHECK-MVE-NEXT:    vmov.s16 r0, q1[4]
-; CHECK-MVE-NEXT:    vcvt.f16.s32 s8, s8
+; CHECK-MVE-NEXT:    vmov.s16 r0, q1[5]
+; CHECK-MVE-NEXT:    vcvt.f16.s32 s2, s8
 ; CHECK-MVE-NEXT:    vmov s10, r0
 ; CHECK-MVE-NEXT:    vmov.s16 r0, q1[7]
-; CHECK-MVE-NEXT:    vcvt.f16.s32 s2, s10
-; CHECK-MVE-NEXT:    vins.f16 s2, s8
 ; CHECK-MVE-NEXT:    vmov s8, r0
 ; CHECK-MVE-NEXT:    vmov.s16 r0, q1[6]
-; CHECK-MVE-NEXT:    vcvt.f16.s32 s8, s8
+; CHECK-MVE-NEXT:    vcvt.f16.s32 s10, s10
 ; CHECK-MVE-NEXT:    vmov s4, r0
+; CHECK-MVE-NEXT:    vins.f16 s2, s10
+; CHECK-MVE-NEXT:    vcvt.f16.s32 s8, s8
 ; CHECK-MVE-NEXT:    vcvt.f16.s32 s3, s4
 ; CHECK-MVE-NEXT:    vins.f16 s3, s8
 ; CHECK-MVE-NEXT:    bx lr
@@ -135,32 +135,32 @@ define arm_aapcs_vfpcc <8 x half> @foo_half_uint16(<8 x i16> %src) {
 ; CHECK-MVE-LABEL: foo_half_uint16:
 ; CHECK-MVE:       @ %bb.0: @ %entry
 ; CHECK-MVE-NEXT:    vmov q1, q0
-; CHECK-MVE-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-MVE-NEXT:    vmov.u16 r0, q0[0]
 ; CHECK-MVE-NEXT:    vmov s0, r0
-; CHECK-MVE-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-MVE-NEXT:    vcvt.f16.u32 s8, s0
-; CHECK-MVE-NEXT:    vmov s0, r0
-; CHECK-MVE-NEXT:    vcvt.f16.u32 s0, s0
+; CHECK-MVE-NEXT:    vmov.u16 r0, q1[1]
+; CHECK-MVE-NEXT:    vmov s2, r0
 ; CHECK-MVE-NEXT:    vmov.u16 r0, q1[3]
+; CHECK-MVE-NEXT:    vcvt.f16.u32 s8, s2
+; CHECK-MVE-NEXT:    vcvt.f16.u32 s0, s0
 ; CHECK-MVE-NEXT:    vins.f16 s0, s8
 ; CHECK-MVE-NEXT:    vmov s8, r0
 ; CHECK-MVE-NEXT:    vmov.u16 r0, q1[2]
 ; CHECK-MVE-NEXT:    vcvt.f16.u32 s8, s8
 ; CHECK-MVE-NEXT:    vmov s10, r0
-; CHECK-MVE-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-MVE-NEXT:    vmov.u16 r0, q1[4]
 ; CHECK-MVE-NEXT:    vcvt.f16.u32 s1, s10
 ; CHECK-MVE-NEXT:    vins.f16 s1, s8
 ; CHECK-MVE-NEXT:    vmov s8, r0
-; CHECK-MVE-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-MVE-NEXT:    vcvt.f16.u32 s8, s8
+; CHECK-MVE-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-MVE-NEXT:    vcvt.f16.u32 s2, s8
 ; CHECK-MVE-NEXT:    vmov s10, r0
 ; CHECK-MVE-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-MVE-NEXT:    vcvt.f16.u32 s2, s10
-; CHECK-MVE-NEXT:    vins.f16 s2, s8
 ; CHECK-MVE-NEXT:    vmov s8, r0
 ; CHECK-MVE-NEXT:    vmov.u16 r0, q1[6]
-; CHECK-MVE-NEXT:    vcvt.f16.u32 s8, s8
+; CHECK-MVE-NEXT:    vcvt.f16.u32 s10, s10
 ; CHECK-MVE-NEXT:    vmov s4, r0
+; CHECK-MVE-NEXT:    vins.f16 s2, s10
+; CHECK-MVE-NEXT:    vcvt.f16.u32 s8, s8
 ; CHECK-MVE-NEXT:    vcvt.f16.u32 s3, s4
 ; CHECK-MVE-NEXT:    vins.f16 s3, s8
 ; CHECK-MVE-NEXT:    bx lr

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld2.ll b/llvm/test/CodeGen/Thumb2/mve-vld2.ll
index b6ef16cb3e05..e21d3d798d35 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld2.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld2.ll
@@ -205,33 +205,28 @@ entry:
 define void @vld2_v8i16_align1(<16 x i16> *%src, <8 x i16> *%dst) {
 ; CHECK-LABEL: vld2_v8i16_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u8 q2, [r0]
-; CHECK-NEXT:    vldrb.u8 q0, [r0, #16]
-; CHECK-NEXT:    vmov.f64 d2, d4
-; CHECK-NEXT:    vmov.u16 r0, q2[1]
-; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[3]
-; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[5]
-; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[7]
-; CHECK-NEXT:    vins.f16 s4, s9
-; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.f32 s5, s10
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vins.f16 s5, s11
-; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.f32 s6, s0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vins.f16 s6, s1
-; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.f32 s7, s2
-; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vins.f16 s7, s3
-; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vadd.i16 q0, q1, q3
+; CHECK-NEXT:    vldrb.u8 q0, [r0]
+; CHECK-NEXT:    vmovx.f16 s8, s1
+; CHECK-NEXT:    vmovx.f16 s4, s0
+; CHECK-NEXT:    vins.f16 s4, s8
+; CHECK-NEXT:    vmovx.f16 s8, s3
+; CHECK-NEXT:    vmovx.f16 s5, s2
+; CHECK-NEXT:    vins.f16 s0, s1
+; CHECK-NEXT:    vins.f16 s5, s8
+; CHECK-NEXT:    vldrb.u8 q2, [r0, #16]
+; CHECK-NEXT:    vins.f16 s2, s3
+; CHECK-NEXT:    vmovx.f16 s12, s9
+; CHECK-NEXT:    vmovx.f16 s6, s8
+; CHECK-NEXT:    vins.f16 s6, s12
+; CHECK-NEXT:    vmovx.f16 s12, s11
+; CHECK-NEXT:    vmovx.f16 s7, s10
+; CHECK-NEXT:    vmov.f32 s1, s2
+; CHECK-NEXT:    vins.f16 s8, s9
+; CHECK-NEXT:    vins.f16 s7, s12
+; CHECK-NEXT:    vmov.f32 s2, s8
+; CHECK-NEXT:    vins.f16 s10, s11
+; CHECK-NEXT:    vmov.f32 s3, s10
+; CHECK-NEXT:    vadd.i16 q0, q0, q1
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
@@ -571,15 +566,15 @@ define void @vld2_v4f16(<8 x half> *%src, <4 x half> *%dst) {
 ; CHECK-LABEL: vld2_v4f16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldrh.u16 q0, [r0]
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vins.f16 s0, s1
 ; CHECK-NEXT:    vmovx.f16 s8, s1
+; CHECK-NEXT:    vmovx.f16 s4, s0
 ; CHECK-NEXT:    vins.f16 s4, s8
 ; CHECK-NEXT:    vmovx.f16 s8, s3
 ; CHECK-NEXT:    vmovx.f16 s5, s2
-; CHECK-NEXT:    vmov.f32 s1, s2
+; CHECK-NEXT:    vins.f16 s0, s1
+; CHECK-NEXT:    vins.f16 s2, s3
 ; CHECK-NEXT:    vins.f16 s5, s8
-; CHECK-NEXT:    vins.f16 s1, s3
+; CHECK-NEXT:    vmov.f32 s1, s2
 ; CHECK-NEXT:    vadd.f16 q0, q0, q1
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    vmov r0, s0
@@ -635,33 +630,29 @@ entry:
 define void @vld2_v8f16_align1(<16 x half> *%src, <8 x half> *%dst) {
 ; CHECK-LABEL: vld2_v8f16_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8}
-; CHECK-NEXT:    vpush {d8}
-; CHECK-NEXT:    vldrb.u8 q3, [r0]
-; CHECK-NEXT:    vldrb.u8 q1, [r0, #16]
-; CHECK-NEXT:    vmov.f64 d0, d6
-; CHECK-NEXT:    vmovx.f16 s16, s13
-; CHECK-NEXT:    vmovx.f16 s8, s12
-; CHECK-NEXT:    vins.f16 s8, s16
-; CHECK-NEXT:    vmovx.f16 s16, s15
-; CHECK-NEXT:    vmovx.f16 s9, s14
-; CHECK-NEXT:    vmovx.f16 s12, s5
-; CHECK-NEXT:    vins.f16 s0, s13
-; CHECK-NEXT:    vins.f16 s9, s16
-; CHECK-NEXT:    vmov.f32 s1, s14
-; CHECK-NEXT:    vmovx.f16 s10, s4
-; CHECK-NEXT:    vins.f16 s1, s15
-; CHECK-NEXT:    vins.f16 s10, s12
-; CHECK-NEXT:    vmov.f32 s2, s4
-; CHECK-NEXT:    vmovx.f16 s12, s7
-; CHECK-NEXT:    vmovx.f16 s11, s6
-; CHECK-NEXT:    vins.f16 s2, s5
-; CHECK-NEXT:    vins.f16 s6, s7
-; CHECK-NEXT:    vins.f16 s11, s12
-; CHECK-NEXT:    vmov.f32 s3, s6
-; CHECK-NEXT:    vadd.f16 q0, q0, q2
+; CHECK-NEXT:    vldrb.u8 q0, [r0]
+; CHECK-NEXT:    vmovx.f16 s8, s1
+; CHECK-NEXT:    vmovx.f16 s4, s0
+; CHECK-NEXT:    vins.f16 s4, s8
+; CHECK-NEXT:    vmovx.f16 s8, s3
+; CHECK-NEXT:    vmovx.f16 s5, s2
+; CHECK-NEXT:    vins.f16 s0, s1
+; CHECK-NEXT:    vins.f16 s5, s8
+; CHECK-NEXT:    vldrb.u8 q2, [r0, #16]
+; CHECK-NEXT:    vins.f16 s2, s3
+; CHECK-NEXT:    vmovx.f16 s12, s9
+; CHECK-NEXT:    vmovx.f16 s6, s8
+; CHECK-NEXT:    vins.f16 s6, s12
+; CHECK-NEXT:    vmovx.f16 s12, s11
+; CHECK-NEXT:    vmovx.f16 s7, s10
+; CHECK-NEXT:    vins.f16 s8, s9
+; CHECK-NEXT:    vmov.f32 s1, s2
+; CHECK-NEXT:    vins.f16 s10, s11
+; CHECK-NEXT:    vmov.f32 s2, s8
+; CHECK-NEXT:    vins.f16 s7, s12
+; CHECK-NEXT:    vmov.f32 s3, s10
+; CHECK-NEXT:    vadd.f16 q0, q0, q1
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <16 x half>, <16 x half>* %src, align 1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll
index 159dc7017568..4cab1a4668af 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll
@@ -288,63 +288,55 @@ entry:
 define void @vld3_v8i16(<24 x i16> *%src, <8 x i16> *%dst) {
 ; CHECK-LABEL: vld3_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vldrw.u32 q3, [r0]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmovx.f16 s8, s6
+; CHECK-NEXT:    vmov.f32 s0, s5
+; CHECK-NEXT:    vins.f16 s0, s8
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT:    vmov.u16 r0, q3[2]
-; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[5]
-; CHECK-NEXT:    vmov.16 q0[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[0]
-; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[3]
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-NEXT:    vmov.16 q5[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.16 q5[7], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[0]
-; CHECK-NEXT:    vmov.16 q4[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[3]
-; CHECK-NEXT:    vmov.f32 s22, s4
-; CHECK-NEXT:    vmov.16 q4[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[6]
+; CHECK-NEXT:    vmovx.f16 s12, s9
+; CHECK-NEXT:    vmov.f32 s1, s8
+; CHECK-NEXT:    vins.f16 s1, s12
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
 ; CHECK-NEXT:    vmov.f32 s2, s11
-; CHECK-NEXT:    vmov q6, q5
-; CHECK-NEXT:    vmov.16 q4[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[1]
-; CHECK-NEXT:    vmovnb.i32 q6, q0
-; CHECK-NEXT:    vmov.16 q4[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[4]
-; CHECK-NEXT:    vmov.f32 s2, s26
-; CHECK-NEXT:    vmov.16 q4[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.f32 s3, s23
-; CHECK-NEXT:    vmov.16 q5[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.16 q5[7], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[7]
-; CHECK-NEXT:    vmov.16 q4[5], r0
 ; CHECK-NEXT:    vmov.u16 r0, q2[5]
-; CHECK-NEXT:    vmov.f32 s19, s23
-; CHECK-NEXT:    vmovx.f16 s20, s12
-; CHECK-NEXT:    vins.f16 s20, s14
-; CHECK-NEXT:    vmovx.f16 s21, s15
+; CHECK-NEXT:    vmovx.f16 s20, s15
+; CHECK-NEXT:    vmov.f32 s19, s14
+; CHECK-NEXT:    vins.f16 s19, s20
+; CHECK-NEXT:    vmov.f32 s18, s12
+; CHECK-NEXT:    vmov q5, q4
+; CHECK-NEXT:    vmovnb.i32 q5, q0
+; CHECK-NEXT:    vmov.f32 s2, s22
+; CHECK-NEXT:    vmovx.f16 s20, s5
+; CHECK-NEXT:    vmov.f32 s3, s19
+; CHECK-NEXT:    vmov.f64 d8, d2
+; CHECK-NEXT:    vins.f16 s16, s20
+; CHECK-NEXT:    vmovx.f16 s20, s8
+; CHECK-NEXT:    vmov.f32 s17, s7
+; CHECK-NEXT:    vins.f16 s17, s20
+; CHECK-NEXT:    vmovx.f16 s20, s11
+; CHECK-NEXT:    vmov.f32 s18, s10
+; CHECK-NEXT:    vins.f16 s18, s20
+; CHECK-NEXT:    vmovx.f16 s20, s14
+; CHECK-NEXT:    vmov.f32 s19, s13
+; CHECK-NEXT:    vins.f16 s19, s20
+; CHECK-NEXT:    vmovx.f16 s20, s4
+; CHECK-NEXT:    vins.f16 s20, s6
+; CHECK-NEXT:    vmovx.f16 s21, s7
+; CHECK-NEXT:    vins.f16 s6, s12
+; CHECK-NEXT:    vmovx.f16 s7, s13
 ; CHECK-NEXT:    vins.f16 s21, s9
-; CHECK-NEXT:    vins.f16 s10, s4
-; CHECK-NEXT:    vmovx.f16 s11, s5
+; CHECK-NEXT:    vins.f16 s7, s15
 ; CHECK-NEXT:    vmov.16 q5[4], r0
-; CHECK-NEXT:    vins.f16 s11, s7
-; CHECK-NEXT:    vmov q1, q2
-; CHECK-NEXT:    vmovnb.i32 q1, q5
-; CHECK-NEXT:    vmov.f32 s22, s6
-; CHECK-NEXT:    vmov.f32 s23, s11
+; CHECK-NEXT:    vmov q2, q1
+; CHECK-NEXT:    vmovnb.i32 q2, q5
+; CHECK-NEXT:    vmov.f32 s22, s10
+; CHECK-NEXT:    vmov.f32 s23, s7
 ; CHECK-NEXT:    vadd.i16 q1, q4, q5
 ; CHECK-NEXT:    vadd.i16 q0, q1, q0
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <24 x i16>, <24 x i16>* %src, align 4
@@ -364,112 +356,96 @@ define void @vld3_v16i16(<48 x i16> *%src, <16 x i16> *%dst) {
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    .pad #16
 ; CHECK-NEXT:    sub sp, #16
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT:    vmovx.f16 s8, s6
+; CHECK-NEXT:    vmov.f32 s0, s5
+; CHECK-NEXT:    vins.f16 s0, s8
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
+; CHECK-NEXT:    vmovx.f16 s12, s9
+; CHECK-NEXT:    vmov.f32 s1, s8
+; CHECK-NEXT:    vins.f16 s1, s12
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
-; CHECK-NEXT:    vmov.u16 r2, q2[0]
-; CHECK-NEXT:    vmov.16 q0[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[3]
-; CHECK-NEXT:    vmov.16 q0[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[6]
-; CHECK-NEXT:    vmov.16 q0[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[1]
-; CHECK-NEXT:    vmov.16 q0[3], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[4]
-; CHECK-NEXT:    vmov.16 q0[4], r2
-; CHECK-NEXT:    vmov.u16 r2, q3[2]
-; CHECK-NEXT:    vmov.16 q4[6], r2
-; CHECK-NEXT:    vmov.u16 r2, q3[5]
-; CHECK-NEXT:    vmov.16 q4[7], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[7]
-; CHECK-NEXT:    vmov.16 q0[5], r2
-; CHECK-NEXT:    vmov.u16 r2, q3[4]
-; CHECK-NEXT:    vmov.16 q5[6], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[2]
-; CHECK-NEXT:    vmov.f32 s3, s19
-; CHECK-NEXT:    vmov.16 q4[0], r2
+; CHECK-NEXT:    vmov.f32 s2, s11
 ; CHECK-NEXT:    vmov.u16 r2, q2[5]
-; CHECK-NEXT:    vmov.16 q4[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[0]
-; CHECK-NEXT:    vmov.16 q4[2], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[3]
-; CHECK-NEXT:    vmov.16 q4[3], r2
-; CHECK-NEXT:    vmov.u16 r2, q3[7]
-; CHECK-NEXT:    vmov.16 q5[7], r2
-; CHECK-NEXT:    vmov.f32 s18, s7
-; CHECK-NEXT:    vmov.f32 s22, s12
-; CHECK-NEXT:    vmov.u16 r2, q1[5]
-; CHECK-NEXT:    vmov q6, q5
-; CHECK-NEXT:    vmovnb.i32 q6, q4
-; CHECK-NEXT:    vmov.f32 s18, s26
-; CHECK-NEXT:    vmov.f32 s19, s23
-; CHECK-NEXT:    vins.f16 s22, s12
-; CHECK-NEXT:    vmovx.f16 s23, s13
-; CHECK-NEXT:    vins.f16 s23, s15
-; CHECK-NEXT:    vmovx.f16 s12, s8
-; CHECK-NEXT:    vins.f16 s12, s10
-; CHECK-NEXT:    vmovx.f16 s13, s11
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT:    vins.f16 s13, s5
-; CHECK-NEXT:    vmov q1, q5
-; CHECK-NEXT:    vmov.16 q3[4], r2
-; CHECK-NEXT:    vmovnb.i32 q1, q3
-; CHECK-NEXT:    vmov.f32 s14, s6
-; CHECK-NEXT:    vmov.f32 s15, s23
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #32]
-; CHECK-NEXT:    vadd.i16 q0, q0, q3
-; CHECK-NEXT:    vldrw.u32 q3, [r0]
-; CHECK-NEXT:    vmov.u16 r0, q5[4]
-; CHECK-NEXT:    vadd.i16 q0, q0, q4
-; CHECK-NEXT:    vmov.u16 r2, q3[2]
-; CHECK-NEXT:    vmov.16 q6[6], r0
-; CHECK-NEXT:    vmov.16 q1[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q3[5]
-; CHECK-NEXT:    vmov.16 q1[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[0]
-; CHECK-NEXT:    vmov.u16 r0, q5[7]
-; CHECK-NEXT:    vmov.16 q1[2], r2
-; CHECK-NEXT:    vmov.16 q6[7], r0
-; CHECK-NEXT:    vmov.u16 r2, q2[3]
-; CHECK-NEXT:    vmov.16 q1[3], r2
-; CHECK-NEXT:    vmov.f32 s26, s20
-; CHECK-NEXT:    vmov.u16 r0, q3[0]
-; CHECK-NEXT:    vmov.f32 s6, s11
+; CHECK-NEXT:    vmovx.f16 s20, s15
+; CHECK-NEXT:    vmov.f32 s19, s14
+; CHECK-NEXT:    vins.f16 s19, s20
+; CHECK-NEXT:    vmov.f32 s18, s12
+; CHECK-NEXT:    vmov q5, q4
+; CHECK-NEXT:    vmovnb.i32 q5, q0
+; CHECK-NEXT:    vmov.f32 s2, s22
+; CHECK-NEXT:    vmovx.f16 s20, s5
+; CHECK-NEXT:    vmov.f32 s3, s19
+; CHECK-NEXT:    vmov.f64 d8, d2
+; CHECK-NEXT:    vins.f16 s16, s20
+; CHECK-NEXT:    vmovx.f16 s20, s8
+; CHECK-NEXT:    vmov.f32 s17, s7
+; CHECK-NEXT:    vins.f16 s17, s20
+; CHECK-NEXT:    vmovx.f16 s20, s11
+; CHECK-NEXT:    vmov.f32 s18, s10
+; CHECK-NEXT:    vins.f16 s18, s20
+; CHECK-NEXT:    vmovx.f16 s20, s14
+; CHECK-NEXT:    vmov.f32 s19, s13
+; CHECK-NEXT:    vins.f16 s19, s20
+; CHECK-NEXT:    vmovx.f16 s20, s4
+; CHECK-NEXT:    vins.f16 s20, s6
+; CHECK-NEXT:    vmovx.f16 s21, s7
+; CHECK-NEXT:    vins.f16 s6, s12
+; CHECK-NEXT:    vmovx.f16 s7, s13
+; CHECK-NEXT:    vins.f16 s21, s9
+; CHECK-NEXT:    vins.f16 s7, s15
+; CHECK-NEXT:    vmov.16 q5[4], r2
+; CHECK-NEXT:    vmov q2, q1
+; CHECK-NEXT:    vmovnb.i32 q2, q5
+; CHECK-NEXT:    vmov.f32 s22, s10
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vmov.f32 s23, s7
+; CHECK-NEXT:    vadd.i16 q1, q4, q5
+; CHECK-NEXT:    vmovx.f16 s12, s10
+; CHECK-NEXT:    vadd.i16 q0, q1, q0
+; CHECK-NEXT:    vmov.f32 s4, s9
+; CHECK-NEXT:    vins.f16 s4, s12
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmovx.f16 s0, s9
+; CHECK-NEXT:    vmovx.f16 s16, s13
+; CHECK-NEXT:    vmov.f32 s5, s12
+; CHECK-NEXT:    vins.f16 s5, s16
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
+; CHECK-NEXT:    vmov.f32 s6, s15
+; CHECK-NEXT:    vmov.u16 r0, q3[5]
+; CHECK-NEXT:    vmovx.f16 s20, s19
+; CHECK-NEXT:    vmov.f32 s27, s18
+; CHECK-NEXT:    vins.f16 s27, s20
+; CHECK-NEXT:    vmov.f64 d10, d4
+; CHECK-NEXT:    vins.f16 s20, s0
+; CHECK-NEXT:    vmov.f32 s26, s16
+; CHECK-NEXT:    vmovx.f16 s0, s12
+; CHECK-NEXT:    vmov.f32 s21, s11
+; CHECK-NEXT:    vins.f16 s21, s0
 ; CHECK-NEXT:    vmov q7, q6
-; CHECK-NEXT:    vmov.16 q4[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[3]
 ; CHECK-NEXT:    vmovnb.i32 q7, q1
-; CHECK-NEXT:    vmov.16 q4[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[6]
+; CHECK-NEXT:    vmovx.f16 s0, s15
+; CHECK-NEXT:    vmov.f32 s22, s14
+; CHECK-NEXT:    vins.f16 s22, s0
 ; CHECK-NEXT:    vmov.f32 s6, s30
-; CHECK-NEXT:    vmov.16 q4[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[1]
 ; CHECK-NEXT:    vmov.f32 s7, s27
-; CHECK-NEXT:    vins.f16 s26, s20
-; CHECK-NEXT:    vmov.16 q4[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[4]
-; CHECK-NEXT:    vmovx.f16 s27, s21
-; CHECK-NEXT:    vmov.16 q4[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q5[2]
-; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q5[5]
-; CHECK-NEXT:    vins.f16 s27, s23
-; CHECK-NEXT:    vmovx.f16 s20, s12
-; CHECK-NEXT:    vmov.16 q0[7], r0
-; CHECK-NEXT:    vins.f16 s20, s14
-; CHECK-NEXT:    vmov.u16 r0, q2[7]
-; CHECK-NEXT:    vmovx.f16 s21, s15
-; CHECK-NEXT:    vmov.16 q4[5], r0
-; CHECK-NEXT:    vins.f16 s21, s9
-; CHECK-NEXT:    vmov.u16 r0, q2[5]
-; CHECK-NEXT:    vmov.f32 s19, s3
-; CHECK-NEXT:    vmov.16 q5[4], r0
-; CHECK-NEXT:    vmov q0, q6
-; CHECK-NEXT:    vmovnb.i32 q0, q5
-; CHECK-NEXT:    vmov.f32 s22, s2
-; CHECK-NEXT:    vmov.f32 s23, s27
-; CHECK-NEXT:    vadd.i16 q0, q4, q5
+; CHECK-NEXT:    vmovx.f16 s24, s8
+; CHECK-NEXT:    vmovx.f16 s0, s18
+; CHECK-NEXT:    vmov.f32 s23, s17
+; CHECK-NEXT:    vins.f16 s24, s10
+; CHECK-NEXT:    vins.f16 s23, s0
+; CHECK-NEXT:    vins.f16 s2, s16
+; CHECK-NEXT:    vmovx.f16 s25, s11
+; CHECK-NEXT:    vmovx.f16 s3, s17
+; CHECK-NEXT:    vins.f16 s25, s13
+; CHECK-NEXT:    vins.f16 s3, s19
+; CHECK-NEXT:    vmov.16 q6[4], r0
+; CHECK-NEXT:    vmov q2, q0
+; CHECK-NEXT:    vmovnb.i32 q2, q6
+; CHECK-NEXT:    vmov.f32 s26, s10
+; CHECK-NEXT:    vmov.f32 s27, s3
+; CHECK-NEXT:    vadd.i16 q0, q5, q6
 ; CHECK-NEXT:    vadd.i16 q0, q0, q1
 ; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
@@ -573,56 +549,59 @@ entry:
 define void @vld3_v8i8(<24 x i8> *%src, <8 x i8> *%dst) {
 ; CHECK-LABEL: vld3_v8i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vldrb.u16 q0, [r0, #16]
-; CHECK-NEXT:    vmov.u8 r0, q1[1]
-; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[4]
-; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[7]
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[10]
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[13]
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[0]
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrb.u16 q1, [r0, #16]
+; CHECK-NEXT:    vmov.u8 r2, q0[1]
+; CHECK-NEXT:    vmov.u8 r0, q0[0]
+; CHECK-NEXT:    vmov.16 q2[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[4]
+; CHECK-NEXT:    vmov.16 q2[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[7]
 ; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[3]
+; CHECK-NEXT:    vmov.u8 r0, q0[3]
+; CHECK-NEXT:    vmov.16 q2[2], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[10]
 ; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[6]
+; CHECK-NEXT:    vmov.u8 r0, q0[6]
+; CHECK-NEXT:    vmov.16 q2[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[13]
 ; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[9]
+; CHECK-NEXT:    vmov.u8 r0, q0[9]
+; CHECK-NEXT:    vmov.16 q2[4], r2
 ; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[12]
+; CHECK-NEXT:    vmov.u8 r0, q0[12]
+; CHECK-NEXT:    vins.f16 s10, s4
 ; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[15]
-; CHECK-NEXT:    vins.f16 s10, s0
+; CHECK-NEXT:    vmov.u8 r0, q0[15]
+; CHECK-NEXT:    vmovx.f16 s16, s6
+; CHECK-NEXT:    vmov.f32 s18, s5
+; CHECK-NEXT:    vmovx.f16 s11, s5
 ; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
-; CHECK-NEXT:    vmovx.f16 s11, s1
-; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vins.f16 s11, s3
-; CHECK-NEXT:    vmov.u8 r0, q1[2]
+; CHECK-NEXT:    vins.f16 s18, s16
+; CHECK-NEXT:    vins.f16 s11, s7
+; CHECK-NEXT:    vmov.f32 s15, s18
+; CHECK-NEXT:    vmov.u8 r0, q0[2]
 ; CHECK-NEXT:    vadd.i16 q2, q3, q2
 ; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[5]
+; CHECK-NEXT:    vmov.u8 r0, q0[5]
 ; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[8]
+; CHECK-NEXT:    vmov.u8 r0, q0[8]
 ; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[11]
+; CHECK-NEXT:    vmov.u8 r0, q0[11]
 ; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[14]
+; CHECK-NEXT:    vmov.u8 r0, q0[14]
 ; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
+; CHECK-NEXT:    vmovx.f16 s0, s7
+; CHECK-NEXT:    vmov.f32 s2, s6
+; CHECK-NEXT:    vins.f16 s2, s0
 ; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vmov.f32 s15, s2
 ; CHECK-NEXT:    vadd.i16 q0, q2, q3
 ; CHECK-NEXT:    vstrb.16 q0, [r1]
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <24 x i8>, <24 x i8>* %src, align 4
@@ -1128,13 +1107,13 @@ define void @vld3_v2f16(<6 x half> *%src, <2 x half> *%dst) {
 ; CHECK-NEXT:    vmov.32 q0[0], r2
 ; CHECK-NEXT:    vmov.32 q0[1], r3
 ; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vmovx.f16 s8, s1
-; CHECK-NEXT:    vins.f16 s4, s2
-; CHECK-NEXT:    vins.f16 s0, s8
-; CHECK-NEXT:    vmovx.f16 s8, s2
-; CHECK-NEXT:    vadd.f16 q1, q0, q1
-; CHECK-NEXT:    vins.f16 s1, s8
+; CHECK-NEXT:    vmovx.f16 s8, s0
+; CHECK-NEXT:    vmovx.f16 s4, s2
+; CHECK-NEXT:    vins.f16 s8, s2
+; CHECK-NEXT:    vmovx.f16 s6, s1
+; CHECK-NEXT:    vins.f16 s1, s4
+; CHECK-NEXT:    vins.f16 s0, s6
+; CHECK-NEXT:    vadd.f16 q1, q0, q2
 ; CHECK-NEXT:    vmov.f32 s0, s1
 ; CHECK-NEXT:    vadd.f16 q0, q1, q0
 ; CHECK-NEXT:    vmov r0, s0
@@ -1156,28 +1135,26 @@ define void @vld3_v4f16(<12 x half> *%src, <4 x half> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    ldrd r2, r0, [r0, #16]
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vmovx.f16 s8, s1
-; CHECK-NEXT:    vins.f16 s4, s2
-; CHECK-NEXT:    vins.f16 s0, s8
+; CHECK-NEXT:    ldrd r2, r3, [r0, #16]
 ; CHECK-NEXT:    vmov.32 q2[0], r2
-; CHECK-NEXT:    vmov.f32 s14, s3
-; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmovx.f16 s5, s3
-; CHECK-NEXT:    vmovx.f16 s12, s8
-; CHECK-NEXT:    vmov q4, q0
-; CHECK-NEXT:    vins.f16 s14, s12
-; CHECK-NEXT:    vmovx.f16 s12, s2
-; CHECK-NEXT:    vins.f16 s1, s12
-; CHECK-NEXT:    vmovx.f16 s12, s9
-; CHECK-NEXT:    vins.f16 s5, s9
-; CHECK-NEXT:    vmov.f32 s17, s14
-; CHECK-NEXT:    vmov.f32 s0, s1
-; CHECK-NEXT:    vins.f16 s8, s12
-; CHECK-NEXT:    vadd.f16 q1, q4, q1
+; CHECK-NEXT:    vmov.32 q2[1], r3
 ; CHECK-NEXT:    vmov.f32 s1, s8
+; CHECK-NEXT:    vmovx.f16 s4, s9
+; CHECK-NEXT:    vins.f16 s1, s4
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmovx.f16 s8, s8
+; CHECK-NEXT:    vmovx.f16 s12, s4
+; CHECK-NEXT:    vmovx.f16 s16, s6
+; CHECK-NEXT:    vins.f16 s12, s6
+; CHECK-NEXT:    vmovx.f16 s18, s5
+; CHECK-NEXT:    vins.f16 s5, s16
+; CHECK-NEXT:    vins.f16 s4, s18
+; CHECK-NEXT:    vmovx.f16 s13, s7
+; CHECK-NEXT:    vins.f16 s7, s8
+; CHECK-NEXT:    vmov.f32 s0, s5
+; CHECK-NEXT:    vins.f16 s13, s9
+; CHECK-NEXT:    vmov.f32 s5, s7
+; CHECK-NEXT:    vadd.f16 q1, q1, q3
 ; CHECK-NEXT:    vadd.f16 q0, q1, q0
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    vmov r0, s0
@@ -1198,49 +1175,49 @@ entry:
 define void @vld3_v8f16(<24 x half> *%src, <8 x half> *%dst) {
 ; CHECK-LABEL: vld3_v8f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vldrw.u32 q3, [r0]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT:    vmovx.f16 s4, s14
-; CHECK-NEXT:    vmov.f32 s0, s13
-; CHECK-NEXT:    vins.f16 s0, s4
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #16]
+; CHECK-NEXT:    vmovx.f16 s8, s2
+; CHECK-NEXT:    vmov.f32 s4, s1
+; CHECK-NEXT:    vins.f16 s4, s8
+; CHECK-NEXT:    vmovx.f16 s8, s17
+; CHECK-NEXT:    vmov.f32 s5, s16
+; CHECK-NEXT:    vmovx.f16 s24, s1
+; CHECK-NEXT:    vins.f16 s5, s8
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vmov.f32 s6, s19
+; CHECK-NEXT:    vmovx.f16 s26, s16
+; CHECK-NEXT:    vmovx.f16 s20, s11
+; CHECK-NEXT:    vmov.f32 s15, s10
+; CHECK-NEXT:    vins.f16 s15, s20
+; CHECK-NEXT:    vmovx.f16 s20, s8
+; CHECK-NEXT:    vins.f16 s6, s20
 ; CHECK-NEXT:    vmovx.f16 s20, s19
-; CHECK-NEXT:    vmovx.f16 s24, s13
-; CHECK-NEXT:    vmovx.f16 s8, s5
-; CHECK-NEXT:    vmov.f32 s1, s4
-; CHECK-NEXT:    vins.f16 s1, s8
-; CHECK-NEXT:    vmov.f32 s11, s18
-; CHECK-NEXT:    vins.f16 s11, s20
-; CHECK-NEXT:    vmov.f32 s2, s7
-; CHECK-NEXT:    vmovx.f16 s20, s16
-; CHECK-NEXT:    vmov.f32 s10, s16
-; CHECK-NEXT:    vins.f16 s2, s20
-; CHECK-NEXT:    vmov.f64 d10, d6
-; CHECK-NEXT:    vins.f16 s20, s24
-; CHECK-NEXT:    vmovx.f16 s24, s4
-; CHECK-NEXT:    vmov.f32 s21, s15
-; CHECK-NEXT:    vins.f16 s21, s24
-; CHECK-NEXT:    vmovx.f16 s24, s7
-; CHECK-NEXT:    vmov.f32 s22, s6
-; CHECK-NEXT:    vins.f16 s22, s24
-; CHECK-NEXT:    vmovx.f16 s24, s12
-; CHECK-NEXT:    vins.f16 s24, s14
-; CHECK-NEXT:    vmov.f32 s3, s11
-; CHECK-NEXT:    vmovx.f16 s25, s15
-; CHECK-NEXT:    vmovx.f16 s12, s18
-; CHECK-NEXT:    vins.f16 s25, s5
-; CHECK-NEXT:    vmovx.f16 s27, s17
-; CHECK-NEXT:    vins.f16 s17, s12
-; CHECK-NEXT:    vins.f16 s27, s19
-; CHECK-NEXT:    vmov.f32 s23, s17
-; CHECK-NEXT:    vmovx.f16 s26, s6
-; CHECK-NEXT:    vins.f16 s26, s16
-; CHECK-NEXT:    vadd.f16 q1, q5, q6
-; CHECK-NEXT:    vadd.f16 q0, q1, q0
+; CHECK-NEXT:    vmov.f32 s28, s18
+; CHECK-NEXT:    vmovx.f16 s30, s10
+; CHECK-NEXT:    vins.f16 s28, s20
+; CHECK-NEXT:    vmovx.f16 s20, s0
+; CHECK-NEXT:    vins.f16 s0, s24
+; CHECK-NEXT:    vins.f16 s20, s2
+; CHECK-NEXT:    vmovx.f16 s21, s3
+; CHECK-NEXT:    vins.f16 s3, s26
+; CHECK-NEXT:    vins.f16 s21, s17
+; CHECK-NEXT:    vmov.f32 s14, s8
+; CHECK-NEXT:    vmovx.f16 s23, s9
+; CHECK-NEXT:    vmov.f32 s1, s3
+; CHECK-NEXT:    vins.f16 s9, s30
+; CHECK-NEXT:    vins.f16 s23, s11
+; CHECK-NEXT:    vmovx.f16 s22, s18
+; CHECK-NEXT:    vmov.f32 s2, s28
+; CHECK-NEXT:    vins.f16 s22, s8
+; CHECK-NEXT:    vmov.f32 s3, s9
+; CHECK-NEXT:    vmov.f32 s7, s15
+; CHECK-NEXT:    vadd.f16 q0, q0, q5
+; CHECK-NEXT:    vadd.f16 q0, q0, q1
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <24 x half>, <24 x half>* %src, align 4
@@ -1256,89 +1233,89 @@ entry:
 define void @vld3_v16f16(<48 x half> *%src, <16 x half> *%dst) {
 ; CHECK-LABEL: vld3_v16f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
-; CHECK-NEXT:    vmovx.f16 s4, s14
-; CHECK-NEXT:    vmov.f32 s0, s13
-; CHECK-NEXT:    vins.f16 s0, s4
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
-; CHECK-NEXT:    vmovx.f16 s20, s19
-; CHECK-NEXT:    vmovx.f16 s24, s13
-; CHECK-NEXT:    vmovx.f16 s8, s5
-; CHECK-NEXT:    vmov.f32 s1, s4
-; CHECK-NEXT:    vins.f16 s1, s8
-; CHECK-NEXT:    vmov.f32 s11, s18
-; CHECK-NEXT:    vins.f16 s11, s20
-; CHECK-NEXT:    vmov.f32 s2, s7
-; CHECK-NEXT:    vmovx.f16 s20, s16
-; CHECK-NEXT:    vmov.f32 s10, s16
-; CHECK-NEXT:    vins.f16 s2, s20
-; CHECK-NEXT:    vmov.f64 d10, d6
-; CHECK-NEXT:    vins.f16 s20, s24
-; CHECK-NEXT:    vmovx.f16 s24, s4
-; CHECK-NEXT:    vmov.f32 s21, s15
-; CHECK-NEXT:    vins.f16 s21, s24
-; CHECK-NEXT:    vmovx.f16 s24, s7
-; CHECK-NEXT:    vmov.f32 s22, s6
-; CHECK-NEXT:    vins.f16 s22, s24
-; CHECK-NEXT:    vmovx.f16 s24, s12
-; CHECK-NEXT:    vins.f16 s24, s14
-; CHECK-NEXT:    vmov.f32 s3, s11
-; CHECK-NEXT:    vmovx.f16 s25, s15
-; CHECK-NEXT:    vmovx.f16 s12, s18
-; CHECK-NEXT:    vins.f16 s25, s5
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT:    vmovx.f16 s27, s17
-; CHECK-NEXT:    vins.f16 s17, s12
-; CHECK-NEXT:    vins.f16 s27, s19
-; CHECK-NEXT:    vmov.f32 s23, s17
-; CHECK-NEXT:    vmovx.f16 s26, s6
-; CHECK-NEXT:    vins.f16 s26, s16
-; CHECK-NEXT:    vldrw.u32 q4, [r0]
-; CHECK-NEXT:    vadd.f16 q1, q5, q6
-; CHECK-NEXT:    vadd.f16 q0, q1, q0
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
-; CHECK-NEXT:    vmovx.f16 s12, s18
-; CHECK-NEXT:    vmov.f32 s0, s17
-; CHECK-NEXT:    vmovx.f16 s20, s7
-; CHECK-NEXT:    vins.f16 s0, s12
-; CHECK-NEXT:    vmovx.f16 s12, s9
-; CHECK-NEXT:    vmov.f32 s1, s8
-; CHECK-NEXT:    vmovx.f16 s24, s17
-; CHECK-NEXT:    vins.f16 s1, s12
-; CHECK-NEXT:    vmov.f32 s15, s6
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #64]
+; CHECK-NEXT:    vmovx.f16 s8, s2
+; CHECK-NEXT:    vmov.f32 s4, s1
+; CHECK-NEXT:    vins.f16 s4, s8
+; CHECK-NEXT:    vmovx.f16 s8, s17
+; CHECK-NEXT:    vmov.f32 s5, s16
+; CHECK-NEXT:    vmovx.f16 s24, s1
+; CHECK-NEXT:    vins.f16 s5, s8
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT:    vmov.f32 s6, s19
+; CHECK-NEXT:    vmovx.f16 s26, s16
+; CHECK-NEXT:    vmovx.f16 s20, s11
+; CHECK-NEXT:    vmov.f32 s15, s10
 ; CHECK-NEXT:    vins.f16 s15, s20
-; CHECK-NEXT:    vmov.f32 s2, s11
-; CHECK-NEXT:    vmovx.f16 s20, s4
-; CHECK-NEXT:    vmov.f32 s14, s4
-; CHECK-NEXT:    vins.f16 s2, s20
-; CHECK-NEXT:    vmov.f64 d10, d8
-; CHECK-NEXT:    vins.f16 s20, s24
-; CHECK-NEXT:    vmovx.f16 s24, s8
-; CHECK-NEXT:    vmov.f32 s21, s19
-; CHECK-NEXT:    vins.f16 s21, s24
-; CHECK-NEXT:    vmovx.f16 s24, s11
-; CHECK-NEXT:    vmov.f32 s22, s10
-; CHECK-NEXT:    vins.f16 s22, s24
-; CHECK-NEXT:    vmovx.f16 s24, s16
-; CHECK-NEXT:    vins.f16 s24, s18
-; CHECK-NEXT:    vmov.f32 s3, s15
-; CHECK-NEXT:    vmovx.f16 s25, s19
-; CHECK-NEXT:    vmovx.f16 s16, s6
-; CHECK-NEXT:    vins.f16 s25, s9
-; CHECK-NEXT:    vmovx.f16 s27, s5
+; CHECK-NEXT:    vmovx.f16 s20, s8
+; CHECK-NEXT:    vins.f16 s6, s20
+; CHECK-NEXT:    vmovx.f16 s20, s19
+; CHECK-NEXT:    vmov.f32 s28, s18
+; CHECK-NEXT:    vmovx.f16 s30, s10
+; CHECK-NEXT:    vins.f16 s28, s20
+; CHECK-NEXT:    vmovx.f16 s20, s0
+; CHECK-NEXT:    vins.f16 s0, s24
+; CHECK-NEXT:    vins.f16 s20, s2
+; CHECK-NEXT:    vmovx.f16 s21, s3
+; CHECK-NEXT:    vins.f16 s3, s26
+; CHECK-NEXT:    vins.f16 s21, s17
+; CHECK-NEXT:    vmov.f32 s14, s8
+; CHECK-NEXT:    vmovx.f16 s23, s9
+; CHECK-NEXT:    vmov.f32 s1, s3
+; CHECK-NEXT:    vins.f16 s9, s30
+; CHECK-NEXT:    vins.f16 s23, s11
+; CHECK-NEXT:    vmovx.f16 s22, s18
+; CHECK-NEXT:    vmov.f32 s2, s28
+; CHECK-NEXT:    vins.f16 s22, s8
+; CHECK-NEXT:    vmov.f32 s3, s9
+; CHECK-NEXT:    vmov.f32 s7, s15
+; CHECK-NEXT:    vadd.f16 q0, q0, q5
+; CHECK-NEXT:    vadd.f16 q1, q0, q1
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
+; CHECK-NEXT:    vmovx.f16 s16, s2
+; CHECK-NEXT:    vmov.f32 s4, s1
+; CHECK-NEXT:    vmovx.f16 s20, s11
+; CHECK-NEXT:    vins.f16 s4, s16
+; CHECK-NEXT:    vmovx.f16 s16, s13
+; CHECK-NEXT:    vmov.f32 s5, s12
+; CHECK-NEXT:    vmovx.f16 s24, s1
 ; CHECK-NEXT:    vins.f16 s5, s16
-; CHECK-NEXT:    vins.f16 s27, s7
-; CHECK-NEXT:    vmov.f32 s23, s5
-; CHECK-NEXT:    vmovx.f16 s26, s10
-; CHECK-NEXT:    vins.f16 s26, s4
-; CHECK-NEXT:    vadd.f16 q1, q5, q6
-; CHECK-NEXT:    vadd.f16 q0, q1, q0
+; CHECK-NEXT:    vmov.f32 s19, s10
+; CHECK-NEXT:    vins.f16 s19, s20
+; CHECK-NEXT:    vmov.f32 s6, s15
+; CHECK-NEXT:    vmovx.f16 s20, s8
+; CHECK-NEXT:    vmov.f32 s28, s14
+; CHECK-NEXT:    vins.f16 s6, s20
+; CHECK-NEXT:    vmovx.f16 s20, s15
+; CHECK-NEXT:    vins.f16 s28, s20
+; CHECK-NEXT:    vmovx.f16 s20, s0
+; CHECK-NEXT:    vins.f16 s0, s24
+; CHECK-NEXT:    vins.f16 s20, s2
+; CHECK-NEXT:    vmovx.f16 s21, s3
+; CHECK-NEXT:    vmovx.f16 s26, s12
+; CHECK-NEXT:    vins.f16 s21, s13
+; CHECK-NEXT:    vins.f16 s3, s26
+; CHECK-NEXT:    vmovx.f16 s30, s10
+; CHECK-NEXT:    vmovx.f16 s23, s9
+; CHECK-NEXT:    vmov.f32 s18, s8
+; CHECK-NEXT:    vins.f16 s9, s30
+; CHECK-NEXT:    vins.f16 s23, s11
+; CHECK-NEXT:    vmov.f32 s1, s3
+; CHECK-NEXT:    vmovx.f16 s22, s14
+; CHECK-NEXT:    vmov.f32 s2, s28
+; CHECK-NEXT:    vins.f16 s22, s8
+; CHECK-NEXT:    vmov.f32 s3, s9
+; CHECK-NEXT:    vmov.f32 s7, s19
+; CHECK-NEXT:    vadd.f16 q0, q0, q5
+; CHECK-NEXT:    vadd.f16 q0, q0, q1
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <48 x half>, <48 x half>* %src, align 4

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
index 66f942ad981a..aa5d933562bc 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
@@ -388,69 +388,58 @@ entry:
 define void @vld4_v8i16_align1(<32 x i16> *%src, <8 x i16> *%dst) {
 ; CHECK-LABEL: vld4_v8i16_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vldrb.u8 q3, [r0]
-; CHECK-NEXT:    vldrb.u8 q0, [r0, #32]
-; CHECK-NEXT:    vldrb.u8 q1, [r0, #48]
-; CHECK-NEXT:    vldrb.u8 q2, [r0, #16]
-; CHECK-NEXT:    vmov.u16 r0, q3[3]
-; CHECK-NEXT:    vmov.f32 s18, s1
-; CHECK-NEXT:    vmov.16 q5[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[7]
-; CHECK-NEXT:    vmov.16 q5[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[3]
-; CHECK-NEXT:    vmov.16 q5[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vins.f16 s18, s3
-; CHECK-NEXT:    vmov.16 q6[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.f32 s19, s5
-; CHECK-NEXT:    vmov.16 q6[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vins.f16 s19, s7
-; CHECK-NEXT:    vmov.16 q6[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.f32 s16, s13
-; CHECK-NEXT:    vmov.16 q6[7], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[7]
-; CHECK-NEXT:    vmov.16 q5[3], r0
-; CHECK-NEXT:    vins.f16 s16, s15
-; CHECK-NEXT:    vmov.f32 s17, s9
-; CHECK-NEXT:    vmov.u16 r0, q3[1]
-; CHECK-NEXT:    vmov.f32 s22, s26
-; CHECK-NEXT:    vins.f16 s17, s11
-; CHECK-NEXT:    vmov.f32 s23, s27
-; CHECK-NEXT:    vmov.16 q6[0], r0
-; CHECK-NEXT:    vadd.i16 q4, q4, q5
-; CHECK-NEXT:    vmov.f64 d11, d0
-; CHECK-NEXT:    vmov.u16 r0, q3[5]
-; CHECK-NEXT:    vmov.16 q6[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[1]
-; CHECK-NEXT:    vmov.16 q6[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vins.f16 s22, s2
-; CHECK-NEXT:    vmov.f32 s23, s4
-; CHECK-NEXT:    vins.f16 s23, s6
-; CHECK-NEXT:    vmov.f32 s20, s12
-; CHECK-NEXT:    vins.f16 s20, s14
-; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.f32 s21, s8
-; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[5]
-; CHECK-NEXT:    vmov.16 q6[3], r0
-; CHECK-NEXT:    vins.f16 s21, s10
-; CHECK-NEXT:    vmov.f32 s26, s14
-; CHECK-NEXT:    vmov.f32 s27, s15
-; CHECK-NEXT:    vadd.i16 q0, q5, q6
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12}
+; CHECK-NEXT:    vldrb.u8 q1, [r0, #32]
+; CHECK-NEXT:    vldrb.u8 q2, [r0, #48]
+; CHECK-NEXT:    vmovx.f16 s0, s7
+; CHECK-NEXT:    vmovx.f16 s18, s5
+; CHECK-NEXT:    vins.f16 s18, s0
+; CHECK-NEXT:    vmovx.f16 s0, s11
+; CHECK-NEXT:    vmovx.f16 s19, s9
+; CHECK-NEXT:    vins.f16 s5, s7
+; CHECK-NEXT:    vins.f16 s19, s0
+; CHECK-NEXT:    vldrb.u8 q0, [r0]
+; CHECK-NEXT:    vins.f16 s9, s11
+; CHECK-NEXT:    vmovx.f16 s24, s6
+; CHECK-NEXT:    vmovx.f16 s12, s3
+; CHECK-NEXT:    vmovx.f16 s16, s1
+; CHECK-NEXT:    vins.f16 s16, s12
+; CHECK-NEXT:    vldrb.u8 q3, [r0, #16]
+; CHECK-NEXT:    vins.f16 s1, s3
+; CHECK-NEXT:    vmovx.f16 s20, s15
+; CHECK-NEXT:    vmovx.f16 s17, s13
+; CHECK-NEXT:    vins.f16 s17, s20
+; CHECK-NEXT:    vmov.f32 s22, s5
+; CHECK-NEXT:    vmov.f32 s23, s9
+; CHECK-NEXT:    vins.f16 s13, s15
+; CHECK-NEXT:    vmov.f32 s20, s1
+; CHECK-NEXT:    vmov.f32 s21, s13
+; CHECK-NEXT:    vadd.i16 q4, q5, q4
+; CHECK-NEXT:    vmovx.f16 s22, s4
+; CHECK-NEXT:    vins.f16 s22, s24
+; CHECK-NEXT:    vins.f16 s4, s6
+; CHECK-NEXT:    vmovx.f16 s24, s10
+; CHECK-NEXT:    vmovx.f16 s23, s8
+; CHECK-NEXT:    vins.f16 s8, s10
+; CHECK-NEXT:    vmov.f32 s6, s4
+; CHECK-NEXT:    vmov.f32 s7, s8
+; CHECK-NEXT:    vins.f16 s23, s24
+; CHECK-NEXT:    vmovx.f16 s24, s2
+; CHECK-NEXT:    vmovx.f16 s20, s0
+; CHECK-NEXT:    vins.f16 s20, s24
+; CHECK-NEXT:    vmovx.f16 s24, s14
+; CHECK-NEXT:    vmovx.f16 s21, s12
+; CHECK-NEXT:    vins.f16 s0, s2
+; CHECK-NEXT:    vins.f16 s12, s14
+; CHECK-NEXT:    vins.f16 s21, s24
+; CHECK-NEXT:    vmov.f32 s1, s12
+; CHECK-NEXT:    vmov.f32 s2, s6
+; CHECK-NEXT:    vmov.f32 s3, s7
+; CHECK-NEXT:    vadd.i16 q0, q0, q5
 ; CHECK-NEXT:    vadd.i16 q0, q0, q4
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <32 x i16>, <32 x i16>* %src, align 1
@@ -1082,13 +1071,13 @@ define void @vld4_v2f16(<8 x half> *%src, <2 x half> *%dst) {
 ; CHECK-NEXT:    vmovx.f16 s8, s1
 ; CHECK-NEXT:    vins.f16 s1, s3
 ; CHECK-NEXT:    vins.f16 s8, s4
-; CHECK-NEXT:    vmov.f32 s4, s1
+; CHECK-NEXT:    vmovx.f16 s4, s2
 ; CHECK-NEXT:    vmovx.f16 s12, s0
-; CHECK-NEXT:    vadd.f16 q1, q1, q2
-; CHECK-NEXT:    vmovx.f16 s8, s2
-; CHECK-NEXT:    vins.f16 s12, s8
+; CHECK-NEXT:    vins.f16 s12, s4
 ; CHECK-NEXT:    vins.f16 s0, s2
+; CHECK-NEXT:    vmov.f32 s4, s1
 ; CHECK-NEXT:    vadd.f16 q0, q0, q3
+; CHECK-NEXT:    vadd.f16 q1, q1, q2
 ; CHECK-NEXT:    vadd.f16 q0, q0, q1
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    str r0, [r1]
@@ -1109,36 +1098,36 @@ entry:
 define void @vld4_v4f16(<16 x half> *%src, <4 x half> *%dst) {
 ; CHECK-LABEL: vld4_v4f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8}
-; CHECK-NEXT:    vpush {d8}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vldrh.u16 q0, [r0]
-; CHECK-NEXT:    vmovx.f16 s8, s1
-; CHECK-NEXT:    vins.f16 s1, s3
 ; CHECK-NEXT:    vmovx.f16 s4, s3
-; CHECK-NEXT:    vmov.f32 s12, s1
+; CHECK-NEXT:    vmovx.f16 s8, s1
 ; CHECK-NEXT:    vins.f16 s8, s4
 ; CHECK-NEXT:    vldrh.u16 q1, [r0, #16]
-; CHECK-NEXT:    vmovx.f16 s16, s7
-; CHECK-NEXT:    vmovx.f16 s9, s5
-; CHECK-NEXT:    vins.f16 s5, s7
-; CHECK-NEXT:    vins.f16 s9, s16
-; CHECK-NEXT:    vmov.f32 s13, s5
+; CHECK-NEXT:    vins.f16 s1, s3
 ; CHECK-NEXT:    vmovx.f16 s16, s2
-; CHECK-NEXT:    vadd.f16 q2, q3, q2
+; CHECK-NEXT:    vmovx.f16 s12, s7
+; CHECK-NEXT:    vmovx.f16 s9, s5
+; CHECK-NEXT:    vins.f16 s9, s12
 ; CHECK-NEXT:    vmovx.f16 s12, s0
 ; CHECK-NEXT:    vins.f16 s12, s16
-; CHECK-NEXT:    vins.f16 s0, s2
+; CHECK-NEXT:    vins.f16 s5, s7
 ; CHECK-NEXT:    vmovx.f16 s16, s6
 ; CHECK-NEXT:    vmovx.f16 s13, s4
-; CHECK-NEXT:    vins.f16 s4, s6
 ; CHECK-NEXT:    vins.f16 s13, s16
+; CHECK-NEXT:    vins.f16 s0, s2
+; CHECK-NEXT:    vins.f16 s4, s6
+; CHECK-NEXT:    vmov.f32 s16, s1
 ; CHECK-NEXT:    vmov.f32 s1, s4
+; CHECK-NEXT:    vmov.f32 s17, s5
 ; CHECK-NEXT:    vadd.f16 q0, q0, q3
+; CHECK-NEXT:    vadd.f16 q2, q4, q2
 ; CHECK-NEXT:    vadd.f16 q0, q0, q2
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    strd r0, r2, [r1]
-; CHECK-NEXT:    vpop {d8}
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <16 x half>, <16 x half>* %src, align 2
@@ -1233,58 +1222,56 @@ entry:
 define void @vld4_v8f16_align1(<32 x half> *%src, <8 x half> *%dst) {
 ; CHECK-LABEL: vld4_v8f16_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14}
-; CHECK-NEXT:    vldrb.u8 q2, [r0, #32]
-; CHECK-NEXT:    vldrb.u8 q3, [r0, #48]
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vldrb.u8 q0, [r0, #32]
+; CHECK-NEXT:    vldrb.u8 q2, [r0, #48]
+; CHECK-NEXT:    vmovx.f16 s4, s3
+; CHECK-NEXT:    vmovx.f16 s18, s1
+; CHECK-NEXT:    vins.f16 s18, s4
 ; CHECK-NEXT:    vmovx.f16 s4, s11
-; CHECK-NEXT:    vmovx.f16 s2, s9
-; CHECK-NEXT:    vins.f16 s2, s4
-; CHECK-NEXT:    vmovx.f16 s4, s15
-; CHECK-NEXT:    vmovx.f16 s3, s13
-; CHECK-NEXT:    vins.f16 s9, s11
-; CHECK-NEXT:    vins.f16 s3, s4
+; CHECK-NEXT:    vmovx.f16 s19, s9
+; CHECK-NEXT:    vins.f16 s1, s3
+; CHECK-NEXT:    vins.f16 s19, s4
 ; CHECK-NEXT:    vldrb.u8 q1, [r0]
-; CHECK-NEXT:    vmovx.f16 s28, s10
-; CHECK-NEXT:    vmovx.f16 s26, s8
-; CHECK-NEXT:    vmovx.f16 s16, s7
-; CHECK-NEXT:    vmovx.f16 s0, s5
-; CHECK-NEXT:    vins.f16 s0, s16
-; CHECK-NEXT:    vldrb.u8 q4, [r0, #16]
+; CHECK-NEXT:    vmovx.f16 s24, s2
+; CHECK-NEXT:    vins.f16 s9, s11
+; CHECK-NEXT:    vmovx.f16 s12, s7
+; CHECK-NEXT:    vmovx.f16 s16, s5
+; CHECK-NEXT:    vins.f16 s16, s12
+; CHECK-NEXT:    vldrb.u8 q3, [r0, #16]
+; CHECK-NEXT:    vins.f16 s5, s7
+; CHECK-NEXT:    vmovx.f16 s20, s15
+; CHECK-NEXT:    vmovx.f16 s17, s13
+; CHECK-NEXT:    vins.f16 s17, s20
+; CHECK-NEXT:    vmovx.f16 s22, s0
+; CHECK-NEXT:    vins.f16 s22, s24
+; CHECK-NEXT:    vmovx.f16 s24, s10
+; CHECK-NEXT:    vmovx.f16 s23, s8
 ; CHECK-NEXT:    vins.f16 s13, s15
-; CHECK-NEXT:    vins.f16 s26, s28
-; CHECK-NEXT:    vmovx.f16 s20, s19
-; CHECK-NEXT:    vmovx.f16 s1, s17
-; CHECK-NEXT:    vins.f16 s1, s20
-; CHECK-NEXT:    vmov.f32 s22, s9
+; CHECK-NEXT:    vins.f16 s23, s24
+; CHECK-NEXT:    vmovx.f16 s24, s6
+; CHECK-NEXT:    vmovx.f16 s20, s4
+; CHECK-NEXT:    vins.f16 s0, s2
+; CHECK-NEXT:    vins.f16 s20, s24
+; CHECK-NEXT:    vmovx.f16 s24, s14
+; CHECK-NEXT:    vmovx.f16 s21, s12
 ; CHECK-NEXT:    vins.f16 s8, s10
-; CHECK-NEXT:    vmov.f32 s23, s13
-; CHECK-NEXT:    vmovx.f16 s28, s14
-; CHECK-NEXT:    vmovx.f16 s27, s12
-; CHECK-NEXT:    vmov.f32 s10, s8
-; CHECK-NEXT:    vins.f16 s12, s14
-; CHECK-NEXT:    vmov.f32 s11, s12
-; CHECK-NEXT:    vins.f16 s27, s28
-; CHECK-NEXT:    vins.f16 s5, s7
-; CHECK-NEXT:    vmovx.f16 s28, s6
-; CHECK-NEXT:    vmovx.f16 s24, s4
-; CHECK-NEXT:    vmov.f32 s20, s5
-; CHECK-NEXT:    vins.f16 s17, s19
-; CHECK-NEXT:    vins.f16 s24, s28
-; CHECK-NEXT:    vmov.f32 s21, s17
-; CHECK-NEXT:    vmovx.f16 s28, s18
-; CHECK-NEXT:    vmovx.f16 s25, s16
+; CHECK-NEXT:    vins.f16 s21, s24
+; CHECK-NEXT:    vmov.f32 s26, s1
 ; CHECK-NEXT:    vins.f16 s4, s6
-; CHECK-NEXT:    vins.f16 s16, s18
-; CHECK-NEXT:    vins.f16 s25, s28
-; CHECK-NEXT:    vmov.f32 s5, s16
-; CHECK-NEXT:    vadd.f16 q0, q5, q0
-; CHECK-NEXT:    vmov.f32 s6, s10
-; CHECK-NEXT:    vmov.f32 s7, s11
-; CHECK-NEXT:    vadd.f16 q1, q1, q6
-; CHECK-NEXT:    vadd.f16 q0, q1, q0
+; CHECK-NEXT:    vmov.f32 s27, s9
+; CHECK-NEXT:    vmov.f32 s24, s5
+; CHECK-NEXT:    vins.f16 s12, s14
+; CHECK-NEXT:    vmov.f32 s6, s0
+; CHECK-NEXT:    vmov.f32 s7, s8
+; CHECK-NEXT:    vmov.f32 s25, s13
+; CHECK-NEXT:    vmov.f32 s5, s12
+; CHECK-NEXT:    vadd.f16 q4, q6, q4
+; CHECK-NEXT:    vadd.f16 q0, q1, q5
+; CHECK-NEXT:    vadd.f16 q0, q0, q4
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    bx lr
 entry:
   %l1 = load <32 x half>, <32 x half>* %src, align 1

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll
index 350b3b01472a..08449567adaa 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll
@@ -23,104 +23,104 @@ define void @vldst4(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %n
 ; CHECK-NEXT:    add.w lr, r3, r12, lsr #3
 ; CHECK-NEXT:  .LBB0_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrh.u16 q3, [r0, #32]
+; CHECK-NEXT:    vldrh.u16 q4, [r0, #32]
 ; CHECK-NEXT:    vldrh.u16 q5, [r0, #48]
-; CHECK-NEXT:    vldrh.u16 q4, [r0], #64
-; CHECK-NEXT:    vmovx.f16 s4, s15
-; CHECK-NEXT:    vmovx.f16 s2, s13
-; CHECK-NEXT:    vins.f16 s2, s4
-; CHECK-NEXT:    vmovx.f16 s4, s23
-; CHECK-NEXT:    vmovx.f16 s3, s21
-; CHECK-NEXT:    vldrh.u16 q7, [r0, #-48]
-; CHECK-NEXT:    vins.f16 s3, s4
-; CHECK-NEXT:    vmovx.f16 s4, s19
-; CHECK-NEXT:    vmovx.f16 s0, s17
-; CHECK-NEXT:    vins.f16 s13, s15
-; CHECK-NEXT:    vins.f16 s0, s4
-; CHECK-NEXT:    vmovx.f16 s4, s31
-; CHECK-NEXT:    vmovx.f16 s1, s29
-; CHECK-NEXT:    vins.f16 s21, s23
-; CHECK-NEXT:    vins.f16 s1, s4
-; CHECK-NEXT:    vins.f16 s17, s19
-; CHECK-NEXT:    vmul.f16 q1, q0, r2
-; CHECK-NEXT:    vmov.f32 s2, s13
+; CHECK-NEXT:    vldrh.u16 q3, [r0], #64
+; CHECK-NEXT:    vmov.f32 s2, s17
+; CHECK-NEXT:    vmovx.f16 s8, s19
+; CHECK-NEXT:    vldrh.u16 q6, [r0, #-48]
+; CHECK-NEXT:    vins.f16 s2, s19
 ; CHECK-NEXT:    vmov.f32 s3, s21
-; CHECK-NEXT:    vins.f16 s29, s31
-; CHECK-NEXT:    vmov.f32 s0, s17
-; CHECK-NEXT:    vmov.f32 s1, s29
-; CHECK-NEXT:    vmul.f16 q2, q0, r2
-; CHECK-NEXT:    vmovx.f16 s0, s4
-; CHECK-NEXT:    vmovx.f16 s27, s8
-; CHECK-NEXT:    vins.f16 s8, s4
-; CHECK-NEXT:    vmov.f32 s25, s8
-; CHECK-NEXT:    vmovx.f16 s4, s7
-; CHECK-NEXT:    vins.f16 s27, s0
-; CHECK-NEXT:    vmovx.f16 s2, s12
-; CHECK-NEXT:    vstrw.32 q6, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vmovx.f16 s24, s14
-; CHECK-NEXT:    vins.f16 s2, s24
-; CHECK-NEXT:    vmovx.f16 s24, s22
-; CHECK-NEXT:    vmovx.f16 s3, s20
-; CHECK-NEXT:    vins.f16 s12, s14
-; CHECK-NEXT:    vins.f16 s3, s24
-; CHECK-NEXT:    vmovx.f16 s24, s18
-; CHECK-NEXT:    vmovx.f16 s0, s16
-; CHECK-NEXT:    vins.f16 s20, s22
-; CHECK-NEXT:    vins.f16 s0, s24
-; CHECK-NEXT:    vmovx.f16 s24, s30
-; CHECK-NEXT:    vmovx.f16 s1, s28
+; CHECK-NEXT:    vmovx.f16 s5, s25
+; CHECK-NEXT:    vins.f16 s3, s23
+; CHECK-NEXT:    vmovx.f16 s6, s17
+; CHECK-NEXT:    vmov.f32 s0, s13
+; CHECK-NEXT:    vins.f16 s6, s8
+; CHECK-NEXT:    vmovx.f16 s8, s23
+; CHECK-NEXT:    vmovx.f16 s7, s21
+; CHECK-NEXT:    vins.f16 s0, s15
+; CHECK-NEXT:    vins.f16 s7, s8
+; CHECK-NEXT:    vmovx.f16 s8, s15
+; CHECK-NEXT:    vmovx.f16 s4, s13
+; CHECK-NEXT:    vins.f16 s25, s27
+; CHECK-NEXT:    vins.f16 s4, s8
+; CHECK-NEXT:    vmovx.f16 s8, s27
+; CHECK-NEXT:    vins.f16 s5, s8
+; CHECK-NEXT:    vmov.f32 s1, s25
+; CHECK-NEXT:    vmul.f16 q2, q1, r2
+; CHECK-NEXT:    vmul.f16 q0, q0, r2
+; CHECK-NEXT:    vmovx.f16 s7, s0
+; CHECK-NEXT:    vmovx.f16 s28, s8
+; CHECK-NEXT:    vins.f16 s7, s28
+; CHECK-NEXT:    vmovx.f16 s30, s16
+; CHECK-NEXT:    vmovx.f16 s31, s20
+; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmovx.f16 s28, s12
 ; CHECK-NEXT:    vins.f16 s16, s18
-; CHECK-NEXT:    vins.f16 s1, s24
-; CHECK-NEXT:    vins.f16 s28, s30
-; CHECK-NEXT:    vmul.f16 q6, q0, r2
-; CHECK-NEXT:    vmov.f32 s2, s12
-; CHECK-NEXT:    vmov.f32 s3, s20
-; CHECK-NEXT:    vmov.f32 s17, s28
-; CHECK-NEXT:    vmov.f32 s18, s2
-; CHECK-NEXT:    vmov.f32 s19, s3
-; CHECK-NEXT:    vmovx.f16 s2, s24
-; CHECK-NEXT:    vmul.f16 q5, q4, r2
-; CHECK-NEXT:    vmovx.f16 s0, s20
-; CHECK-NEXT:    vins.f16 s20, s24
-; CHECK-NEXT:    vins.f16 s0, s2
-; CHECK-NEXT:    vmov q3, q5
-; CHECK-NEXT:    vmov.f32 s14, s0
-; CHECK-NEXT:    vmovx.f16 s0, s9
-; CHECK-NEXT:    vins.f16 s9, s5
-; CHECK-NEXT:    vmovx.f16 s2, s5
-; CHECK-NEXT:    vins.f16 s0, s2
-; CHECK-NEXT:    vmov q4, q2
-; CHECK-NEXT:    vmov.f32 s19, s0
-; CHECK-NEXT:    vmovx.f16 s31, s10
-; CHECK-NEXT:    vmovx.f16 s18, s21
-; CHECK-NEXT:    vins.f16 s21, s25
-; CHECK-NEXT:    vins.f16 s10, s6
-; CHECK-NEXT:    vmov.f32 s16, s21
-; CHECK-NEXT:    vmovx.f16 s0, s25
-; CHECK-NEXT:    vmov.f32 s29, s10
-; CHECK-NEXT:    vins.f16 s18, s0
-; CHECK-NEXT:    vmovx.f16 s0, s6
-; CHECK-NEXT:    vins.f16 s31, s0
-; CHECK-NEXT:    vmovx.f16 s0, s26
-; CHECK-NEXT:    vmovx.f16 s30, s22
-; CHECK-NEXT:    vins.f16 s22, s26
-; CHECK-NEXT:    vmov.f32 s28, s22
-; CHECK-NEXT:    vins.f16 s30, s0
-; CHECK-NEXT:    vmovx.f16 s3, s11
-; CHECK-NEXT:    vins.f16 s11, s7
-; CHECK-NEXT:    vstrh.16 q7, [r1, #32]
-; CHECK-NEXT:    vmov.f32 s1, s11
-; CHECK-NEXT:    vins.f16 s3, s4
-; CHECK-NEXT:    vmovx.f16 s4, s27
-; CHECK-NEXT:    vmovx.f16 s2, s23
-; CHECK-NEXT:    vins.f16 s23, s27
-; CHECK-NEXT:    vmov.f32 s0, s23
-; CHECK-NEXT:    vins.f16 s2, s4
-; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vstrh.16 q0, [r1, #48]
-; CHECK-NEXT:    vmov.f32 s13, s5
-; CHECK-NEXT:    vmov.f32 s15, s7
-; CHECK-NEXT:    vstrh.16 q3, [r1], #64
+; CHECK-NEXT:    vmovx.f16 s29, s24
+; CHECK-NEXT:    vmovx.f16 s4, s18
+; CHECK-NEXT:    vins.f16 s20, s22
+; CHECK-NEXT:    vins.f16 s30, s4
+; CHECK-NEXT:    vmovx.f16 s4, s22
+; CHECK-NEXT:    vins.f16 s12, s14
+; CHECK-NEXT:    vins.f16 s31, s4
+; CHECK-NEXT:    vmovx.f16 s4, s14
+; CHECK-NEXT:    vmov.f32 s14, s16
+; CHECK-NEXT:    vins.f16 s24, s26
+; CHECK-NEXT:    vmov.f32 s15, s20
+; CHECK-NEXT:    vins.f16 s28, s4
+; CHECK-NEXT:    vmovx.f16 s4, s26
+; CHECK-NEXT:    vmov.f32 s13, s24
+; CHECK-NEXT:    vins.f16 s29, s4
+; CHECK-NEXT:    vmul.f16 q3, q3, r2
+; CHECK-NEXT:    vmul.f16 q7, q7, r2
+; CHECK-NEXT:    vmovx.f16 s4, s12
+; CHECK-NEXT:    vmovx.f16 s6, s28
+; CHECK-NEXT:    vins.f16 s0, s8
+; CHECK-NEXT:    vins.f16 s4, s6
+; CHECK-NEXT:    vmovx.f16 s6, s1
+; CHECK-NEXT:    vmovx.f16 s5, s9
+; CHECK-NEXT:    vins.f16 s12, s28
+; CHECK-NEXT:    vins.f16 s6, s5
+; CHECK-NEXT:    vmovx.f16 s18, s13
+; CHECK-NEXT:    vmovx.f16 s5, s29
+; CHECK-NEXT:    vins.f16 s1, s9
+; CHECK-NEXT:    vins.f16 s18, s5
+; CHECK-NEXT:    vmovx.f16 s23, s2
+; CHECK-NEXT:    vmovx.f16 s5, s10
+; CHECK-NEXT:    vins.f16 s2, s10
+; CHECK-NEXT:    vins.f16 s23, s5
+; CHECK-NEXT:    vins.f16 s13, s29
+; CHECK-NEXT:    vmovx.f16 s27, s3
+; CHECK-NEXT:    vmovx.f16 s8, s11
+; CHECK-NEXT:    vmovx.f16 s22, s14
+; CHECK-NEXT:    vins.f16 s27, s8
+; CHECK-NEXT:    vins.f16 s14, s30
+; CHECK-NEXT:    vmovx.f16 s26, s15
+; CHECK-NEXT:    vins.f16 s15, s31
+; CHECK-NEXT:    vmovx.f16 s8, s31
+; CHECK-NEXT:    vins.f16 s3, s11
+; CHECK-NEXT:    vins.f16 s26, s8
+; CHECK-NEXT:    vmov q2, q3
+; CHECK-NEXT:    vmovx.f16 s5, s30
+; CHECK-NEXT:    vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s10, s4
+; CHECK-NEXT:    vmov.f32 s29, s0
+; CHECK-NEXT:    vins.f16 s22, s5
+; CHECK-NEXT:    vmov.f32 s9, s0
+; CHECK-NEXT:    vmov.f32 s11, s31
+; CHECK-NEXT:    vmov q7, q0
+; CHECK-NEXT:    vmov.f32 s31, s6
+; CHECK-NEXT:    vmov.f32 s16, s13
+; CHECK-NEXT:    vmov.f32 s21, s2
+; CHECK-NEXT:    vmov.f32 s25, s3
+; CHECK-NEXT:    vmov.f32 s17, s29
+; CHECK-NEXT:    vmov.f32 s20, s14
+; CHECK-NEXT:    vmov.f32 s24, s15
+; CHECK-NEXT:    vstrh.16 q5, [r1, #32]
+; CHECK-NEXT:    vstrh.16 q6, [r1, #48]
+; CHECK-NEXT:    vstrh.16 q2, [r1], #64
+; CHECK-NEXT:    vmov.f32 s19, s31
 ; CHECK-NEXT:    vstrh.16 q4, [r1, #-48]
 ; CHECK-NEXT:    le lr, .LBB0_2
 ; CHECK-NEXT:  .LBB0_3: @ %while.end

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst2.ll b/llvm/test/CodeGen/Thumb2/mve-vst2.ll
index 53cf18ce04f1..bd9b8a0bfa15 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst2.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst2.ll
@@ -207,34 +207,34 @@ entry:
 define void @vst2_v8i16_align1(<8 x i16> *%src, <16 x i16> *%dst) {
 ; CHECK-LABEL: vst2_v8i16_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT:    vmov.f64 d4, d3
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vins.f16 s8, s2
-; CHECK-NEXT:    vmov.f64 d6, d2
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.f32 s10, s7
-; CHECK-NEXT:    vins.f16 s12, s0
-; CHECK-NEXT:    vins.f16 s10, s3
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.f32 s14, s5
-; CHECK-NEXT:    vstrb.8 q2, [r1, #16]
-; CHECK-NEXT:    vins.f16 s14, s1
-; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vstrb.8 q3, [r1]
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vmovx.f16 s1, s10
+; CHECK-NEXT:    vmovx.f16 s12, s6
+; CHECK-NEXT:    vins.f16 s1, s12
+; CHECK-NEXT:    vins.f16 s10, s6
+; CHECK-NEXT:    vmov.f32 s0, s10
+; CHECK-NEXT:    vmovx.f16 s12, s7
+; CHECK-NEXT:    vmovx.f16 s3, s11
+; CHECK-NEXT:    vins.f16 s11, s7
+; CHECK-NEXT:    vmov.f32 s2, s11
+; CHECK-NEXT:    vmovx.f16 s14, s4
+; CHECK-NEXT:    vins.f16 s3, s12
+; CHECK-NEXT:    vmovx.f16 s12, s8
+; CHECK-NEXT:    vins.f16 s8, s4
+; CHECK-NEXT:    vins.f16 s12, s14
+; CHECK-NEXT:    vmov q4, q2
+; CHECK-NEXT:    vmovx.f16 s4, s5
+; CHECK-NEXT:    vmov.f32 s17, s12
+; CHECK-NEXT:    vstrb.8 q0, [r1, #16]
+; CHECK-NEXT:    vmovx.f16 s19, s9
+; CHECK-NEXT:    vins.f16 s9, s5
+; CHECK-NEXT:    vmov.f32 s18, s9
+; CHECK-NEXT:    vins.f16 s19, s4
+; CHECK-NEXT:    vstrb.8 q4, [r1]
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0
@@ -520,10 +520,10 @@ define void @vst2_v2f16(<2 x half> *%src, <4 x half> *%dst) {
 ; CHECK-NEXT:    vins.f16 s4, s0
 ; CHECK-NEXT:    vmovx.f16 s0, s0
 ; CHECK-NEXT:    vins.f16 s5, s0
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    str r0, [r1, #4]
 ; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r2, s5
 ; CHECK-NEXT:    str r0, [r1]
-; CHECK-NEXT:    str r2, [r1, #4]
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0
@@ -540,22 +540,23 @@ define void @vst2_v4f16(<4 x half> *%src, <8 x half> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    ldrd r2, r12, [r0]
 ; CHECK-NEXT:    ldrd r3, r0, [r0, #8]
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    vmov.32 q0[0], r3
-; CHECK-NEXT:    vmov.32 q1[1], r12
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    vmovx.f16 s12, s4
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov.32 q1[0], r3
+; CHECK-NEXT:    vmov.32 q0[1], r12
+; CHECK-NEXT:    vmov.32 q1[1], r0
 ; CHECK-NEXT:    vmovx.f16 s8, s0
-; CHECK-NEXT:    vins.f16 s4, s0
-; CHECK-NEXT:    vins.f16 s12, s8
-; CHECK-NEXT:    vmov q2, q1
-; CHECK-NEXT:    vmov.f32 s9, s12
-; CHECK-NEXT:    vmovx.f16 s0, s1
-; CHECK-NEXT:    vmovx.f16 s11, s5
-; CHECK-NEXT:    vins.f16 s5, s1
-; CHECK-NEXT:    vmov.f32 s10, s5
-; CHECK-NEXT:    vins.f16 s11, s0
-; CHECK-NEXT:    vstrh.16 q2, [r1]
+; CHECK-NEXT:    vmovx.f16 s10, s4
+; CHECK-NEXT:    vins.f16 s0, s4
+; CHECK-NEXT:    vins.f16 s8, s10
+; CHECK-NEXT:    vmovx.f16 s10, s1
+; CHECK-NEXT:    vmovx.f16 s4, s5
+; CHECK-NEXT:    vins.f16 s1, s5
+; CHECK-NEXT:    vins.f16 s10, s4
+; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vmov.f32 s5, s8
+; CHECK-NEXT:    vmov.f32 s6, s1
+; CHECK-NEXT:    vmov.f32 s7, s10
+; CHECK-NEXT:    vstrh.16 q1, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
@@ -610,30 +611,31 @@ entry:
 define void @vst2_v8f16_align1(<8 x half> *%src, <16 x half> *%dst) {
 ; CHECK-LABEL: vst2_v8f16_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vmovx.f16 s9, s6
-; CHECK-NEXT:    vins.f16 s6, s2
-; CHECK-NEXT:    vmov.f32 s8, s6
-; CHECK-NEXT:    vmovx.f16 s12, s2
-; CHECK-NEXT:    vins.f16 s9, s12
-; CHECK-NEXT:    vmovx.f16 s12, s3
-; CHECK-NEXT:    vmovx.f16 s11, s7
-; CHECK-NEXT:    vins.f16 s7, s3
-; CHECK-NEXT:    vmov.f32 s10, s7
-; CHECK-NEXT:    vins.f16 s11, s12
+; CHECK-NEXT:    vmovx.f16 s1, s6
+; CHECK-NEXT:    vmovx.f16 s12, s10
+; CHECK-NEXT:    vins.f16 s1, s12
+; CHECK-NEXT:    vins.f16 s6, s10
+; CHECK-NEXT:    vmovx.f16 s3, s7
+; CHECK-NEXT:    vmovx.f16 s12, s11
+; CHECK-NEXT:    vins.f16 s7, s11
+; CHECK-NEXT:    vins.f16 s3, s12
 ; CHECK-NEXT:    vmovx.f16 s12, s4
-; CHECK-NEXT:    vstrb.8 q2, [r1, #16]
-; CHECK-NEXT:    vmovx.f16 s8, s0
-; CHECK-NEXT:    vins.f16 s4, s0
-; CHECK-NEXT:    vins.f16 s12, s8
+; CHECK-NEXT:    vmovx.f16 s14, s8
+; CHECK-NEXT:    vins.f16 s4, s8
+; CHECK-NEXT:    vins.f16 s12, s14
+; CHECK-NEXT:    vmovx.f16 s14, s5
+; CHECK-NEXT:    vins.f16 s5, s9
+; CHECK-NEXT:    vmovx.f16 s8, s9
+; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vins.f16 s14, s8
 ; CHECK-NEXT:    vmov q2, q1
-; CHECK-NEXT:    vmovx.f16 s0, s1
 ; CHECK-NEXT:    vmov.f32 s9, s12
-; CHECK-NEXT:    vmovx.f16 s11, s5
-; CHECK-NEXT:    vins.f16 s5, s1
 ; CHECK-NEXT:    vmov.f32 s10, s5
-; CHECK-NEXT:    vins.f16 s11, s0
+; CHECK-NEXT:    vmov.f32 s2, s7
+; CHECK-NEXT:    vmov.f32 s11, s14
+; CHECK-NEXT:    vstrb.8 q0, [r1, #16]
 ; CHECK-NEXT:    vstrb.8 q2, [r1]
 ; CHECK-NEXT:    bx lr
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
index 2bdab7e50e1e..c1367ea819a9 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -347,77 +347,64 @@ entry:
 define void @vst3_v8i16(<8 x i16> *%src, <24 x i16> *%dst) {
 ; CHECK-LABEL: vst3_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12}
+; CHECK-NEXT:    vldrw.u32 q3, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT:    vmov.f64 d0, d4
+; CHECK-NEXT:    vmov.f64 d0, d6
 ; CHECK-NEXT:    vmov.u16 r2, q1[1]
+; CHECK-NEXT:    vmovx.f16 s20, s12
 ; CHECK-NEXT:    vins.f16 s0, s4
-; CHECK-NEXT:    vmov.f32 s17, s12
+; CHECK-NEXT:    vmov.f32 s8, s13
+; CHECK-NEXT:    vins.f16 s8, s5
 ; CHECK-NEXT:    vmov.16 q0[4], r2
-; CHECK-NEXT:    vmov.f32 s18, s12
-; CHECK-NEXT:    vmov.f32 s3, s9
-; CHECK-NEXT:    vmov.u16 r0, q4[2]
-; CHECK-NEXT:    vins.f16 s3, s5
-; CHECK-NEXT:    vmov.16 q5[2], r0
-; CHECK-NEXT:    vmov.f32 s1, s8
-; CHECK-NEXT:    vmov.u16 r2, q0[3]
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmov.16 q5[3], r2
-; CHECK-NEXT:    vmov.16 q5[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[5]
-; CHECK-NEXT:    vmov.16 q5[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.16 q4[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[5]
-; CHECK-NEXT:    vmov.16 q4[1], r0
-; CHECK-NEXT:    vmov.f32 s1, s21
+; CHECK-NEXT:    vmov.f32 s3, s8
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT:    vmov.f32 s1, s12
+; CHECK-NEXT:    vmov.f32 s17, s8
+; CHECK-NEXT:    vmov.f32 s18, s8
+; CHECK-NEXT:    vins.f16 s17, s20
+; CHECK-NEXT:    vmovx.f16 s20, s18
+; CHECK-NEXT:    vins.f16 s2, s20
+; CHECK-NEXT:    vmovx.f16 s20, s10
+; CHECK-NEXT:    vmov.f32 s18, s2
+; CHECK-NEXT:    vmov.f32 s1, s17
+; CHECK-NEXT:    vmov.f32 s2, s18
+; CHECK-NEXT:    vmovx.f16 s16, s6
+; CHECK-NEXT:    vins.f16 s16, s20
+; CHECK-NEXT:    vmovx.f16 s20, s11
 ; CHECK-NEXT:    vins.f16 s17, s7
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vmov.f32 s2, s22
-; CHECK-NEXT:    vmov.16 q4[6], r0
-; CHECK-NEXT:    vmov.f32 s21, s11
-; CHECK-NEXT:    vmov.u16 r0, q3[7]
-; CHECK-NEXT:    vmov.16 q4[7], r0
-; CHECK-NEXT:    vmov.f32 s22, s11
-; CHECK-NEXT:    vmov.f32 s18, s15
-; CHECK-NEXT:    vmov.u16 r2, q5[2]
-; CHECK-NEXT:    vmov.u16 r0, q4[3]
-; CHECK-NEXT:    vmov.16 q6[2], r2
-; CHECK-NEXT:    vmov.16 q6[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[4]
-; CHECK-NEXT:    vmov.16 q6[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q5[5]
-; CHECK-NEXT:    vmov.16 q6[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[2]
-; CHECK-NEXT:    vmov.16 q5[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[3]
-; CHECK-NEXT:    vmov.16 q5[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[4]
-; CHECK-NEXT:    vmov.16 q5[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[5]
-; CHECK-NEXT:    vmov.16 q5[7], r0
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vmovx.f16 s19, s7
 ; CHECK-NEXT:    vrev32.16 q1, q1
-; CHECK-NEXT:    vmov.f32 s21, s13
-; CHECK-NEXT:    vmov.u16 r2, q1[2]
-; CHECK-NEXT:    vmov.f32 s22, s10
-; CHECK-NEXT:    vmov.16 q2[2], r2
-; CHECK-NEXT:    vmov.u16 r0, q5[3]
-; CHECK-NEXT:    vmov.f32 s17, s25
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q5[4]
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov.f32 s18, s26
-; CHECK-NEXT:    vmov.f32 s21, s9
+; CHECK-NEXT:    vins.f16 s19, s20
+; CHECK-NEXT:    vmov.f32 s21, s15
+; CHECK-NEXT:    vmov.f32 s18, s11
+; CHECK-NEXT:    vmovx.f16 s24, s17
+; CHECK-NEXT:    vmov.f32 s22, s15
+; CHECK-NEXT:    vins.f16 s21, s24
+; CHECK-NEXT:    vmovx.f16 s24, s22
+; CHECK-NEXT:    vins.f16 s18, s24
+; CHECK-NEXT:    vmov.f32 s8, s9
+; CHECK-NEXT:    vmov.f32 s22, s18
+; CHECK-NEXT:    vmov.f32 s17, s21
+; CHECK-NEXT:    vmov.f32 s18, s22
+; CHECK-NEXT:    vmovx.f16 s20, s13
+; CHECK-NEXT:    vins.f16 s8, s20
+; CHECK-NEXT:    vmovx.f16 s20, s14
+; CHECK-NEXT:    vins.f16 s10, s20
 ; CHECK-NEXT:    vstrw.32 q4, [r1, #32]
-; CHECK-NEXT:    vmov.f32 s22, s10
-; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vstrw.32 q5, [r1, #16]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vmov.f32 s11, s10
+; CHECK-NEXT:    vmov.f32 s10, s14
+; CHECK-NEXT:    vmovx.f16 s12, s9
+; CHECK-NEXT:    vins.f16 s5, s12
+; CHECK-NEXT:    vmovx.f16 s12, s6
+; CHECK-NEXT:    vins.f16 s10, s12
+; CHECK-NEXT:    vmov.f32 s6, s10
+; CHECK-NEXT:    vmov.f32 s9, s5
+; CHECK-NEXT:    vmov.f32 s10, s6
+; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12}
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <8 x i16>, <8 x i16>* %src, i32 0
@@ -438,164 +425,132 @@ define void @vst3_v16i16(<16 x i16> *%src, <48 x i16> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #112
-; CHECK-NEXT:    sub sp, #112
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
-; CHECK-NEXT:    vldrw.u32 q7, [r0, #64]
-; CHECK-NEXT:    vmov.f64 d12, d2
-; CHECK-NEXT:    vmov.u16 r2, q0[1]
-; CHECK-NEXT:    vmov q2, q1
-; CHECK-NEXT:    vstrw.32 q0, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT:    vstrw.32 q2, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT:    vstrw.32 q7, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vins.f16 s24, s0
-; CHECK-NEXT:    vmov.16 q6[4], r2
-; CHECK-NEXT:    vmov.f32 s27, s5
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
-; CHECK-NEXT:    vins.f16 s27, s1
-; CHECK-NEXT:    vmov.f32 s13, s4
-; CHECK-NEXT:    vstrw.32 q1, [sp, #80] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s14, s4
-; CHECK-NEXT:    vmov.f32 s25, s8
-; CHECK-NEXT:    vmov.u16 r3, q3[2]
-; CHECK-NEXT:    vstrw.32 q3, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vmov.u16 r2, q6[3]
-; CHECK-NEXT:    vmov.16 q3[2], r3
-; CHECK-NEXT:    vmov.16 q3[3], r2
-; CHECK-NEXT:    vmov.u16 r2, q6[4]
-; CHECK-NEXT:    vmov.16 q3[4], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[5]
-; CHECK-NEXT:    vmov.16 q5[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[5]
-; CHECK-NEXT:    vmov.16 q5[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[7]
-; CHECK-NEXT:    vins.f16 s21, s3
-; CHECK-NEXT:    vmov q0, q2
-; CHECK-NEXT:    vmov.f32 s9, s3
-; CHECK-NEXT:    vmov.16 q5[6], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[7]
-; CHECK-NEXT:    vmov.f32 s10, s3
-; CHECK-NEXT:    vmov.16 q5[7], r2
-; CHECK-NEXT:    vmov.u16 r3, q2[2]
-; CHECK-NEXT:    vmov.f32 s22, s7
-; CHECK-NEXT:    vmov.16 q1[2], r3
-; CHECK-NEXT:    vmov.u16 r2, q5[3]
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
-; CHECK-NEXT:    vmov.16 q1[3], r2
-; CHECK-NEXT:    vmov.u16 r2, q5[4]
-; CHECK-NEXT:    vmov.16 q1[4], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[5]
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vmov.16 q1[5], r2
-; CHECK-NEXT:    vmov.u16 r2, q0[1]
-; CHECK-NEXT:    vmov.f32 s21, s5
-; CHECK-NEXT:    vmov.f64 d8, d4
-; CHECK-NEXT:    vstrw.32 q2, [sp, #96] @ 16-byte Spill
-; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vins.f16 s16, s0
-; CHECK-NEXT:    vmov.f32 s22, s6
-; CHECK-NEXT:    vmov.16 q4[4], r2
-; CHECK-NEXT:    vstrw.32 q5, [r1, #80]
-; CHECK-NEXT:    vmov.f32 s19, s9
-; CHECK-NEXT:    vins.f16 s19, s1
-; CHECK-NEXT:    vmov.f32 s17, s8
-; CHECK-NEXT:    vmov.f32 s9, s28
-; CHECK-NEXT:    vmov.u16 r0, q4[3]
-; CHECK-NEXT:    vmov.f32 s10, s28
-; CHECK-NEXT:    vmov.u16 r2, q2[2]
-; CHECK-NEXT:    vmov.16 q1[2], r2
-; CHECK-NEXT:    vmov.16 q1[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[4]
-; CHECK-NEXT:    vmov.16 q1[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[5]
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmov.f32 s17, s5
-; CHECK-NEXT:    vmov.u16 r0, q2[5]
-; CHECK-NEXT:    vmov.f32 s18, s6
-; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q7[5]
-; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
-; CHECK-NEXT:    vins.f16 s9, s3
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q7[7]
-; CHECK-NEXT:    vmov.f32 s5, s3
-; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov.f32 s6, s3
-; CHECK-NEXT:    vstrw.32 q4, [r1]
-; CHECK-NEXT:    vmov.f32 s10, s31
-; CHECK-NEXT:    vmov.u16 r2, q1[2]
-; CHECK-NEXT:    vmov.u16 r0, q2[3]
-; CHECK-NEXT:    vmov.16 q0[2], r2
-; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[4]
-; CHECK-NEXT:    vmov.f32 s25, s13
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q7, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s26, s14
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vrev32.16 q1, q1
-; CHECK-NEXT:    vmov.u16 r0, q7[2]
+; CHECK-NEXT:    .pad #80
+; CHECK-NEXT:    sub sp, #80
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #64]
+; CHECK-NEXT:    vmovx.f16 s0, s14
+; CHECK-NEXT:    vmovx.f16 s8, s6
+; CHECK-NEXT:    vins.f16 s8, s0
+; CHECK-NEXT:    vmovx.f16 s0, s15
+; CHECK-NEXT:    vins.f16 s9, s7
 ; CHECK-NEXT:    vstrw.32 q1, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT:    vmov.16 q1[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[3]
-; CHECK-NEXT:    vmov.f32 s9, s1
-; CHECK-NEXT:    vmov.16 q1[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q7[4]
-; CHECK-NEXT:    vmov.16 q1[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[5]
-; CHECK-NEXT:    vmov.16 q1[7], r0
-; CHECK-NEXT:    vmov.f32 s10, s2
-; CHECK-NEXT:    vmov.f32 s5, s29
-; CHECK-NEXT:    vldrw.u32 q7, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s6, s14
-; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
-; CHECK-NEXT:    vmov.u16 r2, q7[2]
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.16 q3[2], r2
-; CHECK-NEXT:    vstrw.32 q6, [r1, #48]
-; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q7[5]
-; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vldrw.u32 q7, [sp, #96] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s5, s13
-; CHECK-NEXT:    vmov.f32 s6, s14
-; CHECK-NEXT:    vldrw.u32 q3, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q1, [r1, #64]
-; CHECK-NEXT:    vmov.u16 r0, q3[2]
-; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q7[3]
-; CHECK-NEXT:    vmov.16 q0[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[4]
-; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q7[5]
-; CHECK-NEXT:    vmov.16 q0[7], r0
-; CHECK-NEXT:    vmov.f32 s1, s13
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s2, s30
-; CHECK-NEXT:    vrev32.16 q3, q3
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmov.u16 r2, q3[2]
-; CHECK-NEXT:    vmov.16 q7[2], r2
-; CHECK-NEXT:    vmov.16 q7[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmovx.f16 s11, s7
+; CHECK-NEXT:    vmov.u16 r2, q6[1]
+; CHECK-NEXT:    vins.f16 s11, s0
+; CHECK-NEXT:    vstrw.32 q6, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s10, s15
+; CHECK-NEXT:    vmovx.f16 s4, s9
+; CHECK-NEXT:    vmov q4, q2
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
+; CHECK-NEXT:    vmov.f32 s1, s11
+; CHECK-NEXT:    vstrw.32 q2, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s2, s11
+; CHECK-NEXT:    vins.f16 s1, s4
+; CHECK-NEXT:    vmovx.f16 s4, s2
+; CHECK-NEXT:    vins.f16 s18, s4
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov.f32 s2, s18
+; CHECK-NEXT:    vmov.f64 d4, d2
+; CHECK-NEXT:    vstrw.32 q1, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT:    vmovx.f16 s28, s4
+; CHECK-NEXT:    vins.f16 s8, s24
+; CHECK-NEXT:    vmov.f32 s17, s1
+; CHECK-NEXT:    vmov.16 q2[4], r2
+; CHECK-NEXT:    vmov.f32 s11, s5
+; CHECK-NEXT:    vins.f16 s11, s25
+; CHECK-NEXT:    vmov.f32 s18, s2
+; CHECK-NEXT:    vmov.f32 s9, s4
+; CHECK-NEXT:    vstrw.32 q4, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s5, s20
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s6, s20
+; CHECK-NEXT:    vins.f16 s5, s28
+; CHECK-NEXT:    vmovx.f16 s28, s6
+; CHECK-NEXT:    vins.f16 s10, s28
+; CHECK-NEXT:    vmov.f64 d14, d8
+; CHECK-NEXT:    vmov.f32 s6, s10
+; CHECK-NEXT:    vmov.f32 s9, s5
+; CHECK-NEXT:    vmov.f32 s0, s17
+; CHECK-NEXT:    vmov.f32 s10, s6
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q2, [r1]
+; CHECK-NEXT:    vins.f16 s28, s4
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
+; CHECK-NEXT:    vins.f16 s0, s5
 ; CHECK-NEXT:    vmov.16 q7[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[5]
-; CHECK-NEXT:    vmov.16 q7[5], r0
-; CHECK-NEXT:    vmov.f32 s1, s29
+; CHECK-NEXT:    vmov.f32 s31, s0
+; CHECK-NEXT:    vmovx.f16 s4, s16
+; CHECK-NEXT:    vmov.f32 s1, s12
+; CHECK-NEXT:    vmov.f32 s2, s12
+; CHECK-NEXT:    vins.f16 s1, s4
+; CHECK-NEXT:    vmov.f32 s29, s16
+; CHECK-NEXT:    vmovx.f16 s4, s2
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    vins.f16 s30, s4
+; CHECK-NEXT:    vmovx.f16 s4, s22
 ; CHECK-NEXT:    vmov.f32 s2, s30
-; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
-; CHECK-NEXT:    add sp, #112
+; CHECK-NEXT:    vmov.f32 s29, s1
+; CHECK-NEXT:    vmov.f32 s12, s13
+; CHECK-NEXT:    vmov.f32 s30, s2
+; CHECK-NEXT:    vmovx.f16 s0, s26
+; CHECK-NEXT:    vins.f16 s0, s4
+; CHECK-NEXT:    vmovx.f16 s4, s23
+; CHECK-NEXT:    vins.f16 s1, s27
+; CHECK-NEXT:    vstrw.32 q7, [r1, #48]
+; CHECK-NEXT:    vmovx.f16 s3, s27
+; CHECK-NEXT:    vins.f16 s3, s4
+; CHECK-NEXT:    vmov.f32 s5, s19
+; CHECK-NEXT:    vmov.f32 s2, s23
+; CHECK-NEXT:    vmovx.f16 s24, s1
+; CHECK-NEXT:    vmov.f32 s6, s19
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vins.f16 s5, s24
+; CHECK-NEXT:    vmovx.f16 s24, s6
+; CHECK-NEXT:    vins.f16 s2, s24
+; CHECK-NEXT:    vmovx.f16 s24, s17
+; CHECK-NEXT:    vmov.f32 s6, s2
+; CHECK-NEXT:    vins.f16 s12, s24
+; CHECK-NEXT:    vmovx.f16 s24, s18
+; CHECK-NEXT:    vmov.f32 s1, s5
+; CHECK-NEXT:    vins.f16 s14, s24
+; CHECK-NEXT:    vldrw.u32 q6, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s15, s14
+; CHECK-NEXT:    vmov.f32 s14, s18
+; CHECK-NEXT:    vmovx.f16 s16, s13
+; CHECK-NEXT:    vrev32.16 q6, q6
+; CHECK-NEXT:    vmov.f32 s20, s21
+; CHECK-NEXT:    vins.f16 s25, s16
+; CHECK-NEXT:    vmovx.f16 s16, s26
+; CHECK-NEXT:    vins.f16 s14, s16
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s2, s6
+; CHECK-NEXT:    vmovx.f16 s4, s17
+; CHECK-NEXT:    vmov.f32 s26, s14
+; CHECK-NEXT:    vins.f16 s20, s4
+; CHECK-NEXT:    vmovx.f16 s4, s18
+; CHECK-NEXT:    vins.f16 s22, s4
+; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s23, s22
+; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
+; CHECK-NEXT:    vmov.f32 s22, s18
+; CHECK-NEXT:    vmovx.f16 s16, s21
+; CHECK-NEXT:    vrev32.16 q1, q1
+; CHECK-NEXT:    vmov.f32 s13, s25
+; CHECK-NEXT:    vins.f16 s5, s16
+; CHECK-NEXT:    vmovx.f16 s16, s6
+; CHECK-NEXT:    vins.f16 s22, s16
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s6, s22
+; CHECK-NEXT:    vmov.f32 s21, s5
+; CHECK-NEXT:    vstrw.32 q0, [r1, #80]
+; CHECK-NEXT:    vmov.f32 s14, s26
+; CHECK-NEXT:    vstrw.32 q3, [r1, #64]
+; CHECK-NEXT:    vmov.f32 s22, s6
+; CHECK-NEXT:    vstrw.32 q5, [r1, #16]
+; CHECK-NEXT:    add sp, #80
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 entry:
@@ -717,68 +672,63 @@ entry:
 define void @vst3_v8i8(<8 x i8> *%src, <24 x i8> *%dst) {
 ; CHECK-LABEL: vst3_v8i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    vldrb.u16 q1, [r0, #8]
-; CHECK-NEXT:    vldrb.u16 q2, [r0, #16]
+; CHECK-NEXT:    .vsave {d8, d9, d10}
+; CHECK-NEXT:    vpush {d8, d9, d10}
+; CHECK-NEXT:    vldrb.u16 q1, [r0, #16]
+; CHECK-NEXT:    vldrb.u16 q2, [r0, #8]
+; CHECK-NEXT:    vmovx.f16 s12, s6
+; CHECK-NEXT:    vmovx.f16 s0, s10
+; CHECK-NEXT:    vins.f16 s0, s12
+; CHECK-NEXT:    vmovx.f16 s12, s7
+; CHECK-NEXT:    vins.f16 s1, s11
+; CHECK-NEXT:    vmovx.f16 s3, s11
+; CHECK-NEXT:    vins.f16 s3, s12
 ; CHECK-NEXT:    vldrb.u16 q3, [r0]
-; CHECK-NEXT:    vmov.u16 r2, q1[5]
-; CHECK-NEXT:    vmov.16 q0[0], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[5]
-; CHECK-NEXT:    vmov.16 q0[1], r2
-; CHECK-NEXT:    vmov.u16 r2, q1[7]
-; CHECK-NEXT:    vins.f16 s1, s7
+; CHECK-NEXT:    vmov.f32 s2, s7
+; CHECK-NEXT:    vmovx.f16 s20, s1
 ; CHECK-NEXT:    vmov.f32 s17, s15
-; CHECK-NEXT:    vmov.16 q0[6], r2
-; CHECK-NEXT:    vmov.u16 r2, q2[7]
-; CHECK-NEXT:    vmov.16 q0[7], r2
-; CHECK-NEXT:    vmov.f32 s18, s15
-; CHECK-NEXT:    vmov.f32 s2, s11
-; CHECK-NEXT:    vmov.u16 r0, q4[2]
-; CHECK-NEXT:    vmov.16 q5[2], r0
-; CHECK-NEXT:    vmov.u16 r2, q0[3]
-; CHECK-NEXT:    vmov.16 q5[3], r2
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmov.16 q5[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[5]
-; CHECK-NEXT:    vmov.16 q5[5], r0
 ; CHECK-NEXT:    vmov.u16 r0, q3[0]
+; CHECK-NEXT:    vmov.f32 s18, s15
+; CHECK-NEXT:    vins.f16 s17, s20
+; CHECK-NEXT:    vmovx.f16 s20, s18
+; CHECK-NEXT:    vins.f16 s2, s20
+; CHECK-NEXT:    vmov.f32 s18, s2
+; CHECK-NEXT:    vmov.f32 s1, s17
+; CHECK-NEXT:    vmov.f32 s2, s18
 ; CHECK-NEXT:    vmov.8 q4[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.8 q4[1], r0
 ; CHECK-NEXT:    vmov.u16 r0, q2[0]
+; CHECK-NEXT:    vstrb.16 q0, [r1, #16]
+; CHECK-NEXT:    vmov.8 q4[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
 ; CHECK-NEXT:    vmov.8 q4[2], r0
 ; CHECK-NEXT:    vmov.u16 r0, q3[1]
 ; CHECK-NEXT:    vmov.8 q4[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.8 q4[4], r0
 ; CHECK-NEXT:    vmov.u16 r0, q2[1]
+; CHECK-NEXT:    vmov.8 q4[4], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
 ; CHECK-NEXT:    vmov.8 q4[5], r0
 ; CHECK-NEXT:    vmov.u16 r0, q3[2]
 ; CHECK-NEXT:    vmov.8 q4[6], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.8 q4[7], r0
 ; CHECK-NEXT:    vmov.u16 r0, q2[2]
+; CHECK-NEXT:    vmov.8 q4[7], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
 ; CHECK-NEXT:    vmov.8 q4[8], r0
 ; CHECK-NEXT:    vmov.u16 r0, q3[3]
 ; CHECK-NEXT:    vmov.8 q4[9], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.8 q4[10], r0
 ; CHECK-NEXT:    vmov.u16 r0, q2[3]
+; CHECK-NEXT:    vmov.8 q4[10], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
 ; CHECK-NEXT:    vmov.8 q4[11], r0
 ; CHECK-NEXT:    vmov.u16 r0, q3[4]
 ; CHECK-NEXT:    vmov.8 q4[12], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
-; CHECK-NEXT:    vmov.8 q4[13], r0
 ; CHECK-NEXT:    vmov.u16 r0, q2[4]
-; CHECK-NEXT:    vmov.f32 s1, s21
+; CHECK-NEXT:    vmov.8 q4[13], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[4]
 ; CHECK-NEXT:    vmov.8 q4[14], r0
 ; CHECK-NEXT:    vmov.u16 r0, q3[5]
-; CHECK-NEXT:    vmov.f32 s2, s22
 ; CHECK-NEXT:    vmov.8 q4[15], r0
-; CHECK-NEXT:    vstrb.16 q0, [r1, #16]
 ; CHECK-NEXT:    vstrw.32 q4, [r1]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    vpop {d8, d9, d10}
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <8 x i8>, <8 x i8>* %src, i32 0
@@ -1355,19 +1305,19 @@ define void @vst3_v2f16(<2 x half> *%src, <6 x half> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vldmia r0, {s0, s1}
 ; CHECK-NEXT:    ldr r0, [r0, #8]
-; CHECK-NEXT:    vmovx.f16 s4, s0
+; CHECK-NEXT:    vmovx.f16 s8, s0
 ; CHECK-NEXT:    vins.f16 s0, s1
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov q3, q0
-; CHECK-NEXT:    vmovx.f16 s6, s8
-; CHECK-NEXT:    vins.f16 s8, s4
-; CHECK-NEXT:    vmov.f32 s13, s8
-; CHECK-NEXT:    vmovx.f16 s14, s1
-; CHECK-NEXT:    vins.f16 s14, s6
-; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmov r2, s13
-; CHECK-NEXT:    vmov r3, s14
-; CHECK-NEXT:    stm r1!, {r0, r2, r3}
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmovx.f16 s2, s1
+; CHECK-NEXT:    vmovx.f16 s10, s4
+; CHECK-NEXT:    vins.f16 s4, s8
+; CHECK-NEXT:    vins.f16 s2, s10
+; CHECK-NEXT:    vmov.f32 s1, s4
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    str r0, [r1, #8]
+; CHECK-NEXT:    strd r3, r2, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0
@@ -1388,6 +1338,8 @@ define void @vst3_v4f16(<4 x half> *%src, <12 x half> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    ldrd r2, r12, [r0]
 ; CHECK-NEXT:    ldrd r3, lr, [r0, #8]
 ; CHECK-NEXT:    vmov.32 q0[0], r2
@@ -1397,28 +1349,29 @@ define void @vst3_v4f16(<4 x half> *%src, <12 x half> *%dst) {
 ; CHECK-NEXT:    vmov.32 q1[1], lr
 ; CHECK-NEXT:    vmov.f32 s2, s4
 ; CHECK-NEXT:    vmov.f32 s3, s5
+; CHECK-NEXT:    vmovx.f16 s10, s0
+; CHECK-NEXT:    vmov.f32 s8, s1
+; CHECK-NEXT:    vins.f16 s0, s2
+; CHECK-NEXT:    vins.f16 s8, s5
 ; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    vmovx.f16 s8, s0
 ; CHECK-NEXT:    vmov.32 q1[1], r0
-; CHECK-NEXT:    vins.f16 s0, s2
+; CHECK-NEXT:    vmovx.f16 s2, s2
 ; CHECK-NEXT:    vmovx.f16 s12, s4
-; CHECK-NEXT:    vins.f16 s4, s8
-; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    vmov.f32 s9, s4
-; CHECK-NEXT:    vmovx.f16 s10, s2
-; CHECK-NEXT:    vins.f16 s10, s12
-; CHECK-NEXT:    vmovx.f16 s12, s1
-; CHECK-NEXT:    vins.f16 s1, s3
-; CHECK-NEXT:    vmov.f32 s11, s1
-; CHECK-NEXT:    vmovx.f16 s1, s3
-; CHECK-NEXT:    vstrw.32 q2, [r1]
-; CHECK-NEXT:    vmovx.f16 s8, s5
-; CHECK-NEXT:    vins.f16 s5, s12
-; CHECK-NEXT:    vmov.f32 s0, s5
-; CHECK-NEXT:    vins.f16 s1, s8
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    strd r2, r0, [r1, #16]
+; CHECK-NEXT:    vins.f16 s4, s10
+; CHECK-NEXT:    vins.f16 s2, s12
+; CHECK-NEXT:    vmovx.f16 s10, s1
+; CHECK-NEXT:    vmovx.f16 s12, s5
+; CHECK-NEXT:    vmovx.f16 s17, s3
+; CHECK-NEXT:    vins.f16 s5, s10
+; CHECK-NEXT:    vins.f16 s17, s12
+; CHECK-NEXT:    vmov.f32 s16, s5
+; CHECK-NEXT:    vmov r2, s17
+; CHECK-NEXT:    vmov.f32 s1, s4
+; CHECK-NEXT:    vmov.f32 s3, s8
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vmov r0, s16
+; CHECK-NEXT:    strd r0, r2, [r1, #16]
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
@@ -1439,61 +1392,63 @@ define void @vst3_v8f16(<8 x half> *%src, <24 x half> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14}
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q3, [r0]
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #16]
+; CHECK-NEXT:    vmov.f64 d0, d4
+; CHECK-NEXT:    vmovx.f16 s6, s20
+; CHECK-NEXT:    vmovx.f16 s12, s8
+; CHECK-NEXT:    vmovx.f16 s24, s23
+; CHECK-NEXT:    vmov.f32 s4, s9
+; CHECK-NEXT:    vins.f16 s0, s20
+; CHECK-NEXT:    vmov r2, s6
+; CHECK-NEXT:    vins.f16 s4, s21
+; CHECK-NEXT:    vmov.16 q0[4], r2
+; CHECK-NEXT:    vmov.f32 s3, s4
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
-; CHECK-NEXT:    vmovx.f16 s0, s8
-; CHECK-NEXT:    vmovx.f16 s20, s12
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmovx.f16 s24, s6
-; CHECK-NEXT:    vmov.f64 d0, d6
-; CHECK-NEXT:    vins.f16 s0, s8
+; CHECK-NEXT:    vmov.f32 s1, s8
 ; CHECK-NEXT:    vmov.f32 s17, s4
-; CHECK-NEXT:    vmov.16 q0[4], r2
+; CHECK-NEXT:    vmovx.f16 s26, s6
 ; CHECK-NEXT:    vmov.f32 s18, s4
-; CHECK-NEXT:    vmov.f32 s3, s13
-; CHECK-NEXT:    vins.f16 s17, s20
-; CHECK-NEXT:    vins.f16 s3, s9
-; CHECK-NEXT:    vmovx.f16 s20, s18
-; CHECK-NEXT:    vmov.f32 s1, s12
-; CHECK-NEXT:    vins.f16 s2, s20
-; CHECK-NEXT:    vmovx.f16 s20, s10
-; CHECK-NEXT:    vins.f16 s20, s24
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov.16 q5[3], r0
-; CHECK-NEXT:    vmovx.f16 s24, s7
-; CHECK-NEXT:    vmovx.f16 s23, s11
-; CHECK-NEXT:    vrev32.16 q2, q2
-; CHECK-NEXT:    vins.f16 s23, s24
-; CHECK-NEXT:    vmov.f32 s25, s15
-; CHECK-NEXT:    vmov.f32 s22, s7
-; CHECK-NEXT:    vmovx.f16 s28, s21
-; CHECK-NEXT:    vmov.f32 s26, s15
+; CHECK-NEXT:    vins.f16 s17, s12
+; CHECK-NEXT:    vmovx.f16 s12, s18
+; CHECK-NEXT:    vins.f16 s2, s12
+; CHECK-NEXT:    vmovx.f16 s12, s7
+; CHECK-NEXT:    vins.f16 s24, s12
+; CHECK-NEXT:    vmovx.f16 s12, s22
+; CHECK-NEXT:    vmov r0, s23
+; CHECK-NEXT:    vins.f16 s12, s26
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vrev32.16 q5, q5
+; CHECK-NEXT:    vmov.f32 s15, s24
+; CHECK-NEXT:    vmov.f32 s25, s11
+; CHECK-NEXT:    vmov.f32 s14, s7
+; CHECK-NEXT:    vmovx.f16 s28, s13
+; CHECK-NEXT:    vmov.f32 s26, s11
 ; CHECK-NEXT:    vins.f16 s25, s28
 ; CHECK-NEXT:    vmovx.f16 s28, s26
-; CHECK-NEXT:    vins.f16 s22, s28
-; CHECK-NEXT:    vmovx.f16 s28, s13
+; CHECK-NEXT:    vins.f16 s14, s28
+; CHECK-NEXT:    vmovx.f16 s28, s9
 ; CHECK-NEXT:    vmov.f32 s4, s5
 ; CHECK-NEXT:    vins.f16 s4, s28
-; CHECK-NEXT:    vmovx.f16 s28, s14
+; CHECK-NEXT:    vmovx.f16 s28, s10
 ; CHECK-NEXT:    vins.f16 s6, s28
 ; CHECK-NEXT:    vmov.f32 s18, s2
 ; CHECK-NEXT:    vmov.f32 s7, s6
-; CHECK-NEXT:    vmov.f32 s6, s14
-; CHECK-NEXT:    vmovx.f16 s12, s5
-; CHECK-NEXT:    vins.f16 s9, s12
-; CHECK-NEXT:    vmovx.f16 s12, s10
-; CHECK-NEXT:    vins.f16 s6, s12
-; CHECK-NEXT:    vmov.f32 s26, s22
-; CHECK-NEXT:    vmov.f32 s10, s6
+; CHECK-NEXT:    vmov.f32 s6, s10
+; CHECK-NEXT:    vmovx.f16 s8, s5
+; CHECK-NEXT:    vins.f16 s21, s8
+; CHECK-NEXT:    vmovx.f16 s8, s22
+; CHECK-NEXT:    vins.f16 s6, s8
+; CHECK-NEXT:    vmov.f32 s26, s14
+; CHECK-NEXT:    vmov.f32 s22, s6
 ; CHECK-NEXT:    vmov.f32 s1, s17
-; CHECK-NEXT:    vmov.f32 s21, s25
-; CHECK-NEXT:    vmov.f32 s5, s9
+; CHECK-NEXT:    vmov.f32 s13, s25
+; CHECK-NEXT:    vmov.f32 s5, s21
 ; CHECK-NEXT:    vmov.f32 s2, s18
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vmov.f32 s22, s26
-; CHECK-NEXT:    vmov.f32 s6, s10
-; CHECK-NEXT:    vstrw.32 q5, [r1, #32]
+; CHECK-NEXT:    vmov.f32 s14, s26
+; CHECK-NEXT:    vmov.f32 s6, s22
+; CHECK-NEXT:    vstrw.32 q3, [r1, #32]
 ; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14}
 ; CHECK-NEXT:    bx lr
@@ -1516,146 +1471,151 @@ define void @vst3_v16f16(<16 x half> *%src, <48 x half> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #144
-; CHECK-NEXT:    sub sp, #144
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #80]
-; CHECK-NEXT:    vldrw.u32 q7, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT:    vmovx.f16 s0, s22
-; CHECK-NEXT:    vmovx.f16 s4, s10
+; CHECK-NEXT:    .pad #128
+; CHECK-NEXT:    sub sp, #128
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #64]
+; CHECK-NEXT:    vldrw.u32 q4, [r0]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #48]
+; CHECK-NEXT:    vmovx.f16 s0, s31
+; CHECK-NEXT:    vmovx.f16 s2, s15
+; CHECK-NEXT:    vins.f16 s2, s0
+; CHECK-NEXT:    vmovx.f16 s0, s30
+; CHECK-NEXT:    vmovx.f16 s4, s14
+; CHECK-NEXT:    vmov r2, s15
 ; CHECK-NEXT:    vins.f16 s4, s0
-; CHECK-NEXT:    vmov r2, s11
+; CHECK-NEXT:    vmov q6, q5
 ; CHECK-NEXT:    vmov.16 q1[3], r2
-; CHECK-NEXT:    vmovx.f16 s0, s23
-; CHECK-NEXT:    vmovx.f16 s7, s11
-; CHECK-NEXT:    vstrw.32 q2, [sp, #112] @ 16-byte Spill
-; CHECK-NEXT:    vins.f16 s7, s0
-; CHECK-NEXT:    vmov.f32 s9, s31
-; CHECK-NEXT:    vmov.f32 s6, s23
+; CHECK-NEXT:    vstrw.32 q3, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s7, s2
+; CHECK-NEXT:    vmovx.f16 s2, s20
+; CHECK-NEXT:    vmov.f32 s6, s31
 ; CHECK-NEXT:    vmovx.f16 s0, s5
-; CHECK-NEXT:    vmov.f32 s10, s31
-; CHECK-NEXT:    vldrw.u32 q3, [r0]
-; CHECK-NEXT:    vins.f16 s9, s0
-; CHECK-NEXT:    vmovx.f16 s0, s10
-; CHECK-NEXT:    vins.f16 s6, s0
-; CHECK-NEXT:    vmovx.f16 s0, s16
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vldrw.u32 q6, [r0, #64]
-; CHECK-NEXT:    vmov.f64 d0, d6
-; CHECK-NEXT:    vstrw.32 q1, [sp, #128] @ 16-byte Spill
-; CHECK-NEXT:    vins.f16 s0, s16
-; CHECK-NEXT:    vmov.f32 s10, s6
-; CHECK-NEXT:    vmov.16 q0[4], r2
-; CHECK-NEXT:    vstrw.32 q2, [sp, #96] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s3, s13
-; CHECK-NEXT:    vmov.f32 s9, s24
-; CHECK-NEXT:    vins.f16 s3, s17
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    vmovx.f16 s0, s12
-; CHECK-NEXT:    vmov.f32 s10, s24
-; CHECK-NEXT:    vins.f16 s9, s0
-; CHECK-NEXT:    vmov.f32 s5, s12
-; CHECK-NEXT:    vmovx.f16 s0, s10
-; CHECK-NEXT:    vins.f16 s6, s0
-; CHECK-NEXT:    vmov.f32 s10, s6
+; CHECK-NEXT:    vmov q2, q1
+; CHECK-NEXT:    vmov.f32 s5, s19
+; CHECK-NEXT:    vmov.f32 s6, s19
+; CHECK-NEXT:    vstrw.32 q4, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vins.f16 s5, s0
+; CHECK-NEXT:    vmovx.f16 s0, s6
 ; CHECK-NEXT:    vstrw.32 q1, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #112] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q2, [sp, #80] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s9, s20
-; CHECK-NEXT:    vmovx.f16 s0, s4
-; CHECK-NEXT:    vmov.f32 s10, s20
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov.f64 d0, d14
-; CHECK-NEXT:    vins.f16 s0, s4
-; CHECK-NEXT:    vmov.f32 s20, s21
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.f32 s3, s29
-; CHECK-NEXT:    vins.f16 s3, s5
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    vmovx.f16 s0, s28
-; CHECK-NEXT:    vins.f16 s9, s0
-; CHECK-NEXT:    vmov.f32 s5, s28
-; CHECK-NEXT:    vmovx.f16 s0, s10
-; CHECK-NEXT:    vins.f16 s6, s0
-; CHECK-NEXT:    vmov.f32 s10, s6
-; CHECK-NEXT:    vstrw.32 q1, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vmov q1, q4
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT:    vins.f16 s10, s0
+; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vstrw.32 q2, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT:    vmovx.f16 s16, s26
-; CHECK-NEXT:    vmovx.f16 s8, s6
-; CHECK-NEXT:    vins.f16 s8, s16
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.16 q2[3], r0
+; CHECK-NEXT:    vmov.f64 d4, d2
 ; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vmovx.f16 s11, s7
-; CHECK-NEXT:    vmovx.f16 s16, s27
-; CHECK-NEXT:    vmov q1, q3
-; CHECK-NEXT:    vins.f16 s11, s16
-; CHECK-NEXT:    vmov.f32 s1, s7
-; CHECK-NEXT:    vmov.f32 s10, s27
-; CHECK-NEXT:    vmovx.f16 s16, s9
-; CHECK-NEXT:    vmov.f32 s2, s7
+; CHECK-NEXT:    vmovx.f16 s2, s12
+; CHECK-NEXT:    vins.f16 s8, s20
+; CHECK-NEXT:    vmov.f32 s0, s5
+; CHECK-NEXT:    vins.f16 s0, s21
+; CHECK-NEXT:    vmov.16 q2[4], r2
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #80]
+; CHECK-NEXT:    vmov.f32 s11, s0
+; CHECK-NEXT:    vmov.f32 s9, s4
+; CHECK-NEXT:    vmovx.f16 s0, s4
+; CHECK-NEXT:    vmov.f32 s5, s20
+; CHECK-NEXT:    vmov.f32 s6, s20
+; CHECK-NEXT:    vins.f16 s5, s0
+; CHECK-NEXT:    vmovx.f16 s0, s6
+; CHECK-NEXT:    vstrw.32 q1, [sp, #112] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f64 d2, d8
+; CHECK-NEXT:    vins.f16 s10, s0
+; CHECK-NEXT:    vstrw.32 q2, [sp, #96] @ 16-byte Spill
+; CHECK-NEXT:    vmov q2, q6
+; CHECK-NEXT:    vmovx.f16 s24, s10
+; CHECK-NEXT:    vmov.f32 s0, s17
+; CHECK-NEXT:    vins.f16 s4, s12
+; CHECK-NEXT:    vmov r0, s2
+; CHECK-NEXT:    vins.f16 s0, s13
+; CHECK-NEXT:    vmov.16 q1[4], r0
+; CHECK-NEXT:    vmov.f32 s13, s28
+; CHECK-NEXT:    vmov.f32 s7, s0
+; CHECK-NEXT:    vmovx.f16 s0, s16
+; CHECK-NEXT:    vmov.f32 s14, s28
+; CHECK-NEXT:    vmovx.f16 s2, s11
+; CHECK-NEXT:    vins.f16 s13, s0
+; CHECK-NEXT:    vmov.f32 s5, s16
+; CHECK-NEXT:    vmovx.f16 s0, s14
+; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vins.f16 s6, s0
+; CHECK-NEXT:    vmovx.f16 s0, s22
+; CHECK-NEXT:    vins.f16 s24, s0
+; CHECK-NEXT:    vstrw.32 q1, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT:    vmov q1, q2
+; CHECK-NEXT:    vmovx.f16 s0, s23
+; CHECK-NEXT:    vldrw.u32 q2, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vins.f16 s2, s0
+; CHECK-NEXT:    vmov.16 q6[3], r0
+; CHECK-NEXT:    vrev32.16 q1, q1
+; CHECK-NEXT:    vmov.f32 s27, s2
+; CHECK-NEXT:    vmov.f32 s1, s11
+; CHECK-NEXT:    vmov.f32 s26, s23
+; CHECK-NEXT:    vmovx.f16 s16, s25
+; CHECK-NEXT:    vmov.f32 s2, s11
 ; CHECK-NEXT:    vins.f16 s1, s16
 ; CHECK-NEXT:    vmovx.f16 s16, s2
-; CHECK-NEXT:    vins.f16 s10, s16
-; CHECK-NEXT:    vmovx.f16 s16, s29
-; CHECK-NEXT:    vmov.f32 s2, s10
+; CHECK-NEXT:    vins.f16 s26, s16
+; CHECK-NEXT:    vmovx.f16 s16, s9
+; CHECK-NEXT:    vmov.f32 s20, s21
 ; CHECK-NEXT:    vins.f16 s20, s16
-; CHECK-NEXT:    vmovx.f16 s16, s30
-; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmovx.f16 s16, s10
 ; CHECK-NEXT:    vins.f16 s22, s16
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s2, s26
 ; CHECK-NEXT:    vmov.f32 s23, s22
-; CHECK-NEXT:    vmov.f32 s22, s30
-; CHECK-NEXT:    vrev32.16 q3, q0
+; CHECK-NEXT:    vmov.f32 s22, s10
 ; CHECK-NEXT:    vmovx.f16 s16, s21
-; CHECK-NEXT:    vmov.f32 s24, s25
-; CHECK-NEXT:    vins.f16 s13, s16
-; CHECK-NEXT:    vmovx.f16 s16, s14
-; CHECK-NEXT:    vins.f16 s22, s16
-; CHECK-NEXT:    vmovx.f16 s16, s5
-; CHECK-NEXT:    vins.f16 s24, s16
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vins.f16 s5, s16
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s25, s1
+; CHECK-NEXT:    vmov.f32 s18, s10
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q4, [sp, #112] @ 16-byte Spill
 ; CHECK-NEXT:    vmovx.f16 s16, s6
-; CHECK-NEXT:    vins.f16 s26, s16
-; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s27, s26
-; CHECK-NEXT:    vmov.f32 s26, s6
-; CHECK-NEXT:    vrev32.16 q4, q0
-; CHECK-NEXT:    vmovx.f16 s4, s25
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
-; CHECK-NEXT:    vins.f16 s17, s4
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #128] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s5, s1
-; CHECK-NEXT:    vmovx.f16 s28, s18
-; CHECK-NEXT:    vmov.f32 s6, s2
-; CHECK-NEXT:    vins.f16 s26, s28
-; CHECK-NEXT:    vstrw.32 q1, [sp, #128] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s14, s10
+; CHECK-NEXT:    vins.f16 s22, s16
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q1, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q7, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s14, s22
-; CHECK-NEXT:    vmov.f32 s18, s26
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s29, s5
-; CHECK-NEXT:    vmov.f32 s30, s6
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s5, s1
-; CHECK-NEXT:    vstrw.32 q7, [r1]
-; CHECK-NEXT:    vmov.f32 s6, s2
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s21, s13
-; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
-; CHECK-NEXT:    vmov.f32 s9, s1
-; CHECK-NEXT:    vmov.f32 s10, s2
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #128] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s25, s17
-; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
-; CHECK-NEXT:    vmov.f32 s22, s14
-; CHECK-NEXT:    vstrw.32 q0, [r1, #80]
-; CHECK-NEXT:    vmov.f32 s26, s18
+; CHECK-NEXT:    vmov.f32 s28, s29
+; CHECK-NEXT:    vmovx.f16 s8, s17
+; CHECK-NEXT:    vmov.f32 s26, s2
+; CHECK-NEXT:    vmov.f32 s5, s13
+; CHECK-NEXT:    vins.f16 s28, s8
+; CHECK-NEXT:    vmovx.f16 s0, s18
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vins.f16 s30, s0
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s6, s14
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s31, s30
+; CHECK-NEXT:    vrev32.16 q0, q0
+; CHECK-NEXT:    vmov.f32 s30, s18
+; CHECK-NEXT:    vmovx.f16 s16, s29
+; CHECK-NEXT:    vmov.f32 s9, s13
+; CHECK-NEXT:    vins.f16 s1, s16
+; CHECK-NEXT:    vmov.f32 s10, s14
+; CHECK-NEXT:    vmovx.f16 s16, s2
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vins.f16 s30, s16
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s2, s30
+; CHECK-NEXT:    vmov.f32 s18, s14
+; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
+; CHECK-NEXT:    vmov.f32 s13, s17
+; CHECK-NEXT:    vstrw.32 q6, [r1, #80]
+; CHECK-NEXT:    vmov.f32 s29, s1
+; CHECK-NEXT:    vstrw.32 q1, [r1]
+; CHECK-NEXT:    vmov.f32 s30, s2
+; CHECK-NEXT:    vmov.f32 s14, s18
+; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s18, s22
+; CHECK-NEXT:    vstrw.32 q7, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s21, s17
+; CHECK-NEXT:    vstrw.32 q3, [r1, #32]
+; CHECK-NEXT:    vmov.f32 s22, s18
 ; CHECK-NEXT:    vstrw.32 q5, [r1, #64]
-; CHECK-NEXT:    vstrw.32 q6, [r1, #16]
-; CHECK-NEXT:    add sp, #144
+; CHECK-NEXT:    add sp, #128
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 entry:

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
index f7762f562936..5e9eaed161da 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
@@ -422,65 +422,53 @@ define void @vst4_v8i16_align1(<8 x i16> *%src, <32 x i16> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vldrw.u32 q4, [r0]
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT:    vmov.f64 d0, d8
-; CHECK-NEXT:    vmov.u16 r0, q4[1]
-; CHECK-NEXT:    vmov.f32 s21, s5
-; CHECK-NEXT:    vins.f16 s0, s8
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vins.f16 s21, s21
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.f32 s12, s17
-; CHECK-NEXT:    vmov.16 q5[6], r0
-; CHECK-NEXT:    vins.f16 s12, s9
-; CHECK-NEXT:    vmov.16 q5[7], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[3]
-; CHECK-NEXT:    vmov.f32 s25, s4
-; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[3]
-; CHECK-NEXT:    vins.f16 s25, s25
-; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.16 q6[6], r0
-; CHECK-NEXT:    vmov.f32 s13, s21
-; CHECK-NEXT:    vmov.16 q6[7], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[1]
-; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov.f32 s15, s23
-; CHECK-NEXT:    vmov.f32 s20, s19
-; CHECK-NEXT:    vmov.u16 r0, q4[7]
-; CHECK-NEXT:    vmov.f32 s1, s25
-; CHECK-NEXT:    vins.f16 s20, s11
-; CHECK-NEXT:    vmov.f32 s3, s27
-; CHECK-NEXT:    vmov.16 q5[4], r0
-; CHECK-NEXT:    vmov.f32 s25, s7
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
-; CHECK-NEXT:    vins.f16 s25, s25
-; CHECK-NEXT:    vstrb.8 q3, [r1, #16]
-; CHECK-NEXT:    vmov.16 q6[6], r0
-; CHECK-NEXT:    vstrb.8 q0, [r1]
-; CHECK-NEXT:    vmov.16 q6[7], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[7]
-; CHECK-NEXT:    vmov.16 q5[5], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[5]
-; CHECK-NEXT:    vmov.f32 s21, s25
-; CHECK-NEXT:    vmov.f32 s23, s27
-; CHECK-NEXT:    vmov.f64 d12, d9
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmovx.f16 s2, s5
+; CHECK-NEXT:    vmovx.f16 s12, s9
+; CHECK-NEXT:    vins.f16 s2, s12
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vins.f16 s5, s9
+; CHECK-NEXT:    vmovx.f16 s24, s11
+; CHECK-NEXT:    vmovx.f16 s16, s13
+; CHECK-NEXT:    vins.f16 s13, s13
+; CHECK-NEXT:    vins.f16 s16, s16
+; CHECK-NEXT:    vmov q5, q3
+; CHECK-NEXT:    vmov.f32 s23, s16
+; CHECK-NEXT:    vmovx.f16 s16, s8
+; CHECK-NEXT:    vmov.f32 s0, s5
+; CHECK-NEXT:    vmov.f32 s1, s21
+; CHECK-NEXT:    vmov.f32 s3, s23
+; CHECK-NEXT:    vmovx.f16 s20, s4
+; CHECK-NEXT:    vins.f16 s4, s8
+; CHECK-NEXT:    vins.f16 s20, s16
+; CHECK-NEXT:    vmov q4, q1
+; CHECK-NEXT:    vmovx.f16 s8, s10
+; CHECK-NEXT:    vmov.f32 s18, s20
+; CHECK-NEXT:    vmovx.f16 s22, s7
+; CHECK-NEXT:    vins.f16 s7, s11
+; CHECK-NEXT:    vins.f16 s22, s24
+; CHECK-NEXT:    vmovx.f16 s26, s6
+; CHECK-NEXT:    vmovx.f16 s19, s12
+; CHECK-NEXT:    vins.f16 s12, s12
+; CHECK-NEXT:    vmov.f32 s20, s7
+; CHECK-NEXT:    vins.f16 s6, s10
+; CHECK-NEXT:    vins.f16 s26, s8
+; CHECK-NEXT:    vmov.f32 s17, s12
+; CHECK-NEXT:    vmovx.f16 s23, s15
+; CHECK-NEXT:    vins.f16 s15, s15
+; CHECK-NEXT:    vmov.f32 s24, s6
+; CHECK-NEXT:    vmov.f32 s21, s15
+; CHECK-NEXT:    vmovx.f16 s27, s14
+; CHECK-NEXT:    vins.f16 s14, s14
+; CHECK-NEXT:    vins.f16 s19, s19
+; CHECK-NEXT:    vmov.f32 s25, s14
+; CHECK-NEXT:    vins.f16 s23, s23
+; CHECK-NEXT:    vins.f16 s27, s27
 ; CHECK-NEXT:    vstrb.8 q5, [r1, #48]
-; CHECK-NEXT:    vmov.f32 s17, s6
-; CHECK-NEXT:    vins.f16 s24, s10
-; CHECK-NEXT:    vmov.16 q6[4], r0
-; CHECK-NEXT:    vins.f16 s17, s17
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
-; CHECK-NEXT:    vmov.16 q4[6], r0
-; CHECK-NEXT:    vmov.16 q4[7], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[5]
-; CHECK-NEXT:    vmov.16 q6[5], r0
-; CHECK-NEXT:    vmov.f32 s25, s17
-; CHECK-NEXT:    vmov.f32 s27, s19
 ; CHECK-NEXT:    vstrb.8 q6, [r1, #32]
+; CHECK-NEXT:    vstrb.8 q0, [r1, #16]
+; CHECK-NEXT:    vstrb.8 q4, [r1]
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1044,22 +1032,19 @@ entry:
 define void @vst4_v2f16(<2 x half> *%src, <8 x half> *%dst) {
 ; CHECK-LABEL: vst4_v2f16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldmia r0, {s4, s5}
-; CHECK-NEXT:    vldr s0, [r0, #8]
-; CHECK-NEXT:    vmovx.f16 s12, s4
-; CHECK-NEXT:    vins.f16 s4, s5
-; CHECK-NEXT:    vmov.f32 s1, s0
-; CHECK-NEXT:    vmovx.f16 s14, s0
-; CHECK-NEXT:    vmov q2, q1
+; CHECK-NEXT:    vldmia r0, {s0, s1}
+; CHECK-NEXT:    vldr s4, [r0, #8]
+; CHECK-NEXT:    vmovx.f16 s2, s0
 ; CHECK-NEXT:    vins.f16 s0, s1
-; CHECK-NEXT:    vmovx.f16 s4, s5
-; CHECK-NEXT:    vmov.f32 s9, s0
-; CHECK-NEXT:    vins.f16 s12, s4
-; CHECK-NEXT:    vmovx.f16 s0, s1
-; CHECK-NEXT:    vmov.f32 s10, s12
-; CHECK-NEXT:    vins.f16 s14, s0
-; CHECK-NEXT:    vmov.f32 s11, s14
-; CHECK-NEXT:    vstrh.16 q2, [r1]
+; CHECK-NEXT:    vmov.f32 s5, s4
+; CHECK-NEXT:    vmovx.f16 s3, s4
+; CHECK-NEXT:    vmovx.f16 s8, s1
+; CHECK-NEXT:    vins.f16 s4, s5
+; CHECK-NEXT:    vins.f16 s2, s8
+; CHECK-NEXT:    vmovx.f16 s8, s5
+; CHECK-NEXT:    vins.f16 s3, s8
+; CHECK-NEXT:    vmov.f32 s1, s4
+; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <2 x half>, <2 x half>* %src, i32 0
@@ -1082,40 +1067,40 @@ define void @vst4_v4f16(<4 x half> *%src, <16 x half> *%dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    ldrd lr, r12, [r0]
-; CHECK-NEXT:    ldrd r3, r2, [r0, #8]
-; CHECK-NEXT:    vmov.32 q0[0], lr
+; CHECK-NEXT:    ldrd r2, r12, [r0]
+; CHECK-NEXT:    ldrd r3, lr, [r0, #8]
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    ldrd r2, r0, [r0, #16]
 ; CHECK-NEXT:    vmov.32 q1[0], r3
 ; CHECK-NEXT:    vmov.32 q0[1], r12
-; CHECK-NEXT:    vmov.32 q1[1], r2
-; CHECK-NEXT:    ldrd r2, r0, [r0, #16]
+; CHECK-NEXT:    vmov.32 q1[1], lr
 ; CHECK-NEXT:    vmov.f32 s2, s4
 ; CHECK-NEXT:    vmov.f32 s3, s5
-; CHECK-NEXT:    vmov.f32 s8, s1
-; CHECK-NEXT:    vmovx.f16 s12, s3
-; CHECK-NEXT:    vins.f16 s8, s5
-; CHECK-NEXT:    vmov.32 q1[0], r2
-; CHECK-NEXT:    vmov.32 q1[1], r0
-; CHECK-NEXT:    vmov.f32 s6, s4
-; CHECK-NEXT:    vmov.f32 s7, s5
-; CHECK-NEXT:    vmov.f32 s9, s5
-; CHECK-NEXT:    vins.f16 s9, s5
-; CHECK-NEXT:    vmovx.f16 s10, s1
-; CHECK-NEXT:    vins.f16 s10, s12
-; CHECK-NEXT:    vmovx.f16 s12, s5
-; CHECK-NEXT:    vmovx.f16 s11, s5
-; CHECK-NEXT:    vins.f16 s11, s12
-; CHECK-NEXT:    vstrh.16 q2, [r1, #16]
-; CHECK-NEXT:    vmovx.f16 s8, s0
+; CHECK-NEXT:    vmovx.f16 s12, s0
 ; CHECK-NEXT:    vins.f16 s0, s2
-; CHECK-NEXT:    vmovx.f16 s3, s4
-; CHECK-NEXT:    vins.f16 s4, s6
-; CHECK-NEXT:    vmov.f32 s1, s4
-; CHECK-NEXT:    vmovx.f16 s10, s2
+; CHECK-NEXT:    vmovx.f16 s4, s2
+; CHECK-NEXT:    vins.f16 s12, s4
+; CHECK-NEXT:    vmovx.f16 s6, s1
+; CHECK-NEXT:    vins.f16 s1, s3
+; CHECK-NEXT:    vmovx.f16 s8, s3
+; CHECK-NEXT:    vins.f16 s6, s8
+; CHECK-NEXT:    vmov.32 q2[0], r2
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov.f32 s10, s8
+; CHECK-NEXT:    vmov.f32 s11, s9
+; CHECK-NEXT:    vmovx.f16 s3, s8
 ; CHECK-NEXT:    vins.f16 s8, s10
-; CHECK-NEXT:    vmovx.f16 s4, s6
-; CHECK-NEXT:    vmov.f32 s2, s8
-; CHECK-NEXT:    vins.f16 s3, s4
+; CHECK-NEXT:    vmovx.f16 s14, s10
+; CHECK-NEXT:    vmovx.f16 s7, s9
+; CHECK-NEXT:    vins.f16 s3, s14
+; CHECK-NEXT:    vins.f16 s9, s11
+; CHECK-NEXT:    vmovx.f16 s14, s11
+; CHECK-NEXT:    vins.f16 s7, s14
+; CHECK-NEXT:    vmov.f32 s4, s1
+; CHECK-NEXT:    vmov.f32 s1, s8
+; CHECK-NEXT:    vmov.f32 s5, s9
+; CHECK-NEXT:    vmov.f32 s2, s12
+; CHECK-NEXT:    vstrh.16 q1, [r1, #16]
 ; CHECK-NEXT:    vstrh.16 q0, [r1]
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
@@ -1204,56 +1189,63 @@ entry:
 define void @vst4_v8f16_align1(<8 x half> *%src, <32 x half> *%dst) {
 ; CHECK-LABEL: vst4_v8f16_align1:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #8
+; CHECK-NEXT:    sub sp, #8
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #16]
-; CHECK-NEXT:    vldrw.u32 q5, [r0]
-; CHECK-NEXT:    vmovx.f16 s2, s21
-; CHECK-NEXT:    vins.f16 s21, s17
-; CHECK-NEXT:    vmov.f32 s0, s21
-; CHECK-NEXT:    vmovx.f16 s4, s17
-; CHECK-NEXT:    vins.f16 s2, s4
-; CHECK-NEXT:    vmovx.f16 s8, s20
-; CHECK-NEXT:    vmovx.f16 s4, s16
-; CHECK-NEXT:    vins.f16 s20, s16
-; CHECK-NEXT:    vins.f16 s8, s4
-; CHECK-NEXT:    vmov q1, q5
-; CHECK-NEXT:    vmov.f32 s6, s8
-; CHECK-NEXT:    vmovx.f16 s10, s23
-; CHECK-NEXT:    vins.f16 s23, s19
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmovx.f16 s30, s5
+; CHECK-NEXT:    vmovx.f16 s8, s17
+; CHECK-NEXT:    vins.f16 s5, s17
+; CHECK-NEXT:    vins.f16 s30, s8
+; CHECK-NEXT:    vmovx.f16 s0, s4
+; CHECK-NEXT:    vmovx.f16 s8, s16
+; CHECK-NEXT:    vins.f16 s4, s16
+; CHECK-NEXT:    vins.f16 s0, s8
+; CHECK-NEXT:    vmovx.f16 s10, s7
 ; CHECK-NEXT:    vmovx.f16 s12, s19
-; CHECK-NEXT:    vmov.f32 s8, s23
-; CHECK-NEXT:    vmovx.f16 s16, s18
+; CHECK-NEXT:    vins.f16 s7, s19
 ; CHECK-NEXT:    vins.f16 s10, s12
-; CHECK-NEXT:    vmovx.f16 s14, s22
-; CHECK-NEXT:    vins.f16 s22, s18
-; CHECK-NEXT:    vmov.f32 s12, s22
+; CHECK-NEXT:    vmovx.f16 s14, s6
+; CHECK-NEXT:    vmovx.f16 s16, s18
+; CHECK-NEXT:    vins.f16 s6, s18
 ; CHECK-NEXT:    vins.f16 s14, s16
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT:    vmov.f32 s13, s18
+; CHECK-NEXT:    vstr s0, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    vmov.f32 s28, s5
 ; CHECK-NEXT:    vmovx.f16 s24, s17
 ; CHECK-NEXT:    vins.f16 s17, s17
-; CHECK-NEXT:    vins.f16 s24, s24
-; CHECK-NEXT:    vmov q5, q4
-; CHECK-NEXT:    vmovx.f16 s7, s16
-; CHECK-NEXT:    vmov.f32 s23, s24
+; CHECK-NEXT:    vmovx.f16 s23, s16
 ; CHECK-NEXT:    vins.f16 s16, s16
-; CHECK-NEXT:    vmov.f32 s5, s16
 ; CHECK-NEXT:    vmovx.f16 s11, s19
 ; CHECK-NEXT:    vins.f16 s19, s19
-; CHECK-NEXT:    vins.f16 s13, s13
-; CHECK-NEXT:    vmov.f32 s9, s19
 ; CHECK-NEXT:    vmovx.f16 s15, s18
-; CHECK-NEXT:    vmov.f32 s1, s21
-; CHECK-NEXT:    vins.f16 s7, s7
+; CHECK-NEXT:    vins.f16 s18, s18
+; CHECK-NEXT:    vins.f16 s24, s24
+; CHECK-NEXT:    vmov q0, q4
+; CHECK-NEXT:    vmov.f32 s3, s24
+; CHECK-NEXT:    vmov q6, q1
+; CHECK-NEXT:    vmov.f32 s29, s1
 ; CHECK-NEXT:    vins.f16 s11, s11
 ; CHECK-NEXT:    vins.f16 s15, s15
+; CHECK-NEXT:    vins.f16 s23, s23
+; CHECK-NEXT:    vmov.f32 s8, s7
+; CHECK-NEXT:    vmov.f32 s12, s6
+; CHECK-NEXT:    vmov.f32 s9, s19
+; CHECK-NEXT:    vmov.f32 s13, s18
 ; CHECK-NEXT:    vstrb.8 q2, [r1, #48]
+; CHECK-NEXT:    vmov.f32 s31, s3
+; CHECK-NEXT:    vldr s0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    vmov.f32 s21, s16
 ; CHECK-NEXT:    vstrb.8 q3, [r1, #32]
-; CHECK-NEXT:    vmov.f32 s3, s23
-; CHECK-NEXT:    vstrb.8 q1, [r1]
-; CHECK-NEXT:    vstrb.8 q0, [r1, #16]
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12}
+; CHECK-NEXT:    vmov.f32 s26, s0
+; CHECK-NEXT:    vstrb.8 q7, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s25, s16
+; CHECK-NEXT:    vmov.f32 s27, s23
+; CHECK-NEXT:    vstrb.8 q6, [r1]
+; CHECK-NEXT:    add sp, #8
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 entry:
   %s1 = getelementptr <8 x half>, <8 x half>* %src, i32 0


        


More information about the llvm-commits mailing list