[llvm] f8abecf - [ARM] Extra MVE select(binop) patterns

David Green via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 22 06:08:43 PDT 2020


Author: David Green
Date: 2020-07-22T14:08:29+01:00
New Revision: f8abecf3379de841c436f353f060929722af8602

URL: https://github.com/llvm/llvm-project/commit/f8abecf3379de841c436f353f060929722af8602
DIFF: https://github.com/llvm/llvm-project/commit/f8abecf3379de841c436f353f060929722af8602.diff

LOG: [ARM] Extra MVE select(binop) patterns

This is very similar to 243970d03cace2, but handling a slightly
different form of predicated operations. When starting with a pattern of
the form select(p, BinOp(x, y), x), Instcombine will often transform
this to BinOp(x, select(p, y, 0)), where 0 is the identity value of the
binop (0 for adds/subs, 1 for muls, -1 for ands etc). This adds the
patterns that transforms those back into predicated binary operations.

There is also a very minor adjustment to tablegen null_frag in here, to
allow it to also be recognized as a PatLeaf node, so that it can be used
in MVE_TwoOpPattern to easily exclude the cases where we do not need the
alternate transform.

Differential Revision: https://reviews.llvm.org/D84091

Added: 
    

Modified: 
    llvm/lib/Target/ARM/ARMInstrInfo.td
    llvm/lib/Target/ARM/ARMInstrMVE.td
    llvm/lib/Target/ARM/ARMInstrNEON.td
    llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
    llvm/utils/TableGen/CodeGenDAGPatterns.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index da0a836c8f95..db2ab7a97aa6 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -498,6 +498,18 @@ def SubReg_i32_lane : SDNodeXForm<imm, [{
 }]>;
 
 
+def ARMimmAllZerosV: PatLeaf<(bitconvert (v4i32 (ARMvmovImm (i32 0))))>;
+def ARMimmAllZerosD: PatLeaf<(bitconvert (v2i32 (ARMvmovImm (i32 0))))>;
+def ARMimmAllOnesV: PatLeaf<(bitconvert (v16i8 (ARMvmovImm (i32 0xEFF))))>;
+def ARMimmAllOnesD: PatLeaf<(bitconvert (v8i8 (ARMvmovImm (i32 0xEFF))))>;
+
+def ARMimmOneV: PatLeaf<(ARMvmovImm (i32 timm)), [{
+  ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0));
+  unsigned EltBits = 0;
+  uint64_t EltVal = ARM_AM::decodeVMOVModImm(ConstVal->getZExtValue(), EltBits);
+  return (EltBits == N->getValueType(0).getScalarSizeInBits() && EltVal == 0x01);
+}]>;
+
 
 //===----------------------------------------------------------------------===//
 // Operand Definitions.

diff  --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 95f41abf308b..b8c651a81452 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -318,9 +318,9 @@ def MVE_v2f64 : MVEVectorVTInfo<v2f64, ?,     v4i1,  ?,    0b11, "f", ?>;
 def MVE_v16p8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, v8i1, 0b11, "p", 0b0>;
 def MVE_v8p16 : MVEVectorVTInfo<v8i16, v4i32, v8i1,  v4i1, 0b11, "p", 0b1>;
 
-
 multiclass MVE_TwoOpPattern<MVEVectorVTInfo VTI, PatFrag Op, Intrinsic PredInt,
-                            dag PredOperands, Instruction Inst> {
+                            dag PredOperands, Instruction Inst,
+                            SDPatternOperator IdentityVec = null_frag> {
   // Unpredicated
   def : Pat<(VTI.Vec (Op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))),
             (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
@@ -334,6 +334,15 @@ multiclass MVE_TwoOpPattern<MVEVectorVTInfo VTI, PatFrag Op, Intrinsic PredInt,
               (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
                               ARMVCCThen, (VTI.Pred VCCR:$mask),
                               (VTI.Vec MQPR:$inactive)))>;
+
+    // Optionally with the select folded through the op
+    def : Pat<(VTI.Vec (Op (VTI.Vec MQPR:$Qm),
+                           (VTI.Vec (vselect (VTI.Pred VCCR:$mask),
+                                             (VTI.Vec MQPR:$Qn),
+                                             (VTI.Vec IdentityVec))))),
+              (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+                              ARMVCCThen, (VTI.Pred VCCR:$mask),
+                              (VTI.Vec MQPR:$Qm)))>;
   }
 
   // Predicated with intrinsic
@@ -346,7 +355,8 @@ multiclass MVE_TwoOpPattern<MVEVectorVTInfo VTI, PatFrag Op, Intrinsic PredInt,
 }
 
 multiclass MVE_TwoOpPatternDup<MVEVectorVTInfo VTI, PatFrag Op, Intrinsic PredInt,
-                               dag PredOperands, Instruction Inst> {
+                               dag PredOperands, Instruction Inst,
+                               SDPatternOperator IdentityVec = null_frag> {
   // Unpredicated
   def : Pat<(VTI.Vec (Op (VTI.Vec MQPR:$Qm), (VTI.Vec (ARMvdup rGPR:$Rn)))),
             (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), rGPR:$Rn))>;
@@ -360,6 +370,15 @@ multiclass MVE_TwoOpPatternDup<MVEVectorVTInfo VTI, PatFrag Op, Intrinsic PredIn
               (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), rGPR:$Rn,
                               ARMVCCThen, (VTI.Pred VCCR:$mask),
                               (VTI.Vec MQPR:$inactive)))>;
+
+    // Optionally with the select folded through the op
+    def : Pat<(VTI.Vec (Op (VTI.Vec MQPR:$Qm),
+                           (VTI.Vec (vselect (VTI.Pred VCCR:$mask),
+                                             (ARMvdup rGPR:$Rn),
+                                             (VTI.Vec IdentityVec))))),
+              (VTI.Vec (Inst (VTI.Vec MQPR:$Qm), rGPR:$Rn,
+                              ARMVCCThen, (VTI.Pred VCCR:$mask),
+                              (VTI.Vec MQPR:$Qm)))>;
   }
 
   // Predicated with intrinsic
@@ -1492,20 +1511,20 @@ foreach s=["s8", "s16", "s32", "u8", "u16", "u32", "i8", "i16", "i32", "f16", "f
 }
 
 let Predicates = [HasMVEInt] in {
-  defm : MVE_TwoOpPattern<MVE_v16i8, and, int_arm_mve_and_predicated, (? ), MVE_VAND>;
-  defm : MVE_TwoOpPattern<MVE_v8i16, and, int_arm_mve_and_predicated, (? ), MVE_VAND>;
-  defm : MVE_TwoOpPattern<MVE_v4i32, and, int_arm_mve_and_predicated, (? ), MVE_VAND>;
-  defm : MVE_TwoOpPattern<MVE_v2i64, and, int_arm_mve_and_predicated, (? ), MVE_VAND>;
+  defm : MVE_TwoOpPattern<MVE_v16i8, and, int_arm_mve_and_predicated, (? ), MVE_VAND, ARMimmAllOnesV>;
+  defm : MVE_TwoOpPattern<MVE_v8i16, and, int_arm_mve_and_predicated, (? ), MVE_VAND, ARMimmAllOnesV>;
+  defm : MVE_TwoOpPattern<MVE_v4i32, and, int_arm_mve_and_predicated, (? ), MVE_VAND, ARMimmAllOnesV>;
+  defm : MVE_TwoOpPattern<MVE_v2i64, and, int_arm_mve_and_predicated, (? ), MVE_VAND, ARMimmAllOnesV>;
 
-  defm : MVE_TwoOpPattern<MVE_v16i8, or, int_arm_mve_orr_predicated, (? ), MVE_VORR>;
-  defm : MVE_TwoOpPattern<MVE_v8i16, or, int_arm_mve_orr_predicated, (? ), MVE_VORR>;
-  defm : MVE_TwoOpPattern<MVE_v4i32, or, int_arm_mve_orr_predicated, (? ), MVE_VORR>;
-  defm : MVE_TwoOpPattern<MVE_v2i64, or, int_arm_mve_orr_predicated, (? ), MVE_VORR>;
+  defm : MVE_TwoOpPattern<MVE_v16i8, or, int_arm_mve_orr_predicated, (? ), MVE_VORR, ARMimmAllZerosV>;
+  defm : MVE_TwoOpPattern<MVE_v8i16, or, int_arm_mve_orr_predicated, (? ), MVE_VORR, ARMimmAllZerosV>;
+  defm : MVE_TwoOpPattern<MVE_v4i32, or, int_arm_mve_orr_predicated, (? ), MVE_VORR, ARMimmAllZerosV>;
+  defm : MVE_TwoOpPattern<MVE_v2i64, or, int_arm_mve_orr_predicated, (? ), MVE_VORR, ARMimmAllZerosV>;
 
-  defm : MVE_TwoOpPattern<MVE_v16i8, xor, int_arm_mve_eor_predicated, (? ), MVE_VEOR>;
-  defm : MVE_TwoOpPattern<MVE_v8i16, xor, int_arm_mve_eor_predicated, (? ), MVE_VEOR>;
-  defm : MVE_TwoOpPattern<MVE_v4i32, xor, int_arm_mve_eor_predicated, (? ), MVE_VEOR>;
-  defm : MVE_TwoOpPattern<MVE_v2i64, xor, int_arm_mve_eor_predicated, (? ), MVE_VEOR>;
+  defm : MVE_TwoOpPattern<MVE_v16i8, xor, int_arm_mve_eor_predicated, (? ), MVE_VEOR, ARMimmAllZerosV>;
+  defm : MVE_TwoOpPattern<MVE_v8i16, xor, int_arm_mve_eor_predicated, (? ), MVE_VEOR, ARMimmAllZerosV>;
+  defm : MVE_TwoOpPattern<MVE_v4i32, xor, int_arm_mve_eor_predicated, (? ), MVE_VEOR, ARMimmAllZerosV>;
+  defm : MVE_TwoOpPattern<MVE_v2i64, xor, int_arm_mve_eor_predicated, (? ), MVE_VEOR, ARMimmAllZerosV>;
 
   defm : MVE_TwoOpPattern<MVE_v16i8, BinOpFrag<(and node:$LHS, (vnotq node:$RHS))>,
                           int_arm_mve_bic_predicated, (? ), MVE_VBIC>;
@@ -1775,7 +1794,7 @@ multiclass MVE_VMUL_m<MVEVectorVTInfo VTI> {
 
   let Predicates = [HasMVEInt] in {
     defm : MVE_TwoOpPattern<VTI, mul, int_arm_mve_mul_predicated, (? ),
-                            !cast<Instruction>(NAME)>;
+                            !cast<Instruction>(NAME), ARMimmOneV>;
   }
 }
 
@@ -1849,7 +1868,7 @@ multiclass MVE_VADDSUB_m<string iname, MVEVectorVTInfo VTI, bit subtract,
   defvar Inst = !cast<Instruction>(NAME);
 
   let Predicates = [HasMVEInt] in {
-    defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? ), !cast<Instruction>(NAME)>;
+    defm : MVE_TwoOpPattern<VTI, Op, PredInt, (? ), !cast<Instruction>(NAME), ARMimmAllZerosV>;
   }
 }
 
@@ -4984,7 +5003,7 @@ multiclass MVE_VADDSUB_qr_m<string iname, MVEVectorVTInfo VTI, bit subtract,
                             SDNode Op, Intrinsic PredInt> {
   def "" : MVE_VADDSUB_qr<iname, VTI.Suffix, VTI.Size, 0b0, subtract, 0b1, 0b0>;
   let Predicates = [HasMVEInt] in {
-    defm : MVE_TwoOpPatternDup<VTI, Op, PredInt, (? ), !cast<Instruction>(NAME)>;
+    defm : MVE_TwoOpPatternDup<VTI, Op, PredInt, (? ), !cast<Instruction>(NAME), ARMimmAllZerosV>;
   }
 }
 
@@ -5270,7 +5289,7 @@ class MVE_VMUL_qr_int<string iname, string suffix, bits<2> size>
 multiclass MVE_VMUL_qr_int_m<MVEVectorVTInfo VTI> {
   def "" : MVE_VMUL_qr_int<"vmul", VTI.Suffix, VTI.Size>;
   defm : MVE_TwoOpPatternDup<VTI, mul, int_arm_mve_mul_predicated, (? ),
-                             !cast<Instruction>(NAME)>;
+                             !cast<Instruction>(NAME), ARMimmOneV>;
 }
 
 defm MVE_VMUL_qr_i8  : MVE_VMUL_qr_int_m<MVE_v16i8>;
@@ -6864,7 +6883,7 @@ class MVE_vector_load_typed<ValueType Ty, Instruction RegImmInst,
 
 class MVE_vector_maskedload_typed<ValueType Ty, Instruction RegImmInst,
                                   PatFrag LoadKind, int shift>
-  : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr, VCCR:$pred, (Ty NEONimmAllZerosV))),
+  : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr, VCCR:$pred, (Ty (ARMvmovImm (i32 0))))),
         (Ty (RegImmInst t2addrmode_imm7<shift>:$addr, ARMVCCThen, VCCR:$pred))>;
 
 multiclass MVE_vector_load<Instruction RegImmInst, PatFrag LoadKind,
@@ -7031,11 +7050,11 @@ multiclass MVEExtLoadStore<Instruction LoadSInst, Instruction LoadUInst, string
             (VT (LoadUInst taddrmode_imm7<Shift>:$addr))>;
 
   // Masked ext loads
-  def : Pat<(VT (!cast<PatFrag>("aligned_extmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT NEONimmAllZerosV))),
+  def : Pat<(VT (!cast<PatFrag>("aligned_extmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT (ARMvmovImm (i32 0))))),
             (VT (LoadUInst taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred))>;
-  def : Pat<(VT (!cast<PatFrag>("aligned_sextmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT NEONimmAllZerosV))),
+  def : Pat<(VT (!cast<PatFrag>("aligned_sextmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT (ARMvmovImm (i32 0))))),
             (VT (LoadSInst taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred))>;
-  def : Pat<(VT (!cast<PatFrag>("aligned_zextmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT NEONimmAllZerosV))),
+  def : Pat<(VT (!cast<PatFrag>("aligned_zextmaskedload"#Amble) taddrmode_imm7<Shift>:$addr, VCCR:$pred, (VT (ARMvmovImm (i32 0))))),
             (VT (LoadUInst taddrmode_imm7<Shift>:$addr, ARMVCCThen, VCCR:$pred))>;
 }
 

diff  --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td
index c097a4ad4fac..b05a38803553 100644
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -534,20 +534,6 @@ def NEONvtbl1     : SDNode<"ARMISD::VTBL1", SDTARMVTBL1>;
 def NEONvtbl2     : SDNode<"ARMISD::VTBL2", SDTARMVTBL2>;
 
 
-def NEONimmAllZerosV: PatLeaf<(ARMvmovImm (i32 timm)), [{
-  ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0));
-  unsigned EltBits = 0;
-  uint64_t EltVal = ARM_AM::decodeVMOVModImm(ConstVal->getZExtValue(), EltBits);
-  return (EltBits == 32 && EltVal == 0);
-}]>;
-
-def NEONimmAllOnesV: PatLeaf<(ARMvmovImm (i32 timm)), [{
-  ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0));
-  unsigned EltBits = 0;
-  uint64_t EltVal = ARM_AM::decodeVMOVModImm(ConstVal->getZExtValue(), EltBits);
-  return (EltBits == 8 && EltVal == 0xff);
-}]>;
-
 //===----------------------------------------------------------------------===//
 // NEON load / store instructions
 //===----------------------------------------------------------------------===//
@@ -5273,9 +5259,9 @@ def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vm",
 // Vector Bitwise Operations.
 
 def vnotd : PatFrag<(ops node:$in),
-                    (xor node:$in, (bitconvert (v8i8 NEONimmAllOnesV)))>;
+                    (xor node:$in, ARMimmAllOnesD)>;
 def vnotq : PatFrag<(ops node:$in),
-                    (xor node:$in, (bitconvert (v16i8 NEONimmAllOnesV)))>;
+                    (xor node:$in, ARMimmAllOnesV)>;
 
 
 //   VAND     : Vector Bitwise AND
@@ -6054,9 +6040,9 @@ defm VQABS    : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0,
 // Vector Negate.
 
 def vnegd  : PatFrag<(ops node:$in),
-                     (sub (bitconvert (v2i32 NEONimmAllZerosV)), node:$in)>;
+                     (sub ARMimmAllZerosD, node:$in)>;
 def vnegq  : PatFrag<(ops node:$in),
-                     (sub (bitconvert (v4i32 NEONimmAllZerosV)), node:$in)>;
+                     (sub ARMimmAllZerosV, node:$in)>;
 
 class VNEGD<bits<2> size, string OpcodeStr, string Dt, ValueType Ty>
   : N2V<0b11, 0b11, size, 0b01, 0b00111, 0, 0, (outs DPR:$Vd), (ins DPR:$Vm),
@@ -6270,11 +6256,11 @@ defm : NEONImmReplicateInstAlias<i32, VMOVv2i32, VMOVv4i32,
 
 let AddedComplexity = 50, isAsCheapAsAMove = 1, isReMaterializable = 1 in {
   def VMOVD0 : ARMPseudoExpand<(outs DPR:$Vd), (ins), 4, IIC_VMOVImm,
-                               [(set DPR:$Vd, (v2i32 NEONimmAllZerosV))],
+                               [(set DPR:$Vd, (v2i32 ARMimmAllZerosD))],
                                (VMOVv2i32 DPR:$Vd, 0, (ops 14, zero_reg))>,
                Requires<[HasZCZ]>;
   def VMOVQ0 : ARMPseudoExpand<(outs QPR:$Vd), (ins), 4, IIC_VMOVImm,
-                               [(set QPR:$Vd, (v4i32 NEONimmAllZerosV))],
+                               [(set QPR:$Vd, (v4i32 ARMimmAllZerosV))],
                                (VMOVv4i32 QPR:$Vd, 0, (ops 14, zero_reg))>,
                Requires<[HasZCZ]>;
 }

diff  --git a/llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll b/llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll
index 00b6b8b5e5d9..df92d30da6af 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-selectop3.ll
@@ -4,10 +4,9 @@
 define arm_aapcs_vfpcc <4 x i32> @add_v4i32_x(<4 x i32> %x, <4 x i32> %y, i32 %n) {
 ; CHECK-LABEL: add_v4i32_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q2, #0x0
 ; CHECK-NEXT:    vctp.32 r0
-; CHECK-NEXT:    vpsel q1, q1, q2
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vaddt.i32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@@ -19,10 +18,9 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @add_v8i16_x(<8 x i16> %x, <8 x i16> %y, i32 %n) {
 ; CHECK-LABEL: add_v8i16_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q2, #0x0
 ; CHECK-NEXT:    vctp.16 r0
-; CHECK-NEXT:    vpsel q1, q1, q2
-; CHECK-NEXT:    vadd.i16 q0, q1, q0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vaddt.i16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
@@ -34,10 +32,9 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @add_v16i8_x(<16 x i8> %x, <16 x i8> %y, i32 %n) {
 ; CHECK-LABEL: add_v16i8_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q2, #0x0
 ; CHECK-NEXT:    vctp.8 r0
-; CHECK-NEXT:    vpsel q1, q1, q2
-; CHECK-NEXT:    vadd.i8 q0, q1, q0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vaddt.i8 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
@@ -49,10 +46,9 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @sub_v4i32_x(<4 x i32> %x, <4 x i32> %y, i32 %n) {
 ; CHECK-LABEL: sub_v4i32_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q2, #0x0
 ; CHECK-NEXT:    vctp.32 r0
-; CHECK-NEXT:    vpsel q1, q1, q2
-; CHECK-NEXT:    vsub.i32 q0, q0, q1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vsubt.i32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@@ -64,10 +60,9 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @sub_v8i16_x(<8 x i16> %x, <8 x i16> %y, i32 %n) {
 ; CHECK-LABEL: sub_v8i16_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q2, #0x0
 ; CHECK-NEXT:    vctp.16 r0
-; CHECK-NEXT:    vpsel q1, q1, q2
-; CHECK-NEXT:    vsub.i16 q0, q0, q1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vsubt.i16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
@@ -79,10 +74,9 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @sub_v16i8_x(<16 x i8> %x, <16 x i8> %y, i32 %n) {
 ; CHECK-LABEL: sub_v16i8_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q2, #0x0
 ; CHECK-NEXT:    vctp.8 r0
-; CHECK-NEXT:    vpsel q1, q1, q2
-; CHECK-NEXT:    vsub.i8 q0, q0, q1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vsubt.i8 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
@@ -94,10 +88,9 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @mul_v4i32_x(<4 x i32> %x, <4 x i32> %y, i32 %n) {
 ; CHECK-LABEL: mul_v4i32_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q2, #0x1
 ; CHECK-NEXT:    vctp.32 r0
-; CHECK-NEXT:    vpsel q1, q1, q2
-; CHECK-NEXT:    vmul.i32 q0, q1, q0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmult.i32 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@@ -109,10 +102,9 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @mul_v8i16_x(<8 x i16> %x, <8 x i16> %y, i32 %n) {
 ; CHECK-LABEL: mul_v8i16_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i16 q2, #0x1
 ; CHECK-NEXT:    vctp.16 r0
-; CHECK-NEXT:    vpsel q1, q1, q2
-; CHECK-NEXT:    vmul.i16 q0, q1, q0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmult.i16 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
@@ -124,10 +116,9 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @mul_v16i8_x(<16 x i8> %x, <16 x i8> %y, i32 %n) {
 ; CHECK-LABEL: mul_v16i8_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i8 q2, #0x1
 ; CHECK-NEXT:    vctp.8 r0
-; CHECK-NEXT:    vpsel q1, q1, q2
-; CHECK-NEXT:    vmul.i8 q0, q1, q0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmult.i8 q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
@@ -139,10 +130,9 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @and_v4i32_x(<4 x i32> %x, <4 x i32> %y, i32 %n) {
 ; CHECK-LABEL: and_v4i32_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i8 q2, #0xff
 ; CHECK-NEXT:    vctp.32 r0
-; CHECK-NEXT:    vpsel q1, q1, q2
-; CHECK-NEXT:    vand q0, q1, q0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vandt q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@@ -154,10 +144,9 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @and_v8i16_x(<8 x i16> %x, <8 x i16> %y, i32 %n) {
 ; CHECK-LABEL: and_v8i16_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i8 q2, #0xff
 ; CHECK-NEXT:    vctp.16 r0
-; CHECK-NEXT:    vpsel q1, q1, q2
-; CHECK-NEXT:    vand q0, q1, q0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vandt q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
@@ -169,10 +158,9 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @and_v16i8_x(<16 x i8> %x, <16 x i8> %y, i32 %n) {
 ; CHECK-LABEL: and_v16i8_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i8 q2, #0xff
 ; CHECK-NEXT:    vctp.8 r0
-; CHECK-NEXT:    vpsel q1, q1, q2
-; CHECK-NEXT:    vand q0, q1, q0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vandt q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
@@ -184,10 +172,9 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @or_v4i32_x(<4 x i32> %x, <4 x i32> %y, i32 %n) {
 ; CHECK-LABEL: or_v4i32_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q2, #0x0
 ; CHECK-NEXT:    vctp.32 r0
-; CHECK-NEXT:    vpsel q1, q1, q2
-; CHECK-NEXT:    vorr q0, q1, q0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vorrt q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@@ -199,10 +186,9 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @or_v8i16_x(<8 x i16> %x, <8 x i16> %y, i32 %n) {
 ; CHECK-LABEL: or_v8i16_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q2, #0x0
 ; CHECK-NEXT:    vctp.16 r0
-; CHECK-NEXT:    vpsel q1, q1, q2
-; CHECK-NEXT:    vorr q0, q1, q0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vorrt q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
@@ -214,10 +200,9 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @or_v16i8_x(<16 x i8> %x, <16 x i8> %y, i32 %n) {
 ; CHECK-LABEL: or_v16i8_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q2, #0x0
 ; CHECK-NEXT:    vctp.8 r0
-; CHECK-NEXT:    vpsel q1, q1, q2
-; CHECK-NEXT:    vorr q0, q1, q0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vorrt q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
@@ -229,10 +214,9 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @xor_v4i32_x(<4 x i32> %x, <4 x i32> %y, i32 %n) {
 ; CHECK-LABEL: xor_v4i32_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q2, #0x0
 ; CHECK-NEXT:    vctp.32 r0
-; CHECK-NEXT:    vpsel q1, q1, q2
-; CHECK-NEXT:    veor q0, q1, q0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    veort q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@@ -244,10 +228,9 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @xor_v8i16_x(<8 x i16> %x, <8 x i16> %y, i32 %n) {
 ; CHECK-LABEL: xor_v8i16_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q2, #0x0
 ; CHECK-NEXT:    vctp.16 r0
-; CHECK-NEXT:    vpsel q1, q1, q2
-; CHECK-NEXT:    veor q0, q1, q0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    veort q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
@@ -259,10 +242,9 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @xor_v16i8_x(<16 x i8> %x, <16 x i8> %y, i32 %n) {
 ; CHECK-LABEL: xor_v16i8_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q2, #0x0
 ; CHECK-NEXT:    vctp.8 r0
-; CHECK-NEXT:    vpsel q1, q1, q2
-; CHECK-NEXT:    veor q0, q1, q0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    veort q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
@@ -274,11 +256,10 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @andnot_v4i32_x(<4 x i32> %x, <4 x i32> %y, i32 %n) {
 ; CHECK-LABEL: andnot_v4i32_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i8 q2, #0xff
+; CHECK-NEXT:    vmvn q1, q1
 ; CHECK-NEXT:    vctp.32 r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    veort q2, q1, q2
-; CHECK-NEXT:    vand q0, q2, q0
+; CHECK-NEXT:    vandt q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@@ -291,11 +272,10 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @andnot_v8i16_x(<8 x i16> %x, <8 x i16> %y, i32 %n) {
 ; CHECK-LABEL: andnot_v8i16_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i8 q2, #0xff
+; CHECK-NEXT:    vmvn q1, q1
 ; CHECK-NEXT:    vctp.16 r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    veort q2, q1, q2
-; CHECK-NEXT:    vand q0, q2, q0
+; CHECK-NEXT:    vandt q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
@@ -308,11 +288,10 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @andnot_v16i8_x(<16 x i8> %x, <16 x i8> %y, i32 %n) {
 ; CHECK-LABEL: andnot_v16i8_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i8 q2, #0xff
+; CHECK-NEXT:    vmvn q1, q1
 ; CHECK-NEXT:    vctp.8 r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    veort q2, q1, q2
-; CHECK-NEXT:    vand q0, q2, q0
+; CHECK-NEXT:    vandt q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
@@ -325,12 +304,10 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @ornot_v4i32_x(<4 x i32> %x, <4 x i32> %y, i32 %n) {
 ; CHECK-LABEL: ornot_v4i32_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q2, #0x0
-; CHECK-NEXT:    vmov.i8 q3, #0xff
+; CHECK-NEXT:    vmvn q1, q1
 ; CHECK-NEXT:    vctp.32 r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    veort q2, q1, q3
-; CHECK-NEXT:    vorr q0, q2, q0
+; CHECK-NEXT:    vorrt q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@@ -343,12 +320,10 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @ornot_v8i16_x(<8 x i16> %x, <8 x i16> %y, i32 %n) {
 ; CHECK-LABEL: ornot_v8i16_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q2, #0x0
-; CHECK-NEXT:    vmov.i8 q3, #0xff
+; CHECK-NEXT:    vmvn q1, q1
 ; CHECK-NEXT:    vctp.16 r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    veort q2, q1, q3
-; CHECK-NEXT:    vorr q0, q2, q0
+; CHECK-NEXT:    vorrt q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
@@ -361,12 +336,10 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @ornot_v16i8_x(<16 x i8> %x, <16 x i8> %y, i32 %n) {
 ; CHECK-LABEL: ornot_v16i8_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q2, #0x0
-; CHECK-NEXT:    vmov.i8 q3, #0xff
+; CHECK-NEXT:    vmvn q1, q1
 ; CHECK-NEXT:    vctp.8 r0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    veort q2, q1, q3
-; CHECK-NEXT:    vorr q0, q2, q0
+; CHECK-NEXT:    vorrt q0, q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
@@ -871,11 +844,9 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @addqr_v4i32_x(<4 x i32> %x, i32 %y, i32 %n) {
 ; CHECK-LABEL: addqr_v4i32_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    vctp.32 r1
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vdupt.32 q1, r0
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vaddt.i32 q0, q0, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@@ -889,11 +860,9 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @addqr_v8i16_x(<8 x i16> %x, i16 %y, i32 %n) {
 ; CHECK-LABEL: addqr_v8i16_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    vctp.16 r1
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vdupt.16 q1, r0
-; CHECK-NEXT:    vadd.i16 q0, q1, q0
+; CHECK-NEXT:    vaddt.i16 q0, q0, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
@@ -907,11 +876,9 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @addqr_v16i8_x(<16 x i8> %x, i8 %y, i32 %n) {
 ; CHECK-LABEL: addqr_v16i8_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    vctp.8 r1
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vdupt.8 q1, r0
-; CHECK-NEXT:    vadd.i8 q0, q1, q0
+; CHECK-NEXT:    vaddt.i8 q0, q0, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
@@ -925,11 +892,9 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @subqr_v4i32_x(<4 x i32> %x, i32 %y, i32 %n) {
 ; CHECK-LABEL: subqr_v4i32_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    vctp.32 r1
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vdupt.32 q1, r0
-; CHECK-NEXT:    vsub.i32 q0, q0, q1
+; CHECK-NEXT:    vsubt.i32 q0, q0, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@@ -943,11 +908,9 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @subqr_v8i16_x(<8 x i16> %x, i16 %y, i32 %n) {
 ; CHECK-LABEL: subqr_v8i16_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    vctp.16 r1
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vdupt.16 q1, r0
-; CHECK-NEXT:    vsub.i16 q0, q0, q1
+; CHECK-NEXT:    vsubt.i16 q0, q0, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
@@ -961,11 +924,9 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @subqr_v16i8_x(<16 x i8> %x, i8 %y, i32 %n) {
 ; CHECK-LABEL: subqr_v16i8_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    vctp.8 r1
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vdupt.8 q1, r0
-; CHECK-NEXT:    vsub.i8 q0, q0, q1
+; CHECK-NEXT:    vsubt.i8 q0, q0, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
@@ -979,11 +940,9 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @mulqr_v4i32_x(<4 x i32> %x, i32 %y, i32 %n) {
 ; CHECK-LABEL: mulqr_v4i32_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q1, #0x1
 ; CHECK-NEXT:    vctp.32 r1
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vdupt.32 q1, r0
-; CHECK-NEXT:    vmul.i32 q0, q1, q0
+; CHECK-NEXT:    vmult.i32 q0, q0, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@@ -997,11 +956,9 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @mulqr_v8i16_x(<8 x i16> %x, i16 %y, i32 %n) {
 ; CHECK-LABEL: mulqr_v8i16_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i16 q1, #0x1
 ; CHECK-NEXT:    vctp.16 r1
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vdupt.16 q1, r0
-; CHECK-NEXT:    vmul.i16 q0, q1, q0
+; CHECK-NEXT:    vmult.i16 q0, q0, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
@@ -1015,11 +972,9 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @mulqr_v16i8_x(<16 x i8> %x, i8 %y, i32 %n) {
 ; CHECK-LABEL: mulqr_v16i8_x:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i8 q1, #0x1
 ; CHECK-NEXT:    vctp.8 r1
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vdupt.8 q1, r0
-; CHECK-NEXT:    vmul.i8 q0, q1, q0
+; CHECK-NEXT:    vmult.i8 q0, q0, r0
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
@@ -1327,10 +1282,10 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @add_v4i32_y(<4 x i32> %x, <4 x i32> %y, i32 %n) {
 ; CHECK-LABEL: add_v4i32_y:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q2, #0x0
 ; CHECK-NEXT:    vctp.32 r0
-; CHECK-NEXT:    vpsel q0, q0, q2
-; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vaddt.i32 q1, q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@@ -1342,10 +1297,10 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @add_v8i16_y(<8 x i16> %x, <8 x i16> %y, i32 %n) {
 ; CHECK-LABEL: add_v8i16_y:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q2, #0x0
 ; CHECK-NEXT:    vctp.16 r0
-; CHECK-NEXT:    vpsel q0, q0, q2
-; CHECK-NEXT:    vadd.i16 q0, q0, q1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vaddt.i16 q1, q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
@@ -1357,10 +1312,10 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @add_v16i8_y(<16 x i8> %x, <16 x i8> %y, i32 %n) {
 ; CHECK-LABEL: add_v16i8_y:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q2, #0x0
 ; CHECK-NEXT:    vctp.8 r0
-; CHECK-NEXT:    vpsel q0, q0, q2
-; CHECK-NEXT:    vadd.i8 q0, q0, q1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vaddt.i8 q1, q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
@@ -1417,10 +1372,10 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @mul_v4i32_y(<4 x i32> %x, <4 x i32> %y, i32 %n) {
 ; CHECK-LABEL: mul_v4i32_y:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q2, #0x1
 ; CHECK-NEXT:    vctp.32 r0
-; CHECK-NEXT:    vpsel q0, q0, q2
-; CHECK-NEXT:    vmul.i32 q0, q0, q1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmult.i32 q1, q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@@ -1432,10 +1387,10 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @mul_v8i16_y(<8 x i16> %x, <8 x i16> %y, i32 %n) {
 ; CHECK-LABEL: mul_v8i16_y:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i16 q2, #0x1
 ; CHECK-NEXT:    vctp.16 r0
-; CHECK-NEXT:    vpsel q0, q0, q2
-; CHECK-NEXT:    vmul.i16 q0, q0, q1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmult.i16 q1, q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
@@ -1447,10 +1402,10 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @mul_v16i8_y(<16 x i8> %x, <16 x i8> %y, i32 %n) {
 ; CHECK-LABEL: mul_v16i8_y:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i8 q2, #0x1
 ; CHECK-NEXT:    vctp.8 r0
-; CHECK-NEXT:    vpsel q0, q0, q2
-; CHECK-NEXT:    vmul.i8 q0, q0, q1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmult.i8 q1, q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
@@ -1462,10 +1417,10 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @and_v4i32_y(<4 x i32> %x, <4 x i32> %y, i32 %n) {
 ; CHECK-LABEL: and_v4i32_y:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i8 q2, #0xff
 ; CHECK-NEXT:    vctp.32 r0
-; CHECK-NEXT:    vpsel q0, q0, q2
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vandt q1, q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@@ -1477,10 +1432,10 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @and_v8i16_y(<8 x i16> %x, <8 x i16> %y, i32 %n) {
 ; CHECK-LABEL: and_v8i16_y:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i8 q2, #0xff
 ; CHECK-NEXT:    vctp.16 r0
-; CHECK-NEXT:    vpsel q0, q0, q2
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vandt q1, q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
@@ -1492,10 +1447,10 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @and_v16i8_y(<16 x i8> %x, <16 x i8> %y, i32 %n) {
 ; CHECK-LABEL: and_v16i8_y:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i8 q2, #0xff
 ; CHECK-NEXT:    vctp.8 r0
-; CHECK-NEXT:    vpsel q0, q0, q2
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vandt q1, q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
@@ -1507,10 +1462,10 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @or_v4i32_y(<4 x i32> %x, <4 x i32> %y, i32 %n) {
 ; CHECK-LABEL: or_v4i32_y:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q2, #0x0
 ; CHECK-NEXT:    vctp.32 r0
-; CHECK-NEXT:    vpsel q0, q0, q2
-; CHECK-NEXT:    vorr q0, q0, q1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vorrt q1, q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@@ -1522,10 +1477,10 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @or_v8i16_y(<8 x i16> %x, <8 x i16> %y, i32 %n) {
 ; CHECK-LABEL: or_v8i16_y:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q2, #0x0
 ; CHECK-NEXT:    vctp.16 r0
-; CHECK-NEXT:    vpsel q0, q0, q2
-; CHECK-NEXT:    vorr q0, q0, q1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vorrt q1, q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
@@ -1537,10 +1492,10 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @or_v16i8_y(<16 x i8> %x, <16 x i8> %y, i32 %n) {
 ; CHECK-LABEL: or_v16i8_y:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q2, #0x0
 ; CHECK-NEXT:    vctp.8 r0
-; CHECK-NEXT:    vpsel q0, q0, q2
-; CHECK-NEXT:    vorr q0, q0, q1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vorrt q1, q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
@@ -1552,10 +1507,10 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @xor_v4i32_y(<4 x i32> %x, <4 x i32> %y, i32 %n) {
 ; CHECK-LABEL: xor_v4i32_y:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q2, #0x0
 ; CHECK-NEXT:    vctp.32 r0
-; CHECK-NEXT:    vpsel q0, q0, q2
-; CHECK-NEXT:    veor q0, q0, q1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    veort q1, q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@@ -1567,10 +1522,10 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @xor_v8i16_y(<8 x i16> %x, <8 x i16> %y, i32 %n) {
 ; CHECK-LABEL: xor_v8i16_y:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q2, #0x0
 ; CHECK-NEXT:    vctp.16 r0
-; CHECK-NEXT:    vpsel q0, q0, q2
-; CHECK-NEXT:    veor q0, q0, q1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    veort q1, q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
@@ -1582,10 +1537,10 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @xor_v16i8_y(<16 x i8> %x, <16 x i8> %y, i32 %n) {
 ; CHECK-LABEL: xor_v16i8_y:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q2, #0x0
 ; CHECK-NEXT:    vctp.8 r0
-; CHECK-NEXT:    vpsel q0, q0, q2
-; CHECK-NEXT:    veor q0, q0, q1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    veort q1, q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
@@ -2219,10 +2174,11 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @addqr_v4i32_y(<4 x i32> %x, i32 %y, i32 %n) {
 ; CHECK-LABEL: addqr_v4i32_y:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    vdup.32 q1, r0
 ; CHECK-NEXT:    vctp.32 r1
-; CHECK-NEXT:    vpsel q0, q0, q1
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vaddt.i32 q1, q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@@ -2236,10 +2192,11 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @addqr_v8i16_y(<8 x i16> %x, i16 %y, i32 %n) {
 ; CHECK-LABEL: addqr_v8i16_y:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    vdup.16 q1, r0
 ; CHECK-NEXT:    vctp.16 r1
-; CHECK-NEXT:    vpsel q0, q0, q1
-; CHECK-NEXT:    vadd.i16 q0, q0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vaddt.i16 q1, q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
@@ -2253,10 +2210,11 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @addqr_v16i8_y(<16 x i8> %x, i8 %y, i32 %n) {
 ; CHECK-LABEL: addqr_v16i8_y:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    vdup.8 q1, r0
 ; CHECK-NEXT:    vctp.8 r1
-; CHECK-NEXT:    vpsel q0, q0, q1
-; CHECK-NEXT:    vadd.i8 q0, q0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vaddt.i8 q1, q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)
@@ -2324,10 +2282,11 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @mulqr_v4i32_y(<4 x i32> %x, i32 %y, i32 %n) {
 ; CHECK-LABEL: mulqr_v4i32_y:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i32 q1, #0x1
+; CHECK-NEXT:    vdup.32 q1, r0
 ; CHECK-NEXT:    vctp.32 r1
-; CHECK-NEXT:    vpsel q0, q0, q1
-; CHECK-NEXT:    vmul.i32 q0, q0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmult.i32 q1, q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <4 x i1> @llvm.arm.mve.vctp32(i32 %n)
@@ -2341,10 +2300,11 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @mulqr_v8i16_y(<8 x i16> %x, i16 %y, i32 %n) {
 ; CHECK-LABEL: mulqr_v8i16_y:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i16 q1, #0x1
+; CHECK-NEXT:    vdup.16 q1, r0
 ; CHECK-NEXT:    vctp.16 r1
-; CHECK-NEXT:    vpsel q0, q0, q1
-; CHECK-NEXT:    vmul.i16 q0, q0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmult.i16 q1, q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
@@ -2358,10 +2318,11 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @mulqr_v16i8_y(<16 x i8> %x, i8 %y, i32 %n) {
 ; CHECK-LABEL: mulqr_v16i8_y:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.i8 q1, #0x1
+; CHECK-NEXT:    vdup.8 q1, r0
 ; CHECK-NEXT:    vctp.8 r1
-; CHECK-NEXT:    vpsel q0, q0, q1
-; CHECK-NEXT:    vmul.i8 q0, q0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmult.i8 q1, q1, q0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    bx lr
 entry:
   %c = call <16 x i1> @llvm.arm.mve.vctp8(i32 %n)

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
index 918e2db6e96e..c8b6dd0b3a4f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
@@ -214,55 +214,54 @@ entry:
 define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %b) {
 ; CHECK-LABEL: add_v8i16_v8i32_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.i8 q2, #0x0
 ; CHECK-NEXT:    vmov.i8 q3, #0xff
 ; CHECK-NEXT:    vcmp.i16 eq, q1, zr
+; CHECK-NEXT:    vmov.i32 q4, #0xffff
 ; CHECK-NEXT:    vpsel q1, q3, q2
-; CHECK-NEXT:    vmov.i32 q3, #0xffff
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
 ; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
 ; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
 ; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
 ; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmov.32 q5[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
-; CHECK-NEXT:    vmov.32 q5[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
 ; CHECK-NEXT:    vcmp.i32 ne, q2, zr
-; CHECK-NEXT:    vmov.32 q5[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.32 q3[3], r0
 ; CHECK-NEXT:    vmov.i32 q2, #0x0
-; CHECK-NEXT:    vmov.32 q5[3], r0
-; CHECK-NEXT:    vmov q4, q2
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
+; CHECK-NEXT:    vmov.u16 r0, q1[4]
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vandt q4, q5, q3
-; CHECK-NEXT:    vmov.32 q5[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.32 q5[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.32 q5[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.32 q5[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vandt q2, q3, q4
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
 ; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
 ; CHECK-NEXT:    vmov.32 q1[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
 ; CHECK-NEXT:    vmov.32 q1[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
 ; CHECK-NEXT:    vmov.32 q1[3], r0
-; CHECK-NEXT:    vpt.i32 ne, q5, zr
-; CHECK-NEXT:    vandt q2, q1, q3
-; CHECK-NEXT:    vadd.i32 q0, q2, q4
-; CHECK-NEXT:    vaddv.u32 r0, q0
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    vmovlb.u16 q0, q1
+; CHECK-NEXT:    vpt.i32 ne, q3, zr
+; CHECK-NEXT:    vaddt.i32 q2, q2, q0
+; CHECK-NEXT:    vaddv.u32 r0, q2
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <8 x i16> %b, zeroinitializer
@@ -275,54 +274,50 @@ entry:
 define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %b) {
 ; CHECK-LABEL: add_v8i16_v8i32_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.i8 q2, #0x0
 ; CHECK-NEXT:    vmov.i8 q3, #0xff
 ; CHECK-NEXT:    vcmp.i16 eq, q1, zr
 ; CHECK-NEXT:    vpsel q1, q3, q2
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vmov.i32 q3, #0x0
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
 ; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
 ; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
 ; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
 ; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
 ; CHECK-NEXT:    vcmp.i32 ne, q2, zr
 ; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
 ; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
 ; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
 ; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.32 q4[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.32 q4[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.32 q4[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.32 q4[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vmovlb.s16 q2, q2
+; CHECK-NEXT:    vpsel q2, q2, q3
+; CHECK-NEXT:    vmov.32 q3[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.32 q3[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.32 q3[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
 ; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
 ; CHECK-NEXT:    vmov.32 q1[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
 ; CHECK-NEXT:    vmov.32 q1[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
-; CHECK-NEXT:    vmovlb.s16 q3, q2
-; CHECK-NEXT:    vmov.i32 q2, #0x0
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
 ; CHECK-NEXT:    vmov.32 q1[3], r0
-; CHECK-NEXT:    vpsel q3, q3, q2
-; CHECK-NEXT:    vcmp.i32 ne, q4, zr
 ; CHECK-NEXT:    vmovlb.s16 q0, q1
-; CHECK-NEXT:    vpsel q0, q0, q2
-; CHECK-NEXT:    vadd.i32 q0, q0, q3
-; CHECK-NEXT:    vaddv.u32 r0, q0
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vpt.i32 ne, q3, zr
+; CHECK-NEXT:    vaddt.i32 q2, q2, q0
+; CHECK-NEXT:    vaddv.u32 r0, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <8 x i16> %b, zeroinitializer
@@ -744,129 +739,128 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    vmov q2, q0
 ; CHECK-NEXT:    vmov.i8 q0, #0x0
 ; CHECK-NEXT:    vcmp.i8 eq, q1, zr
-; CHECK-NEXT:    vmov.i8 q4, #0xff
-; CHECK-NEXT:    vpsel q6, q4, q0
+; CHECK-NEXT:    vmov.i8 q5, #0xff
+; CHECK-NEXT:    vpsel q7, q5, q0
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vmov.u8 r0, q6[8]
-; CHECK-NEXT:    vmov.i32 q5, #0xff
+; CHECK-NEXT:    vmov.u8 r0, q7[0]
+; CHECK-NEXT:    vmov.i32 q6, #0xff
 ; CHECK-NEXT:    vmov.16 q1[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[9]
+; CHECK-NEXT:    vmov.u8 r0, q7[1]
 ; CHECK-NEXT:    vmov.16 q1[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[10]
+; CHECK-NEXT:    vmov.u8 r0, q7[2]
 ; CHECK-NEXT:    vmov.16 q1[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[11]
+; CHECK-NEXT:    vmov.u8 r0, q7[3]
 ; CHECK-NEXT:    vmov.16 q1[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[12]
+; CHECK-NEXT:    vmov.u8 r0, q7[4]
 ; CHECK-NEXT:    vmov.16 q1[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[13]
+; CHECK-NEXT:    vmov.u8 r0, q7[5]
 ; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[14]
+; CHECK-NEXT:    vmov.u8 r0, q7[6]
 ; CHECK-NEXT:    vmov.16 q1[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[15]
+; CHECK-NEXT:    vmov.u8 r0, q7[7]
 ; CHECK-NEXT:    vmov.16 q1[7], r0
 ; CHECK-NEXT:    vcmp.i16 ne, q1, zr
 ; CHECK-NEXT:    vmov.i32 q1, #0x0
-; CHECK-NEXT:    vpsel q3, q4, q0
-; CHECK-NEXT:    vmov q7, q1
-; CHECK-NEXT:    vmov.u16 r0, q3[4]
+; CHECK-NEXT:    vpsel q4, q5, q0
+; CHECK-NEXT:    vmov q3, q1
+; CHECK-NEXT:    vmov.u16 r0, q4[4]
 ; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[5]
+; CHECK-NEXT:    vmov.u16 r0, q4[5]
 ; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[6]
+; CHECK-NEXT:    vmov.u16 r0, q4[6]
 ; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[7]
+; CHECK-NEXT:    vmov.u16 r0, q4[7]
 ; CHECK-NEXT:    vmov.32 q0[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[12]
+; CHECK-NEXT:    vmov.u8 r0, q2[4]
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
 ; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[13]
+; CHECK-NEXT:    vmov.u8 r0, q2[5]
 ; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[14]
+; CHECK-NEXT:    vmov.u8 r0, q2[6]
 ; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[15]
+; CHECK-NEXT:    vmov.u8 r0, q2[7]
 ; CHECK-NEXT:    vmov.32 q0[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[0]
+; CHECK-NEXT:    vmov.u8 r0, q7[8]
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vandt q7, q0, q5
+; CHECK-NEXT:    vandt q3, q0, q6
 ; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[1]
+; CHECK-NEXT:    vmov.u8 r0, q7[9]
 ; CHECK-NEXT:    vmov.16 q0[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[2]
+; CHECK-NEXT:    vmov.u8 r0, q7[10]
 ; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[3]
+; CHECK-NEXT:    vmov.u8 r0, q7[11]
 ; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[4]
+; CHECK-NEXT:    vmov.u8 r0, q7[12]
 ; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[5]
+; CHECK-NEXT:    vmov.u8 r0, q7[13]
 ; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[6]
+; CHECK-NEXT:    vmov.u8 r0, q7[14]
 ; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[7]
+; CHECK-NEXT:    vmov.u8 r0, q7[15]
 ; CHECK-NEXT:    vmov.16 q0[7], r0
-; CHECK-NEXT:    vmov q6, q1
 ; CHECK-NEXT:    vcmp.i16 ne, q0, zr
 ; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vpsel q4, q4, q0
-; CHECK-NEXT:    vmov.u16 r0, q4[4]
+; CHECK-NEXT:    vpsel q5, q5, q0
+; CHECK-NEXT:    vmov.u16 r0, q5[4]
 ; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[5]
+; CHECK-NEXT:    vmov.u16 r0, q5[5]
 ; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[6]
+; CHECK-NEXT:    vmov.u16 r0, q5[6]
 ; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[7]
+; CHECK-NEXT:    vmov.u16 r0, q5[7]
 ; CHECK-NEXT:    vmov.32 q0[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[4]
+; CHECK-NEXT:    vmov.u8 r0, q2[12]
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
 ; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[5]
+; CHECK-NEXT:    vmov.u8 r0, q2[13]
 ; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[6]
+; CHECK-NEXT:    vmov.u8 r0, q2[14]
 ; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[7]
+; CHECK-NEXT:    vmov.u8 r0, q2[15]
 ; CHECK-NEXT:    vmov.32 q0[3], r0
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vandt q6, q0, q5
-; CHECK-NEXT:    vmov.u16 r0, q3[0]
-; CHECK-NEXT:    vadd.i32 q0, q6, q7
-; CHECK-NEXT:    vmov.32 q6[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[1]
-; CHECK-NEXT:    vmov.32 q6[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[2]
-; CHECK-NEXT:    vmov.32 q6[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[3]
-; CHECK-NEXT:    vmov.32 q6[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[8]
-; CHECK-NEXT:    vcmp.i32 ne, q6, zr
-; CHECK-NEXT:    vmov.32 q6[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[9]
-; CHECK-NEXT:    vmov q3, q1
-; CHECK-NEXT:    vmov.32 q6[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[10]
-; CHECK-NEXT:    vmov.32 q6[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[11]
-; CHECK-NEXT:    vmov.32 q6[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q4[0]
+; CHECK-NEXT:    vand q0, q0, q6
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vandt q3, q6, q5
-; CHECK-NEXT:    vmov.32 q6[0], r0
+; CHECK-NEXT:    vaddt.i32 q3, q3, q0
+; CHECK-NEXT:    vmov.32 q0[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q4[1]
-; CHECK-NEXT:    vmov.32 q6[1], r0
+; CHECK-NEXT:    vmov.32 q0[1], r0
 ; CHECK-NEXT:    vmov.u16 r0, q4[2]
-; CHECK-NEXT:    vmov.32 q6[2], r0
+; CHECK-NEXT:    vmov.32 q0[2], r0
 ; CHECK-NEXT:    vmov.u16 r0, q4[3]
-; CHECK-NEXT:    vmov.32 q6[3], r0
+; CHECK-NEXT:    vmov.32 q0[3], r0
 ; CHECK-NEXT:    vmov.u8 r0, q2[0]
-; CHECK-NEXT:    vmov.32 q4[0], r0
+; CHECK-NEXT:    vcmp.i32 ne, q0, zr
+; CHECK-NEXT:    vmov.32 q0[0], r0
 ; CHECK-NEXT:    vmov.u8 r0, q2[1]
-; CHECK-NEXT:    vmov.32 q4[1], r0
+; CHECK-NEXT:    vmov.32 q0[1], r0
 ; CHECK-NEXT:    vmov.u8 r0, q2[2]
-; CHECK-NEXT:    vmov.32 q4[2], r0
+; CHECK-NEXT:    vmov.32 q0[2], r0
 ; CHECK-NEXT:    vmov.u8 r0, q2[3]
-; CHECK-NEXT:    vmov.32 q4[3], r0
-; CHECK-NEXT:    vpt.i32 ne, q6, zr
-; CHECK-NEXT:    vandt q1, q4, q5
-; CHECK-NEXT:    vadd.i32 q1, q1, q3
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vmov.32 q0[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[0]
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vandt q1, q0, q6
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[1]
+; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[2]
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[3]
+; CHECK-NEXT:    vmov.32 q0[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[8]
+; CHECK-NEXT:    vcmp.i32 ne, q0, zr
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[9]
+; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[10]
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[11]
+; CHECK-NEXT:    vmov.32 q0[3], r0
+; CHECK-NEXT:    vand q0, q0, q6
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vaddt.i32 q1, q1, q0
+; CHECK-NEXT:    vadd.i32 q0, q1, q3
 ; CHECK-NEXT:    vaddv.u32 r0, q0
 ; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
@@ -886,26 +880,27 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vcmp.i8 eq, q1, zr
 ; CHECK-NEXT:    vmov.i8 q3, #0x0
-; CHECK-NEXT:    vmov.i8 q4, #0xff
-; CHECK-NEXT:    vpsel q5, q4, q3
-; CHECK-NEXT:    vmov.u8 r0, q5[8]
+; CHECK-NEXT:    vmov.i8 q5, #0xff
+; CHECK-NEXT:    vmov.i32 q4, #0x0
+; CHECK-NEXT:    vpsel q6, q5, q3
+; CHECK-NEXT:    vmov.u8 r0, q6[0]
 ; CHECK-NEXT:    vmov.16 q1[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q5[9]
+; CHECK-NEXT:    vmov.u8 r0, q6[1]
 ; CHECK-NEXT:    vmov.16 q1[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q5[10]
+; CHECK-NEXT:    vmov.u8 r0, q6[2]
 ; CHECK-NEXT:    vmov.16 q1[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q5[11]
+; CHECK-NEXT:    vmov.u8 r0, q6[3]
 ; CHECK-NEXT:    vmov.16 q1[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q5[12]
+; CHECK-NEXT:    vmov.u8 r0, q6[4]
 ; CHECK-NEXT:    vmov.16 q1[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q5[13]
+; CHECK-NEXT:    vmov.u8 r0, q6[5]
 ; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q5[14]
+; CHECK-NEXT:    vmov.u8 r0, q6[6]
 ; CHECK-NEXT:    vmov.16 q1[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q5[15]
+; CHECK-NEXT:    vmov.u8 r0, q6[7]
 ; CHECK-NEXT:    vmov.16 q1[7], r0
 ; CHECK-NEXT:    vcmp.i16 ne, q1, zr
-; CHECK-NEXT:    vpsel q2, q4, q3
+; CHECK-NEXT:    vpsel q2, q5, q3
 ; CHECK-NEXT:    vmov.u16 r0, q2[4]
 ; CHECK-NEXT:    vmov.32 q1[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q2[5]
@@ -914,55 +909,58 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    vmov.32 q1[2], r0
 ; CHECK-NEXT:    vmov.u16 r0, q2[7]
 ; CHECK-NEXT:    vmov.32 q1[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[12]
+; CHECK-NEXT:    vmov.u8 r0, q0[4]
 ; CHECK-NEXT:    vcmp.i32 ne, q1, zr
 ; CHECK-NEXT:    vmov.32 q1[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[13]
+; CHECK-NEXT:    vmov.u8 r0, q0[5]
 ; CHECK-NEXT:    vmov.32 q1[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[14]
+; CHECK-NEXT:    vmov.u8 r0, q0[6]
 ; CHECK-NEXT:    vmov.32 q1[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[15]
+; CHECK-NEXT:    vmov.u8 r0, q0[7]
 ; CHECK-NEXT:    vmov.32 q1[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q5[0]
+; CHECK-NEXT:    vmov.u8 r0, q6[8]
 ; CHECK-NEXT:    vmov.16 q7[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q5[1]
+; CHECK-NEXT:    vmov.u8 r0, q6[9]
 ; CHECK-NEXT:    vmov.16 q7[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q5[2]
+; CHECK-NEXT:    vmov.u8 r0, q6[10]
 ; CHECK-NEXT:    vmov.16 q7[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q5[3]
+; CHECK-NEXT:    vmov.u8 r0, q6[11]
 ; CHECK-NEXT:    vmov.16 q7[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q5[4]
+; CHECK-NEXT:    vmov.u8 r0, q6[12]
 ; CHECK-NEXT:    vmov.16 q7[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q5[5]
+; CHECK-NEXT:    vmov.u8 r0, q6[13]
 ; CHECK-NEXT:    vmov.16 q7[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q5[6]
+; CHECK-NEXT:    vmov.u8 r0, q6[14]
 ; CHECK-NEXT:    vmovlb.s8 q1, q1
 ; CHECK-NEXT:    vmov.16 q7[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q5[7]
-; CHECK-NEXT:    vmovlb.s16 q6, q1
-; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    vmov.u8 r0, q6[15]
+; CHECK-NEXT:    vmovlb.s16 q1, q1
 ; CHECK-NEXT:    vmov.16 q7[7], r0
-; CHECK-NEXT:    vpsel q6, q6, q1
+; CHECK-NEXT:    vpsel q1, q1, q4
 ; CHECK-NEXT:    vcmp.i16 ne, q7, zr
-; CHECK-NEXT:    vpsel q3, q4, q3
+; CHECK-NEXT:    vpsel q3, q5, q3
 ; CHECK-NEXT:    vmov.u16 r0, q3[4]
-; CHECK-NEXT:    vmov.32 q4[0], r0
+; CHECK-NEXT:    vmov.32 q5[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q3[5]
-; CHECK-NEXT:    vmov.32 q4[1], r0
+; CHECK-NEXT:    vmov.32 q5[1], r0
 ; CHECK-NEXT:    vmov.u16 r0, q3[6]
-; CHECK-NEXT:    vmov.32 q4[2], r0
+; CHECK-NEXT:    vmov.32 q5[2], r0
 ; CHECK-NEXT:    vmov.u16 r0, q3[7]
-; CHECK-NEXT:    vmov.32 q4[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[4]
-; CHECK-NEXT:    vcmp.i32 ne, q4, zr
-; CHECK-NEXT:    vmov.32 q4[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[5]
-; CHECK-NEXT:    vmov.32 q4[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[6]
-; CHECK-NEXT:    vmov.32 q4[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[7]
-; CHECK-NEXT:    vmov.32 q4[3], r0
+; CHECK-NEXT:    vmov.32 q5[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[12]
+; CHECK-NEXT:    vcmp.i32 ne, q5, zr
+; CHECK-NEXT:    vmov.32 q5[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[13]
+; CHECK-NEXT:    vmov.32 q5[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[14]
+; CHECK-NEXT:    vmov.32 q5[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[15]
+; CHECK-NEXT:    vmov.32 q5[3], r0
 ; CHECK-NEXT:    vmov.u16 r0, q2[0]
+; CHECK-NEXT:    vmovlb.s8 q5, q5
+; CHECK-NEXT:    vmovlb.s16 q5, q5
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vaddt.i32 q1, q1, q5
 ; CHECK-NEXT:    vmov.32 q5[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q2[1]
 ; CHECK-NEXT:    vmov.32 q5[1], r0
@@ -970,44 +968,39 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    vmov.32 q5[2], r0
 ; CHECK-NEXT:    vmov.u16 r0, q2[3]
 ; CHECK-NEXT:    vmov.32 q5[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[8]
+; CHECK-NEXT:    vmov.u8 r0, q0[0]
 ; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[9]
+; CHECK-NEXT:    vmov.u8 r0, q0[1]
 ; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[10]
-; CHECK-NEXT:    vmovlb.s8 q4, q4
+; CHECK-NEXT:    vmov.u8 r0, q0[2]
 ; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[11]
-; CHECK-NEXT:    vmovlb.s16 q4, q4
+; CHECK-NEXT:    vmov.u8 r0, q0[3]
 ; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[0]
-; CHECK-NEXT:    vpsel q4, q4, q1
 ; CHECK-NEXT:    vcmp.i32 ne, q5, zr
-; CHECK-NEXT:    vmov.32 q5[0], r0
+; CHECK-NEXT:    vmovlb.s8 q2, q2
+; CHECK-NEXT:    vmov.u16 r0, q3[0]
+; CHECK-NEXT:    vmovlb.s16 q2, q2
+; CHECK-NEXT:    vpsel q2, q2, q4
+; CHECK-NEXT:    vmov.32 q4[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q3[1]
-; CHECK-NEXT:    vmov.32 q5[1], r0
+; CHECK-NEXT:    vmov.32 q4[1], r0
 ; CHECK-NEXT:    vmov.u16 r0, q3[2]
-; CHECK-NEXT:    vmov.32 q5[2], r0
+; CHECK-NEXT:    vmov.32 q4[2], r0
 ; CHECK-NEXT:    vmov.u16 r0, q3[3]
-; CHECK-NEXT:    vmov.32 q5[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[0]
+; CHECK-NEXT:    vmov.32 q4[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[8]
 ; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[1]
+; CHECK-NEXT:    vmov.u8 r0, q0[9]
 ; CHECK-NEXT:    vmov.32 q3[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[2]
+; CHECK-NEXT:    vmov.u8 r0, q0[10]
 ; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[3]
-; CHECK-NEXT:    vmovlb.s8 q2, q2
+; CHECK-NEXT:    vmov.u8 r0, q0[11]
 ; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vmovlb.s16 q2, q2
 ; CHECK-NEXT:    vmovlb.s8 q0, q3
-; CHECK-NEXT:    vpsel q2, q2, q1
-; CHECK-NEXT:    vcmp.i32 ne, q5, zr
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vadd.i32 q4, q4, q6
-; CHECK-NEXT:    vpsel q0, q0, q1
-; CHECK-NEXT:    vadd.i32 q0, q0, q2
-; CHECK-NEXT:    vadd.i32 q0, q0, q4
+; CHECK-NEXT:    vpt.i32 ne, q4, zr
+; CHECK-NEXT:    vaddt.i32 q2, q2, q0
+; CHECK-NEXT:    vadd.i32 q0, q2, q1
 ; CHECK-NEXT:    vaddv.u32 r0, q0
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
@@ -1061,87 +1054,83 @@ entry:
 define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-LABEL: add_v16i8_v16i16_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vcmp.i8 eq, q1, zr
 ; CHECK-NEXT:    vmov.i8 q1, #0x0
 ; CHECK-NEXT:    vmov.i8 q2, #0xff
+; CHECK-NEXT:    vmov.i32 q3, #0x0
 ; CHECK-NEXT:    vpsel q1, q2, q1
-; CHECK-NEXT:    vmov.u8 r0, q1[8]
+; CHECK-NEXT:    vmov.u8 r0, q1[0]
 ; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[9]
+; CHECK-NEXT:    vmov.u8 r0, q1[1]
 ; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[10]
+; CHECK-NEXT:    vmov.u8 r0, q1[2]
 ; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[11]
+; CHECK-NEXT:    vmov.u8 r0, q1[3]
 ; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[12]
+; CHECK-NEXT:    vmov.u8 r0, q1[4]
 ; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[13]
+; CHECK-NEXT:    vmov.u8 r0, q1[5]
 ; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[14]
+; CHECK-NEXT:    vmov.u8 r0, q1[6]
 ; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[15]
+; CHECK-NEXT:    vmov.u8 r0, q1[7]
 ; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[8]
+; CHECK-NEXT:    vmov.u8 r0, q0[0]
 ; CHECK-NEXT:    vcmp.i16 ne, q2, zr
 ; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[9]
+; CHECK-NEXT:    vmov.u8 r0, q0[1]
 ; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[10]
+; CHECK-NEXT:    vmov.u8 r0, q0[2]
 ; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[11]
+; CHECK-NEXT:    vmov.u8 r0, q0[3]
 ; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[12]
+; CHECK-NEXT:    vmov.u8 r0, q0[4]
 ; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[13]
+; CHECK-NEXT:    vmov.u8 r0, q0[5]
 ; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[14]
+; CHECK-NEXT:    vmov.u8 r0, q0[6]
 ; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[15]
+; CHECK-NEXT:    vmov.u8 r0, q0[7]
 ; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[0]
-; CHECK-NEXT:    vmov.16 q4[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[1]
-; CHECK-NEXT:    vmov.16 q4[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[2]
-; CHECK-NEXT:    vmov.16 q4[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[3]
-; CHECK-NEXT:    vmov.16 q4[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[4]
-; CHECK-NEXT:    vmov.16 q4[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[5]
-; CHECK-NEXT:    vmov.16 q4[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[6]
-; CHECK-NEXT:    vmov.16 q4[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[7]
-; CHECK-NEXT:    vmov.16 q4[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[0]
+; CHECK-NEXT:    vmov.u8 r0, q1[8]
+; CHECK-NEXT:    vmovlb.u8 q2, q2
+; CHECK-NEXT:    vpsel q2, q2, q3
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[9]
+; CHECK-NEXT:    vmov.16 q3[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[10]
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[11]
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[12]
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[13]
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[14]
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[15]
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[8]
 ; CHECK-NEXT:    vmov.16 q1[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[1]
+; CHECK-NEXT:    vmov.u8 r0, q0[9]
 ; CHECK-NEXT:    vmov.16 q1[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[2]
+; CHECK-NEXT:    vmov.u8 r0, q0[10]
 ; CHECK-NEXT:    vmov.16 q1[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[3]
+; CHECK-NEXT:    vmov.u8 r0, q0[11]
 ; CHECK-NEXT:    vmov.16 q1[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[4]
+; CHECK-NEXT:    vmov.u8 r0, q0[12]
 ; CHECK-NEXT:    vmov.16 q1[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[5]
+; CHECK-NEXT:    vmov.u8 r0, q0[13]
 ; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[6]
+; CHECK-NEXT:    vmov.u8 r0, q0[14]
 ; CHECK-NEXT:    vmov.16 q1[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[7]
-; CHECK-NEXT:    vmovlb.u8 q3, q2
-; CHECK-NEXT:    vmov.i32 q2, #0x0
+; CHECK-NEXT:    vmov.u8 r0, q0[15]
 ; CHECK-NEXT:    vmov.16 q1[7], r0
-; CHECK-NEXT:    vpsel q3, q3, q2
-; CHECK-NEXT:    vcmp.i16 ne, q4, zr
 ; CHECK-NEXT:    vmovlb.u8 q0, q1
-; CHECK-NEXT:    vpsel q0, q0, q2
-; CHECK-NEXT:    vadd.i16 q0, q0, q3
-; CHECK-NEXT:    vaddv.u16 r0, q0
+; CHECK-NEXT:    vpt.i16 ne, q3, zr
+; CHECK-NEXT:    vaddt.i16 q2, q2, q0
+; CHECK-NEXT:    vaddv.u16 r0, q2
 ; CHECK-NEXT:    uxth r0, r0
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
@@ -1154,87 +1143,83 @@ entry:
 define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-LABEL: add_v16i8_v16i16_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vcmp.i8 eq, q1, zr
 ; CHECK-NEXT:    vmov.i8 q1, #0x0
 ; CHECK-NEXT:    vmov.i8 q2, #0xff
+; CHECK-NEXT:    vmov.i32 q3, #0x0
 ; CHECK-NEXT:    vpsel q1, q2, q1
-; CHECK-NEXT:    vmov.u8 r0, q1[8]
+; CHECK-NEXT:    vmov.u8 r0, q1[0]
 ; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[9]
+; CHECK-NEXT:    vmov.u8 r0, q1[1]
 ; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[10]
+; CHECK-NEXT:    vmov.u8 r0, q1[2]
 ; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[11]
+; CHECK-NEXT:    vmov.u8 r0, q1[3]
 ; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[12]
+; CHECK-NEXT:    vmov.u8 r0, q1[4]
 ; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[13]
+; CHECK-NEXT:    vmov.u8 r0, q1[5]
 ; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[14]
+; CHECK-NEXT:    vmov.u8 r0, q1[6]
 ; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[15]
+; CHECK-NEXT:    vmov.u8 r0, q1[7]
 ; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[8]
+; CHECK-NEXT:    vmov.u8 r0, q0[0]
 ; CHECK-NEXT:    vcmp.i16 ne, q2, zr
 ; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[9]
+; CHECK-NEXT:    vmov.u8 r0, q0[1]
 ; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[10]
+; CHECK-NEXT:    vmov.u8 r0, q0[2]
 ; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[11]
+; CHECK-NEXT:    vmov.u8 r0, q0[3]
 ; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[12]
+; CHECK-NEXT:    vmov.u8 r0, q0[4]
 ; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[13]
+; CHECK-NEXT:    vmov.u8 r0, q0[5]
 ; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[14]
+; CHECK-NEXT:    vmov.u8 r0, q0[6]
 ; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[15]
+; CHECK-NEXT:    vmov.u8 r0, q0[7]
 ; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[0]
-; CHECK-NEXT:    vmov.16 q4[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[1]
-; CHECK-NEXT:    vmov.16 q4[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[2]
-; CHECK-NEXT:    vmov.16 q4[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[3]
-; CHECK-NEXT:    vmov.16 q4[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[4]
-; CHECK-NEXT:    vmov.16 q4[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[5]
-; CHECK-NEXT:    vmov.16 q4[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[6]
-; CHECK-NEXT:    vmov.16 q4[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[7]
-; CHECK-NEXT:    vmov.16 q4[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[0]
+; CHECK-NEXT:    vmov.u8 r0, q1[8]
+; CHECK-NEXT:    vmovlb.s8 q2, q2
+; CHECK-NEXT:    vpsel q2, q2, q3
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[9]
+; CHECK-NEXT:    vmov.16 q3[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[10]
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[11]
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[12]
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[13]
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[14]
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[15]
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[8]
 ; CHECK-NEXT:    vmov.16 q1[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[1]
+; CHECK-NEXT:    vmov.u8 r0, q0[9]
 ; CHECK-NEXT:    vmov.16 q1[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[2]
+; CHECK-NEXT:    vmov.u8 r0, q0[10]
 ; CHECK-NEXT:    vmov.16 q1[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[3]
+; CHECK-NEXT:    vmov.u8 r0, q0[11]
 ; CHECK-NEXT:    vmov.16 q1[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[4]
+; CHECK-NEXT:    vmov.u8 r0, q0[12]
 ; CHECK-NEXT:    vmov.16 q1[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[5]
+; CHECK-NEXT:    vmov.u8 r0, q0[13]
 ; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[6]
+; CHECK-NEXT:    vmov.u8 r0, q0[14]
 ; CHECK-NEXT:    vmov.16 q1[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[7]
-; CHECK-NEXT:    vmovlb.s8 q3, q2
-; CHECK-NEXT:    vmov.i32 q2, #0x0
+; CHECK-NEXT:    vmov.u8 r0, q0[15]
 ; CHECK-NEXT:    vmov.16 q1[7], r0
-; CHECK-NEXT:    vpsel q3, q3, q2
-; CHECK-NEXT:    vcmp.i16 ne, q4, zr
 ; CHECK-NEXT:    vmovlb.s8 q0, q1
-; CHECK-NEXT:    vpsel q0, q0, q2
-; CHECK-NEXT:    vadd.i16 q0, q0, q3
-; CHECK-NEXT:    vaddv.u16 r0, q0
+; CHECK-NEXT:    vpt.i16 ne, q3, zr
+; CHECK-NEXT:    vaddt.i16 q2, q2, q0
+; CHECK-NEXT:    vaddv.u16 r0, q2
 ; CHECK-NEXT:    sxth r0, r0
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
@@ -2226,55 +2211,54 @@ entry:
 define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, <8 x i16> %b, i32 %a) {
 ; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.i8 q2, #0x0
 ; CHECK-NEXT:    vmov.i8 q3, #0xff
 ; CHECK-NEXT:    vcmp.i16 eq, q1, zr
+; CHECK-NEXT:    vmov.i32 q4, #0xffff
 ; CHECK-NEXT:    vpsel q1, q3, q2
-; CHECK-NEXT:    vmov.i32 q3, #0xffff
-; CHECK-NEXT:    vmov.u16 r1, q1[4]
+; CHECK-NEXT:    vmov.u16 r1, q1[0]
 ; CHECK-NEXT:    vmov.32 q2[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[5]
+; CHECK-NEXT:    vmov.u16 r1, q1[1]
 ; CHECK-NEXT:    vmov.32 q2[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[6]
+; CHECK-NEXT:    vmov.u16 r1, q1[2]
 ; CHECK-NEXT:    vmov.32 q2[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[7]
+; CHECK-NEXT:    vmov.u16 r1, q1[3]
 ; CHECK-NEXT:    vmov.32 q2[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    vmov.32 q5[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    vmov.32 q5[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
+; CHECK-NEXT:    vmov.u16 r1, q0[0]
+; CHECK-NEXT:    vmov.32 q3[0], r1
+; CHECK-NEXT:    vmov.u16 r1, q0[1]
+; CHECK-NEXT:    vmov.32 q3[1], r1
+; CHECK-NEXT:    vmov.u16 r1, q0[2]
+; CHECK-NEXT:    vmov.32 q3[2], r1
+; CHECK-NEXT:    vmov.u16 r1, q0[3]
 ; CHECK-NEXT:    vcmp.i32 ne, q2, zr
-; CHECK-NEXT:    vmov.32 q5[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    vmov.32 q3[3], r1
 ; CHECK-NEXT:    vmov.i32 q2, #0x0
-; CHECK-NEXT:    vmov.32 q5[3], r1
-; CHECK-NEXT:    vmov q4, q2
-; CHECK-NEXT:    vmov.u16 r1, q1[0]
+; CHECK-NEXT:    vmov.u16 r1, q1[4]
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vandt q4, q5, q3
-; CHECK-NEXT:    vmov.32 q5[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[1]
-; CHECK-NEXT:    vmov.32 q5[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[2]
-; CHECK-NEXT:    vmov.32 q5[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[3]
-; CHECK-NEXT:    vmov.32 q5[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
+; CHECK-NEXT:    vandt q2, q3, q4
+; CHECK-NEXT:    vmov.32 q3[0], r1
+; CHECK-NEXT:    vmov.u16 r1, q1[5]
+; CHECK-NEXT:    vmov.32 q3[1], r1
+; CHECK-NEXT:    vmov.u16 r1, q1[6]
+; CHECK-NEXT:    vmov.32 q3[2], r1
+; CHECK-NEXT:    vmov.u16 r1, q1[7]
+; CHECK-NEXT:    vmov.32 q3[3], r1
+; CHECK-NEXT:    vmov.u16 r1, q0[4]
 ; CHECK-NEXT:    vmov.32 q1[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
+; CHECK-NEXT:    vmov.u16 r1, q0[5]
 ; CHECK-NEXT:    vmov.32 q1[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
+; CHECK-NEXT:    vmov.u16 r1, q0[6]
 ; CHECK-NEXT:    vmov.32 q1[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
+; CHECK-NEXT:    vmov.u16 r1, q0[7]
 ; CHECK-NEXT:    vmov.32 q1[3], r1
-; CHECK-NEXT:    vpt.i32 ne, q5, zr
-; CHECK-NEXT:    vandt q2, q1, q3
-; CHECK-NEXT:    vadd.i32 q0, q2, q4
-; CHECK-NEXT:    vaddva.u32 r0, q0
-; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    vmovlb.u16 q0, q1
+; CHECK-NEXT:    vpt.i32 ne, q3, zr
+; CHECK-NEXT:    vaddt.i32 q2, q2, q0
+; CHECK-NEXT:    vaddva.u32 r0, q2
+; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <8 x i16> %b, zeroinitializer
@@ -2288,54 +2272,50 @@ entry:
 define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, <8 x i16> %b, i32 %a) {
 ; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vmov.i8 q2, #0x0
 ; CHECK-NEXT:    vmov.i8 q3, #0xff
 ; CHECK-NEXT:    vcmp.i16 eq, q1, zr
 ; CHECK-NEXT:    vpsel q1, q3, q2
-; CHECK-NEXT:    vmov.u16 r1, q1[4]
+; CHECK-NEXT:    vmov.i32 q3, #0x0
+; CHECK-NEXT:    vmov.u16 r1, q1[0]
 ; CHECK-NEXT:    vmov.32 q2[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[5]
+; CHECK-NEXT:    vmov.u16 r1, q1[1]
 ; CHECK-NEXT:    vmov.32 q2[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[6]
+; CHECK-NEXT:    vmov.u16 r1, q1[2]
 ; CHECK-NEXT:    vmov.32 q2[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[7]
+; CHECK-NEXT:    vmov.u16 r1, q1[3]
 ; CHECK-NEXT:    vmov.32 q2[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
+; CHECK-NEXT:    vmov.u16 r1, q0[0]
 ; CHECK-NEXT:    vcmp.i32 ne, q2, zr
 ; CHECK-NEXT:    vmov.32 q2[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
+; CHECK-NEXT:    vmov.u16 r1, q0[1]
 ; CHECK-NEXT:    vmov.32 q2[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
+; CHECK-NEXT:    vmov.u16 r1, q0[2]
 ; CHECK-NEXT:    vmov.32 q2[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    vmov.u16 r1, q0[3]
 ; CHECK-NEXT:    vmov.32 q2[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[0]
-; CHECK-NEXT:    vmov.32 q4[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[1]
-; CHECK-NEXT:    vmov.32 q4[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[2]
-; CHECK-NEXT:    vmov.32 q4[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[3]
-; CHECK-NEXT:    vmov.32 q4[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
+; CHECK-NEXT:    vmov.u16 r1, q1[4]
+; CHECK-NEXT:    vmovlb.s16 q2, q2
+; CHECK-NEXT:    vpsel q2, q2, q3
+; CHECK-NEXT:    vmov.32 q3[0], r1
+; CHECK-NEXT:    vmov.u16 r1, q1[5]
+; CHECK-NEXT:    vmov.32 q3[1], r1
+; CHECK-NEXT:    vmov.u16 r1, q1[6]
+; CHECK-NEXT:    vmov.32 q3[2], r1
+; CHECK-NEXT:    vmov.u16 r1, q1[7]
+; CHECK-NEXT:    vmov.32 q3[3], r1
+; CHECK-NEXT:    vmov.u16 r1, q0[4]
 ; CHECK-NEXT:    vmov.32 q1[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
+; CHECK-NEXT:    vmov.u16 r1, q0[5]
 ; CHECK-NEXT:    vmov.32 q1[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
+; CHECK-NEXT:    vmov.u16 r1, q0[6]
 ; CHECK-NEXT:    vmov.32 q1[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    vmovlb.s16 q3, q2
-; CHECK-NEXT:    vmov.i32 q2, #0x0
+; CHECK-NEXT:    vmov.u16 r1, q0[7]
 ; CHECK-NEXT:    vmov.32 q1[3], r1
-; CHECK-NEXT:    vpsel q3, q3, q2
-; CHECK-NEXT:    vcmp.i32 ne, q4, zr
 ; CHECK-NEXT:    vmovlb.s16 q0, q1
-; CHECK-NEXT:    vpsel q0, q0, q2
-; CHECK-NEXT:    vadd.i32 q0, q0, q3
-; CHECK-NEXT:    vaddva.u32 r0, q0
-; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    vpt.i32 ne, q3, zr
+; CHECK-NEXT:    vaddt.i32 q2, q2, q0
+; CHECK-NEXT:    vaddva.u32 r0, q2
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <8 x i16> %b, zeroinitializer
@@ -2779,129 +2759,128 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT:    vmov q2, q0
 ; CHECK-NEXT:    vmov.i8 q0, #0x0
 ; CHECK-NEXT:    vcmp.i8 eq, q1, zr
-; CHECK-NEXT:    vmov.i8 q4, #0xff
-; CHECK-NEXT:    vpsel q6, q4, q0
+; CHECK-NEXT:    vmov.i8 q5, #0xff
+; CHECK-NEXT:    vpsel q7, q5, q0
 ; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vmov.u8 r1, q6[8]
-; CHECK-NEXT:    vmov.i32 q5, #0xff
+; CHECK-NEXT:    vmov.u8 r1, q7[0]
+; CHECK-NEXT:    vmov.i32 q6, #0xff
 ; CHECK-NEXT:    vmov.16 q1[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[9]
+; CHECK-NEXT:    vmov.u8 r1, q7[1]
 ; CHECK-NEXT:    vmov.16 q1[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[10]
+; CHECK-NEXT:    vmov.u8 r1, q7[2]
 ; CHECK-NEXT:    vmov.16 q1[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[11]
+; CHECK-NEXT:    vmov.u8 r1, q7[3]
 ; CHECK-NEXT:    vmov.16 q1[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[12]
+; CHECK-NEXT:    vmov.u8 r1, q7[4]
 ; CHECK-NEXT:    vmov.16 q1[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[13]
+; CHECK-NEXT:    vmov.u8 r1, q7[5]
 ; CHECK-NEXT:    vmov.16 q1[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[14]
+; CHECK-NEXT:    vmov.u8 r1, q7[6]
 ; CHECK-NEXT:    vmov.16 q1[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[15]
+; CHECK-NEXT:    vmov.u8 r1, q7[7]
 ; CHECK-NEXT:    vmov.16 q1[7], r1
 ; CHECK-NEXT:    vcmp.i16 ne, q1, zr
 ; CHECK-NEXT:    vmov.i32 q1, #0x0
-; CHECK-NEXT:    vpsel q3, q4, q0
-; CHECK-NEXT:    vmov q7, q1
-; CHECK-NEXT:    vmov.u16 r1, q3[4]
+; CHECK-NEXT:    vpsel q4, q5, q0
+; CHECK-NEXT:    vmov q3, q1
+; CHECK-NEXT:    vmov.u16 r1, q4[4]
 ; CHECK-NEXT:    vmov.32 q0[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q3[5]
+; CHECK-NEXT:    vmov.u16 r1, q4[5]
 ; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q3[6]
+; CHECK-NEXT:    vmov.u16 r1, q4[6]
 ; CHECK-NEXT:    vmov.32 q0[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q3[7]
+; CHECK-NEXT:    vmov.u16 r1, q4[7]
 ; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[12]
+; CHECK-NEXT:    vmov.u8 r1, q2[4]
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
 ; CHECK-NEXT:    vmov.32 q0[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[13]
+; CHECK-NEXT:    vmov.u8 r1, q2[5]
 ; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[14]
+; CHECK-NEXT:    vmov.u8 r1, q2[6]
 ; CHECK-NEXT:    vmov.32 q0[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[15]
+; CHECK-NEXT:    vmov.u8 r1, q2[7]
 ; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[0]
+; CHECK-NEXT:    vmov.u8 r1, q7[8]
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vandt q7, q0, q5
+; CHECK-NEXT:    vandt q3, q0, q6
 ; CHECK-NEXT:    vmov.16 q0[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[1]
+; CHECK-NEXT:    vmov.u8 r1, q7[9]
 ; CHECK-NEXT:    vmov.16 q0[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[2]
+; CHECK-NEXT:    vmov.u8 r1, q7[10]
 ; CHECK-NEXT:    vmov.16 q0[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[3]
+; CHECK-NEXT:    vmov.u8 r1, q7[11]
 ; CHECK-NEXT:    vmov.16 q0[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[4]
+; CHECK-NEXT:    vmov.u8 r1, q7[12]
 ; CHECK-NEXT:    vmov.16 q0[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[5]
+; CHECK-NEXT:    vmov.u8 r1, q7[13]
 ; CHECK-NEXT:    vmov.16 q0[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[6]
+; CHECK-NEXT:    vmov.u8 r1, q7[14]
 ; CHECK-NEXT:    vmov.16 q0[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[7]
+; CHECK-NEXT:    vmov.u8 r1, q7[15]
 ; CHECK-NEXT:    vmov.16 q0[7], r1
-; CHECK-NEXT:    vmov q6, q1
 ; CHECK-NEXT:    vcmp.i16 ne, q0, zr
 ; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vpsel q4, q4, q0
-; CHECK-NEXT:    vmov.u16 r1, q4[4]
+; CHECK-NEXT:    vpsel q5, q5, q0
+; CHECK-NEXT:    vmov.u16 r1, q5[4]
 ; CHECK-NEXT:    vmov.32 q0[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q4[5]
+; CHECK-NEXT:    vmov.u16 r1, q5[5]
 ; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q4[6]
+; CHECK-NEXT:    vmov.u16 r1, q5[6]
 ; CHECK-NEXT:    vmov.32 q0[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q4[7]
+; CHECK-NEXT:    vmov.u16 r1, q5[7]
 ; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[4]
+; CHECK-NEXT:    vmov.u8 r1, q2[12]
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
 ; CHECK-NEXT:    vmov.32 q0[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[5]
+; CHECK-NEXT:    vmov.u8 r1, q2[13]
 ; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[6]
+; CHECK-NEXT:    vmov.u8 r1, q2[14]
 ; CHECK-NEXT:    vmov.32 q0[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[7]
+; CHECK-NEXT:    vmov.u8 r1, q2[15]
 ; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vandt q6, q0, q5
-; CHECK-NEXT:    vmov.u16 r1, q3[0]
-; CHECK-NEXT:    vadd.i32 q0, q6, q7
-; CHECK-NEXT:    vmov.32 q6[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q3[1]
-; CHECK-NEXT:    vmov.32 q6[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q3[2]
-; CHECK-NEXT:    vmov.32 q6[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q3[3]
-; CHECK-NEXT:    vmov.32 q6[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[8]
-; CHECK-NEXT:    vcmp.i32 ne, q6, zr
-; CHECK-NEXT:    vmov.32 q6[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[9]
-; CHECK-NEXT:    vmov q3, q1
-; CHECK-NEXT:    vmov.32 q6[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[10]
-; CHECK-NEXT:    vmov.32 q6[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[11]
-; CHECK-NEXT:    vmov.32 q6[3], r1
 ; CHECK-NEXT:    vmov.u16 r1, q4[0]
+; CHECK-NEXT:    vand q0, q0, q6
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vandt q3, q6, q5
-; CHECK-NEXT:    vmov.32 q6[0], r1
+; CHECK-NEXT:    vaddt.i32 q3, q3, q0
+; CHECK-NEXT:    vmov.32 q0[0], r1
 ; CHECK-NEXT:    vmov.u16 r1, q4[1]
-; CHECK-NEXT:    vmov.32 q6[1], r1
+; CHECK-NEXT:    vmov.32 q0[1], r1
 ; CHECK-NEXT:    vmov.u16 r1, q4[2]
-; CHECK-NEXT:    vmov.32 q6[2], r1
+; CHECK-NEXT:    vmov.32 q0[2], r1
 ; CHECK-NEXT:    vmov.u16 r1, q4[3]
-; CHECK-NEXT:    vmov.32 q6[3], r1
+; CHECK-NEXT:    vmov.32 q0[3], r1
 ; CHECK-NEXT:    vmov.u8 r1, q2[0]
-; CHECK-NEXT:    vmov.32 q4[0], r1
+; CHECK-NEXT:    vcmp.i32 ne, q0, zr
+; CHECK-NEXT:    vmov.32 q0[0], r1
 ; CHECK-NEXT:    vmov.u8 r1, q2[1]
-; CHECK-NEXT:    vmov.32 q4[1], r1
+; CHECK-NEXT:    vmov.32 q0[1], r1
 ; CHECK-NEXT:    vmov.u8 r1, q2[2]
-; CHECK-NEXT:    vmov.32 q4[2], r1
+; CHECK-NEXT:    vmov.32 q0[2], r1
 ; CHECK-NEXT:    vmov.u8 r1, q2[3]
-; CHECK-NEXT:    vmov.32 q4[3], r1
-; CHECK-NEXT:    vpt.i32 ne, q6, zr
-; CHECK-NEXT:    vandt q1, q4, q5
-; CHECK-NEXT:    vadd.i32 q1, q1, q3
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vmov.32 q0[3], r1
+; CHECK-NEXT:    vmov.u16 r1, q5[0]
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vandt q1, q0, q6
+; CHECK-NEXT:    vmov.32 q0[0], r1
+; CHECK-NEXT:    vmov.u16 r1, q5[1]
+; CHECK-NEXT:    vmov.32 q0[1], r1
+; CHECK-NEXT:    vmov.u16 r1, q5[2]
+; CHECK-NEXT:    vmov.32 q0[2], r1
+; CHECK-NEXT:    vmov.u16 r1, q5[3]
+; CHECK-NEXT:    vmov.32 q0[3], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[8]
+; CHECK-NEXT:    vcmp.i32 ne, q0, zr
+; CHECK-NEXT:    vmov.32 q0[0], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[9]
+; CHECK-NEXT:    vmov.32 q0[1], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[10]
+; CHECK-NEXT:    vmov.32 q0[2], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[11]
+; CHECK-NEXT:    vmov.32 q0[3], r1
+; CHECK-NEXT:    vand q0, q0, q6
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vaddt.i32 q1, q1, q0
+; CHECK-NEXT:    vadd.i32 q0, q1, q3
 ; CHECK-NEXT:    vaddva.u32 r0, q0
 ; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
@@ -2922,26 +2901,27 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vcmp.i8 eq, q1, zr
 ; CHECK-NEXT:    vmov.i8 q3, #0x0
-; CHECK-NEXT:    vmov.i8 q4, #0xff
-; CHECK-NEXT:    vpsel q5, q4, q3
-; CHECK-NEXT:    vmov.u8 r1, q5[8]
+; CHECK-NEXT:    vmov.i8 q5, #0xff
+; CHECK-NEXT:    vmov.i32 q4, #0x0
+; CHECK-NEXT:    vpsel q6, q5, q3
+; CHECK-NEXT:    vmov.u8 r1, q6[0]
 ; CHECK-NEXT:    vmov.16 q1[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q5[9]
+; CHECK-NEXT:    vmov.u8 r1, q6[1]
 ; CHECK-NEXT:    vmov.16 q1[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q5[10]
+; CHECK-NEXT:    vmov.u8 r1, q6[2]
 ; CHECK-NEXT:    vmov.16 q1[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q5[11]
+; CHECK-NEXT:    vmov.u8 r1, q6[3]
 ; CHECK-NEXT:    vmov.16 q1[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q5[12]
+; CHECK-NEXT:    vmov.u8 r1, q6[4]
 ; CHECK-NEXT:    vmov.16 q1[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q5[13]
+; CHECK-NEXT:    vmov.u8 r1, q6[5]
 ; CHECK-NEXT:    vmov.16 q1[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q5[14]
+; CHECK-NEXT:    vmov.u8 r1, q6[6]
 ; CHECK-NEXT:    vmov.16 q1[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q5[15]
+; CHECK-NEXT:    vmov.u8 r1, q6[7]
 ; CHECK-NEXT:    vmov.16 q1[7], r1
 ; CHECK-NEXT:    vcmp.i16 ne, q1, zr
-; CHECK-NEXT:    vpsel q2, q4, q3
+; CHECK-NEXT:    vpsel q2, q5, q3
 ; CHECK-NEXT:    vmov.u16 r1, q2[4]
 ; CHECK-NEXT:    vmov.32 q1[0], r1
 ; CHECK-NEXT:    vmov.u16 r1, q2[5]
@@ -2950,55 +2930,58 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT:    vmov.32 q1[2], r1
 ; CHECK-NEXT:    vmov.u16 r1, q2[7]
 ; CHECK-NEXT:    vmov.32 q1[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[12]
+; CHECK-NEXT:    vmov.u8 r1, q0[4]
 ; CHECK-NEXT:    vcmp.i32 ne, q1, zr
 ; CHECK-NEXT:    vmov.32 q1[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[13]
+; CHECK-NEXT:    vmov.u8 r1, q0[5]
 ; CHECK-NEXT:    vmov.32 q1[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[14]
+; CHECK-NEXT:    vmov.u8 r1, q0[6]
 ; CHECK-NEXT:    vmov.32 q1[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
+; CHECK-NEXT:    vmov.u8 r1, q0[7]
 ; CHECK-NEXT:    vmov.32 q1[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q5[0]
+; CHECK-NEXT:    vmov.u8 r1, q6[8]
 ; CHECK-NEXT:    vmov.16 q7[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q5[1]
+; CHECK-NEXT:    vmov.u8 r1, q6[9]
 ; CHECK-NEXT:    vmov.16 q7[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q5[2]
+; CHECK-NEXT:    vmov.u8 r1, q6[10]
 ; CHECK-NEXT:    vmov.16 q7[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q5[3]
+; CHECK-NEXT:    vmov.u8 r1, q6[11]
 ; CHECK-NEXT:    vmov.16 q7[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q5[4]
+; CHECK-NEXT:    vmov.u8 r1, q6[12]
 ; CHECK-NEXT:    vmov.16 q7[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q5[5]
+; CHECK-NEXT:    vmov.u8 r1, q6[13]
 ; CHECK-NEXT:    vmov.16 q7[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q5[6]
+; CHECK-NEXT:    vmov.u8 r1, q6[14]
 ; CHECK-NEXT:    vmovlb.s8 q1, q1
 ; CHECK-NEXT:    vmov.16 q7[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q5[7]
-; CHECK-NEXT:    vmovlb.s16 q6, q1
-; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    vmov.u8 r1, q6[15]
+; CHECK-NEXT:    vmovlb.s16 q1, q1
 ; CHECK-NEXT:    vmov.16 q7[7], r1
-; CHECK-NEXT:    vpsel q6, q6, q1
+; CHECK-NEXT:    vpsel q1, q1, q4
 ; CHECK-NEXT:    vcmp.i16 ne, q7, zr
-; CHECK-NEXT:    vpsel q3, q4, q3
+; CHECK-NEXT:    vpsel q3, q5, q3
 ; CHECK-NEXT:    vmov.u16 r1, q3[4]
-; CHECK-NEXT:    vmov.32 q4[0], r1
+; CHECK-NEXT:    vmov.32 q5[0], r1
 ; CHECK-NEXT:    vmov.u16 r1, q3[5]
-; CHECK-NEXT:    vmov.32 q4[1], r1
+; CHECK-NEXT:    vmov.32 q5[1], r1
 ; CHECK-NEXT:    vmov.u16 r1, q3[6]
-; CHECK-NEXT:    vmov.32 q4[2], r1
+; CHECK-NEXT:    vmov.32 q5[2], r1
 ; CHECK-NEXT:    vmov.u16 r1, q3[7]
-; CHECK-NEXT:    vmov.32 q4[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
-; CHECK-NEXT:    vcmp.i32 ne, q4, zr
-; CHECK-NEXT:    vmov.32 q4[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
-; CHECK-NEXT:    vmov.32 q4[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
-; CHECK-NEXT:    vmov.32 q4[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    vmov.32 q4[3], r1
+; CHECK-NEXT:    vmov.32 q5[3], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[12]
+; CHECK-NEXT:    vcmp.i32 ne, q5, zr
+; CHECK-NEXT:    vmov.32 q5[0], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[13]
+; CHECK-NEXT:    vmov.32 q5[1], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[14]
+; CHECK-NEXT:    vmov.32 q5[2], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[15]
+; CHECK-NEXT:    vmov.32 q5[3], r1
 ; CHECK-NEXT:    vmov.u16 r1, q2[0]
+; CHECK-NEXT:    vmovlb.s8 q5, q5
+; CHECK-NEXT:    vmovlb.s16 q5, q5
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vaddt.i32 q1, q1, q5
 ; CHECK-NEXT:    vmov.32 q5[0], r1
 ; CHECK-NEXT:    vmov.u16 r1, q2[1]
 ; CHECK-NEXT:    vmov.32 q5[1], r1
@@ -3006,44 +2989,39 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT:    vmov.32 q5[2], r1
 ; CHECK-NEXT:    vmov.u16 r1, q2[3]
 ; CHECK-NEXT:    vmov.32 q5[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[8]
+; CHECK-NEXT:    vmov.u8 r1, q0[0]
 ; CHECK-NEXT:    vmov.32 q2[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
+; CHECK-NEXT:    vmov.u8 r1, q0[1]
 ; CHECK-NEXT:    vmov.32 q2[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
-; CHECK-NEXT:    vmovlb.s8 q4, q4
+; CHECK-NEXT:    vmov.u8 r1, q0[2]
 ; CHECK-NEXT:    vmov.32 q2[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
-; CHECK-NEXT:    vmovlb.s16 q4, q4
+; CHECK-NEXT:    vmov.u8 r1, q0[3]
 ; CHECK-NEXT:    vmov.32 q2[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q3[0]
-; CHECK-NEXT:    vpsel q4, q4, q1
 ; CHECK-NEXT:    vcmp.i32 ne, q5, zr
-; CHECK-NEXT:    vmov.32 q5[0], r1
+; CHECK-NEXT:    vmovlb.s8 q2, q2
+; CHECK-NEXT:    vmov.u16 r1, q3[0]
+; CHECK-NEXT:    vmovlb.s16 q2, q2
+; CHECK-NEXT:    vpsel q2, q2, q4
+; CHECK-NEXT:    vmov.32 q4[0], r1
 ; CHECK-NEXT:    vmov.u16 r1, q3[1]
-; CHECK-NEXT:    vmov.32 q5[1], r1
+; CHECK-NEXT:    vmov.32 q4[1], r1
 ; CHECK-NEXT:    vmov.u16 r1, q3[2]
-; CHECK-NEXT:    vmov.32 q5[2], r1
+; CHECK-NEXT:    vmov.32 q4[2], r1
 ; CHECK-NEXT:    vmov.u16 r1, q3[3]
-; CHECK-NEXT:    vmov.32 q5[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
+; CHECK-NEXT:    vmov.32 q4[3], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[8]
 ; CHECK-NEXT:    vmov.32 q3[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
+; CHECK-NEXT:    vmov.u8 r1, q0[9]
 ; CHECK-NEXT:    vmov.32 q3[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
+; CHECK-NEXT:    vmov.u8 r1, q0[10]
 ; CHECK-NEXT:    vmov.32 q3[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    vmovlb.s8 q2, q2
+; CHECK-NEXT:    vmov.u8 r1, q0[11]
 ; CHECK-NEXT:    vmov.32 q3[3], r1
-; CHECK-NEXT:    vmovlb.s16 q2, q2
 ; CHECK-NEXT:    vmovlb.s8 q0, q3
-; CHECK-NEXT:    vpsel q2, q2, q1
-; CHECK-NEXT:    vcmp.i32 ne, q5, zr
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vadd.i32 q4, q4, q6
-; CHECK-NEXT:    vpsel q0, q0, q1
-; CHECK-NEXT:    vadd.i32 q0, q0, q2
-; CHECK-NEXT:    vadd.i32 q0, q0, q4
+; CHECK-NEXT:    vpt.i32 ne, q4, zr
+; CHECK-NEXT:    vaddt.i32 q2, q2, q0
+; CHECK-NEXT:    vadd.i32 q0, q2, q1
 ; CHECK-NEXT:    vaddva.u32 r0, q0
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
@@ -3100,87 +3078,83 @@ entry:
 define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, <16 x i8> %b, i16 %a) {
 ; CHECK-LABEL: add_v16i8_v16i16_acc_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vcmp.i8 eq, q1, zr
 ; CHECK-NEXT:    vmov.i8 q1, #0x0
 ; CHECK-NEXT:    vmov.i8 q2, #0xff
+; CHECK-NEXT:    vmov.i32 q3, #0x0
 ; CHECK-NEXT:    vpsel q1, q2, q1
-; CHECK-NEXT:    vmov.u8 r1, q1[8]
+; CHECK-NEXT:    vmov.u8 r1, q1[0]
 ; CHECK-NEXT:    vmov.16 q2[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[9]
+; CHECK-NEXT:    vmov.u8 r1, q1[1]
 ; CHECK-NEXT:    vmov.16 q2[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[10]
+; CHECK-NEXT:    vmov.u8 r1, q1[2]
 ; CHECK-NEXT:    vmov.16 q2[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[11]
+; CHECK-NEXT:    vmov.u8 r1, q1[3]
 ; CHECK-NEXT:    vmov.16 q2[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[12]
+; CHECK-NEXT:    vmov.u8 r1, q1[4]
 ; CHECK-NEXT:    vmov.16 q2[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[13]
+; CHECK-NEXT:    vmov.u8 r1, q1[5]
 ; CHECK-NEXT:    vmov.16 q2[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[14]
+; CHECK-NEXT:    vmov.u8 r1, q1[6]
 ; CHECK-NEXT:    vmov.16 q2[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[15]
+; CHECK-NEXT:    vmov.u8 r1, q1[7]
 ; CHECK-NEXT:    vmov.16 q2[7], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[8]
+; CHECK-NEXT:    vmov.u8 r1, q0[0]
 ; CHECK-NEXT:    vcmp.i16 ne, q2, zr
 ; CHECK-NEXT:    vmov.16 q2[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
+; CHECK-NEXT:    vmov.u8 r1, q0[1]
 ; CHECK-NEXT:    vmov.16 q2[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
+; CHECK-NEXT:    vmov.u8 r1, q0[2]
 ; CHECK-NEXT:    vmov.16 q2[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
+; CHECK-NEXT:    vmov.u8 r1, q0[3]
 ; CHECK-NEXT:    vmov.16 q2[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[12]
+; CHECK-NEXT:    vmov.u8 r1, q0[4]
 ; CHECK-NEXT:    vmov.16 q2[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[13]
+; CHECK-NEXT:    vmov.u8 r1, q0[5]
 ; CHECK-NEXT:    vmov.16 q2[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[14]
+; CHECK-NEXT:    vmov.u8 r1, q0[6]
 ; CHECK-NEXT:    vmov.16 q2[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
+; CHECK-NEXT:    vmov.u8 r1, q0[7]
 ; CHECK-NEXT:    vmov.16 q2[7], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[0]
-; CHECK-NEXT:    vmov.16 q4[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[1]
-; CHECK-NEXT:    vmov.16 q4[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[2]
-; CHECK-NEXT:    vmov.16 q4[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[3]
-; CHECK-NEXT:    vmov.16 q4[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[4]
-; CHECK-NEXT:    vmov.16 q4[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[5]
-; CHECK-NEXT:    vmov.16 q4[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[6]
-; CHECK-NEXT:    vmov.16 q4[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[7]
-; CHECK-NEXT:    vmov.16 q4[7], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
+; CHECK-NEXT:    vmov.u8 r1, q1[8]
+; CHECK-NEXT:    vmovlb.u8 q2, q2
+; CHECK-NEXT:    vpsel q2, q2, q3
+; CHECK-NEXT:    vmov.16 q3[0], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[9]
+; CHECK-NEXT:    vmov.16 q3[1], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[10]
+; CHECK-NEXT:    vmov.16 q3[2], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[11]
+; CHECK-NEXT:    vmov.16 q3[3], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[12]
+; CHECK-NEXT:    vmov.16 q3[4], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[13]
+; CHECK-NEXT:    vmov.16 q3[5], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[14]
+; CHECK-NEXT:    vmov.16 q3[6], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[15]
+; CHECK-NEXT:    vmov.16 q3[7], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[8]
 ; CHECK-NEXT:    vmov.16 q1[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
+; CHECK-NEXT:    vmov.u8 r1, q0[9]
 ; CHECK-NEXT:    vmov.16 q1[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
+; CHECK-NEXT:    vmov.u8 r1, q0[10]
 ; CHECK-NEXT:    vmov.16 q1[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
+; CHECK-NEXT:    vmov.u8 r1, q0[11]
 ; CHECK-NEXT:    vmov.16 q1[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
+; CHECK-NEXT:    vmov.u8 r1, q0[12]
 ; CHECK-NEXT:    vmov.16 q1[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
+; CHECK-NEXT:    vmov.u8 r1, q0[13]
 ; CHECK-NEXT:    vmov.16 q1[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
+; CHECK-NEXT:    vmov.u8 r1, q0[14]
 ; CHECK-NEXT:    vmov.16 q1[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    vmovlb.u8 q3, q2
-; CHECK-NEXT:    vmov.i32 q2, #0x0
+; CHECK-NEXT:    vmov.u8 r1, q0[15]
 ; CHECK-NEXT:    vmov.16 q1[7], r1
-; CHECK-NEXT:    vpsel q3, q3, q2
-; CHECK-NEXT:    vcmp.i16 ne, q4, zr
 ; CHECK-NEXT:    vmovlb.u8 q0, q1
-; CHECK-NEXT:    vpsel q0, q0, q2
-; CHECK-NEXT:    vadd.i16 q0, q0, q3
-; CHECK-NEXT:    vaddva.u16 r0, q0
+; CHECK-NEXT:    vpt.i16 ne, q3, zr
+; CHECK-NEXT:    vaddt.i16 q2, q2, q0
+; CHECK-NEXT:    vaddva.u16 r0, q2
 ; CHECK-NEXT:    uxth r0, r0
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
@@ -3194,87 +3168,83 @@ entry:
 define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, <16 x i8> %b, i16 %a) {
 ; CHECK-LABEL: add_v16i8_v16i16_acc_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9}
-; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    vcmp.i8 eq, q1, zr
 ; CHECK-NEXT:    vmov.i8 q1, #0x0
 ; CHECK-NEXT:    vmov.i8 q2, #0xff
+; CHECK-NEXT:    vmov.i32 q3, #0x0
 ; CHECK-NEXT:    vpsel q1, q2, q1
-; CHECK-NEXT:    vmov.u8 r1, q1[8]
+; CHECK-NEXT:    vmov.u8 r1, q1[0]
 ; CHECK-NEXT:    vmov.16 q2[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[9]
+; CHECK-NEXT:    vmov.u8 r1, q1[1]
 ; CHECK-NEXT:    vmov.16 q2[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[10]
+; CHECK-NEXT:    vmov.u8 r1, q1[2]
 ; CHECK-NEXT:    vmov.16 q2[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[11]
+; CHECK-NEXT:    vmov.u8 r1, q1[3]
 ; CHECK-NEXT:    vmov.16 q2[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[12]
+; CHECK-NEXT:    vmov.u8 r1, q1[4]
 ; CHECK-NEXT:    vmov.16 q2[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[13]
+; CHECK-NEXT:    vmov.u8 r1, q1[5]
 ; CHECK-NEXT:    vmov.16 q2[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[14]
+; CHECK-NEXT:    vmov.u8 r1, q1[6]
 ; CHECK-NEXT:    vmov.16 q2[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[15]
+; CHECK-NEXT:    vmov.u8 r1, q1[7]
 ; CHECK-NEXT:    vmov.16 q2[7], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[8]
+; CHECK-NEXT:    vmov.u8 r1, q0[0]
 ; CHECK-NEXT:    vcmp.i16 ne, q2, zr
 ; CHECK-NEXT:    vmov.16 q2[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
+; CHECK-NEXT:    vmov.u8 r1, q0[1]
 ; CHECK-NEXT:    vmov.16 q2[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
+; CHECK-NEXT:    vmov.u8 r1, q0[2]
 ; CHECK-NEXT:    vmov.16 q2[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
+; CHECK-NEXT:    vmov.u8 r1, q0[3]
 ; CHECK-NEXT:    vmov.16 q2[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[12]
+; CHECK-NEXT:    vmov.u8 r1, q0[4]
 ; CHECK-NEXT:    vmov.16 q2[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[13]
+; CHECK-NEXT:    vmov.u8 r1, q0[5]
 ; CHECK-NEXT:    vmov.16 q2[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[14]
+; CHECK-NEXT:    vmov.u8 r1, q0[6]
 ; CHECK-NEXT:    vmov.16 q2[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
+; CHECK-NEXT:    vmov.u8 r1, q0[7]
 ; CHECK-NEXT:    vmov.16 q2[7], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[0]
-; CHECK-NEXT:    vmov.16 q4[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[1]
-; CHECK-NEXT:    vmov.16 q4[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[2]
-; CHECK-NEXT:    vmov.16 q4[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[3]
-; CHECK-NEXT:    vmov.16 q4[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[4]
-; CHECK-NEXT:    vmov.16 q4[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[5]
-; CHECK-NEXT:    vmov.16 q4[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[6]
-; CHECK-NEXT:    vmov.16 q4[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[7]
-; CHECK-NEXT:    vmov.16 q4[7], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
+; CHECK-NEXT:    vmov.u8 r1, q1[8]
+; CHECK-NEXT:    vmovlb.s8 q2, q2
+; CHECK-NEXT:    vpsel q2, q2, q3
+; CHECK-NEXT:    vmov.16 q3[0], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[9]
+; CHECK-NEXT:    vmov.16 q3[1], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[10]
+; CHECK-NEXT:    vmov.16 q3[2], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[11]
+; CHECK-NEXT:    vmov.16 q3[3], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[12]
+; CHECK-NEXT:    vmov.16 q3[4], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[13]
+; CHECK-NEXT:    vmov.16 q3[5], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[14]
+; CHECK-NEXT:    vmov.16 q3[6], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[15]
+; CHECK-NEXT:    vmov.16 q3[7], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[8]
 ; CHECK-NEXT:    vmov.16 q1[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
+; CHECK-NEXT:    vmov.u8 r1, q0[9]
 ; CHECK-NEXT:    vmov.16 q1[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
+; CHECK-NEXT:    vmov.u8 r1, q0[10]
 ; CHECK-NEXT:    vmov.16 q1[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
+; CHECK-NEXT:    vmov.u8 r1, q0[11]
 ; CHECK-NEXT:    vmov.16 q1[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
+; CHECK-NEXT:    vmov.u8 r1, q0[12]
 ; CHECK-NEXT:    vmov.16 q1[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
+; CHECK-NEXT:    vmov.u8 r1, q0[13]
 ; CHECK-NEXT:    vmov.16 q1[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
+; CHECK-NEXT:    vmov.u8 r1, q0[14]
 ; CHECK-NEXT:    vmov.16 q1[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    vmovlb.s8 q3, q2
-; CHECK-NEXT:    vmov.i32 q2, #0x0
+; CHECK-NEXT:    vmov.u8 r1, q0[15]
 ; CHECK-NEXT:    vmov.16 q1[7], r1
-; CHECK-NEXT:    vpsel q3, q3, q2
-; CHECK-NEXT:    vcmp.i16 ne, q4, zr
 ; CHECK-NEXT:    vmovlb.s8 q0, q1
-; CHECK-NEXT:    vpsel q0, q0, q2
-; CHECK-NEXT:    vadd.i16 q0, q0, q3
-; CHECK-NEXT:    vaddva.u16 r0, q0
+; CHECK-NEXT:    vpt.i16 ne, q3, zr
+; CHECK-NEXT:    vaddt.i16 q2, q2, q0
+; CHECK-NEXT:    vaddva.u16 r0, q2
 ; CHECK-NEXT:    sxth r0, r0
-; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
index e02f3f64dce8..decd0dc9b0dd 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
@@ -223,74 +223,71 @@ entry:
 define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
 ; CHECK-LABEL: add_v8i16_v8i32_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov.i8 q3, #0x0
 ; CHECK-NEXT:    vmov.i8 q4, #0xff
 ; CHECK-NEXT:    vcmp.i16 eq, q2, zr
 ; CHECK-NEXT:    vpsel q2, q4, q3
-; CHECK-NEXT:    vmov.u16 r0, q2[4]
+; CHECK-NEXT:    vmov.u16 r0, q2[0]
 ; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmov.u16 r0, q2[1]
 ; CHECK-NEXT:    vmov.32 q3[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[6]
+; CHECK-NEXT:    vmov.u16 r0, q2[2]
 ; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[7]
+; CHECK-NEXT:    vmov.u16 r0, q2[3]
 ; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
 ; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
 ; CHECK-NEXT:    vmov.32 q3[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
 ; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
 ; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmovlb.u16 q5, q3
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vmovlb.u16 q4, q3
 ; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
 ; CHECK-NEXT:    vmov.32 q3[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
 ; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
 ; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[0]
-; CHECK-NEXT:    vmovlb.u16 q6, q3
+; CHECK-NEXT:    vmov.u16 r0, q2[4]
+; CHECK-NEXT:    vmovlb.u16 q5, q3
 ; CHECK-NEXT:    vmov.i32 q3, #0x0
-; CHECK-NEXT:    vmov q4, q3
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmult.i32 q4, q6, q5
-; CHECK-NEXT:    vmov.32 q5[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[1]
-; CHECK-NEXT:    vmov.32 q5[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[2]
-; CHECK-NEXT:    vmov.32 q5[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[3]
-; CHECK-NEXT:    vmov.32 q5[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmovlb.u16 q1, q2
+; CHECK-NEXT:    vmult.i32 q3, q5, q4
+; CHECK-NEXT:    vmov.32 q4[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmov.32 q4[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[6]
+; CHECK-NEXT:    vmov.32 q4[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[7]
+; CHECK-NEXT:    vmov.32 q4[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[4]
 ; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
 ; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
 ; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
 ; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmovlb.u16 q0, q2
-; CHECK-NEXT:    vpt.i32 ne, q5, zr
-; CHECK-NEXT:    vmult.i32 q3, q0, q1
-; CHECK-NEXT:    vadd.i32 q0, q3, q4
-; CHECK-NEXT:    vaddv.u32 r0, q0
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmullb.u16 q0, q1, q2
+; CHECK-NEXT:    vpt.i32 ne, q4, zr
+; CHECK-NEXT:    vaddt.i32 q3, q3, q0
+; CHECK-NEXT:    vaddv.u32 r0, q3
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <8 x i16> %b, zeroinitializer
@@ -305,74 +302,71 @@ entry:
 define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
 ; CHECK-LABEL: add_v8i16_v8i32_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov.i8 q3, #0x0
 ; CHECK-NEXT:    vmov.i8 q4, #0xff
 ; CHECK-NEXT:    vcmp.i16 eq, q2, zr
 ; CHECK-NEXT:    vpsel q2, q4, q3
-; CHECK-NEXT:    vmov.u16 r0, q2[4]
+; CHECK-NEXT:    vmov.u16 r0, q2[0]
 ; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmov.u16 r0, q2[1]
 ; CHECK-NEXT:    vmov.32 q3[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[6]
+; CHECK-NEXT:    vmov.u16 r0, q2[2]
 ; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[7]
+; CHECK-NEXT:    vmov.u16 r0, q2[3]
 ; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[4]
+; CHECK-NEXT:    vmov.u16 r0, q1[0]
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
 ; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[5]
+; CHECK-NEXT:    vmov.u16 r0, q1[1]
 ; CHECK-NEXT:    vmov.32 q3[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[6]
+; CHECK-NEXT:    vmov.u16 r0, q1[2]
 ; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[7]
+; CHECK-NEXT:    vmov.u16 r0, q1[3]
 ; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[4]
-; CHECK-NEXT:    vmovlb.s16 q5, q3
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vmovlb.s16 q4, q3
 ; CHECK-NEXT:    vmov.32 q3[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
 ; CHECK-NEXT:    vmov.32 q3[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
 ; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
 ; CHECK-NEXT:    vmov.32 q3[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[0]
-; CHECK-NEXT:    vmovlb.s16 q6, q3
+; CHECK-NEXT:    vmov.u16 r0, q2[4]
+; CHECK-NEXT:    vmovlb.s16 q5, q3
 ; CHECK-NEXT:    vmov.i32 q3, #0x0
-; CHECK-NEXT:    vmov q4, q3
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmult.i32 q4, q6, q5
-; CHECK-NEXT:    vmov.32 q5[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[1]
-; CHECK-NEXT:    vmov.32 q5[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[2]
-; CHECK-NEXT:    vmov.32 q5[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q2[3]
-; CHECK-NEXT:    vmov.32 q5[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[0]
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[1]
-; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[2]
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q1[3]
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[0]
-; CHECK-NEXT:    vmovlb.s16 q1, q2
+; CHECK-NEXT:    vmult.i32 q3, q5, q4
+; CHECK-NEXT:    vmov.32 q4[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[5]
+; CHECK-NEXT:    vmov.32 q4[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[6]
+; CHECK-NEXT:    vmov.32 q4[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q2[7]
+; CHECK-NEXT:    vmov.32 q4[3], r0
+; CHECK-NEXT:    vmov.u16 r0, q1[4]
 ; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.u16 r0, q1[5]
 ; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.u16 r0, q1[6]
 ; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.u16 r0, q1[7]
 ; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmovlb.s16 q0, q2
-; CHECK-NEXT:    vpt.i32 ne, q5, zr
-; CHECK-NEXT:    vmult.i32 q3, q0, q1
-; CHECK-NEXT:    vadd.i32 q0, q3, q4
-; CHECK-NEXT:    vaddv.u32 r0, q0
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmullb.s16 q0, q1, q2
+; CHECK-NEXT:    vpt.i32 ne, q4, zr
+; CHECK-NEXT:    vaddt.i32 q3, q3, q0
+; CHECK-NEXT:    vaddv.u32 r0, q3
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <8 x i16> %b, zeroinitializer
@@ -908,180 +902,180 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    .pad #64
 ; CHECK-NEXT:    sub sp, #64
-; CHECK-NEXT:    vmov q3, q0
-; CHECK-NEXT:    vcmp.i8 eq, q2, zr
-; CHECK-NEXT:    vmov.i8 q2, #0xff
+; CHECK-NEXT:    vmov q3, q2
+; CHECK-NEXT:    vmov q4, q0
+; CHECK-NEXT:    vcmp.i8 eq, q3, zr
+; CHECK-NEXT:    vmov.i8 q3, #0xff
 ; CHECK-NEXT:    vmov.i8 q0, #0x0
-; CHECK-NEXT:    vpsel q7, q2, q0
-; CHECK-NEXT:    vmov q4, q2
-; CHECK-NEXT:    vmov.u8 r0, q7[8]
-; CHECK-NEXT:    vstrw.32 q2, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q7[9]
-; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q7[10]
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q7[11]
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q7[12]
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q7[13]
-; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q7[14]
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q7[15]
-; CHECK-NEXT:    vmov.16 q2[7], r0
+; CHECK-NEXT:    vmov q2, q1
+; CHECK-NEXT:    vpsel q1, q3, q0
+; CHECK-NEXT:    vmov q5, q3
+; CHECK-NEXT:    vmov.u8 r0, q1[0]
+; CHECK-NEXT:    vstrw.32 q3, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.16 q3[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[1]
+; CHECK-NEXT:    vmov.16 q3[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[2]
+; CHECK-NEXT:    vmov.16 q3[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[3]
+; CHECK-NEXT:    vmov.16 q3[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[4]
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[5]
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[6]
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[7]
+; CHECK-NEXT:    vmov.16 q3[7], r0
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vcmp.i16 ne, q2, zr
-; CHECK-NEXT:    vmov.i32 q2, #0xff
-; CHECK-NEXT:    vpsel q4, q4, q0
-; CHECK-NEXT:    vmov.u16 r0, q4[4]
+; CHECK-NEXT:    vcmp.i16 ne, q3, zr
+; CHECK-NEXT:    vmov.i32 q3, #0xff
+; CHECK-NEXT:    vpsel q5, q5, q0
+; CHECK-NEXT:    vstrw.32 q3, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.u16 r0, q5[4]
 ; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[5]
+; CHECK-NEXT:    vmov.u16 r0, q5[5]
 ; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[6]
+; CHECK-NEXT:    vmov.u16 r0, q5[6]
 ; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[7]
+; CHECK-NEXT:    vmov.u16 r0, q5[7]
 ; CHECK-NEXT:    vmov.32 q0[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[12]
+; CHECK-NEXT:    vmov.u8 r0, q2[4]
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
 ; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[13]
+; CHECK-NEXT:    vmov.u8 r0, q2[5]
 ; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[14]
+; CHECK-NEXT:    vmov.u8 r0, q2[6]
 ; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[15]
+; CHECK-NEXT:    vmov.u8 r0, q2[7]
 ; CHECK-NEXT:    vmov.32 q0[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[12]
+; CHECK-NEXT:    vmov.u8 r0, q4[4]
 ; CHECK-NEXT:    vmov.32 q6[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[13]
+; CHECK-NEXT:    vmov.u8 r0, q4[5]
 ; CHECK-NEXT:    vmov.32 q6[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[14]
+; CHECK-NEXT:    vmov.u8 r0, q4[6]
 ; CHECK-NEXT:    vmov.32 q6[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[15]
+; CHECK-NEXT:    vmov.u8 r0, q4[7]
 ; CHECK-NEXT:    vmov.32 q6[3], r0
-; CHECK-NEXT:    vand q5, q0, q2
-; CHECK-NEXT:    vand q0, q6, q2
-; CHECK-NEXT:    vmov.i32 q6, #0x0
-; CHECK-NEXT:    vmov.u8 r0, q7[0]
-; CHECK-NEXT:    vstrw.32 q6, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmult.i32 q6, q0, q5
+; CHECK-NEXT:    vmov.u8 r0, q1[8]
+; CHECK-NEXT:    vand q7, q0, q3
 ; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q7[1]
-; CHECK-NEXT:    vldrw.u32 q5, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.u8 r0, q1[9]
+; CHECK-NEXT:    vand q3, q6, q3
 ; CHECK-NEXT:    vmov.16 q0[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q7[2]
+; CHECK-NEXT:    vmov.u8 r0, q1[10]
 ; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q7[3]
+; CHECK-NEXT:    vmov.u8 r0, q1[11]
 ; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q7[4]
+; CHECK-NEXT:    vmov.u8 r0, q1[12]
 ; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q7[5]
+; CHECK-NEXT:    vmov.u8 r0, q1[13]
 ; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q7[6]
+; CHECK-NEXT:    vmov.u8 r0, q1[14]
 ; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q7[7]
+; CHECK-NEXT:    vmov.u8 r0, q1[15]
+; CHECK-NEXT:    vmov.i32 q6, #0x0
 ; CHECK-NEXT:    vmov.16 q0[7], r0
-; CHECK-NEXT:    vstrw.32 q6, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q6, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmult.i32 q6, q3, q7
 ; CHECK-NEXT:    vcmp.i16 ne, q0, zr
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q6, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vpsel q5, q5, q0
-; CHECK-NEXT:    vmov.u16 r0, q5[4]
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q5[5]
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q5[6]
-; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q5[7]
-; CHECK-NEXT:    vmov.32 q0[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[4]
-; CHECK-NEXT:    vcmp.i32 ne, q0, zr
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[5]
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[6]
-; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[7]
-; CHECK-NEXT:    vmov.32 q0[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[4]
-; CHECK-NEXT:    vmov.32 q7[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[5]
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q3, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vpsel q0, q1, q0
+; CHECK-NEXT:    vmov.u16 r0, q0[4]
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[5]
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[7]
+; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[12]
+; CHECK-NEXT:    vcmp.i32 ne, q1, zr
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[13]
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[14]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[15]
+; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[12]
+; CHECK-NEXT:    vmov.32 q7[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[13]
 ; CHECK-NEXT:    vmov.32 q7[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[6]
+; CHECK-NEXT:    vmov.u8 r0, q4[14]
 ; CHECK-NEXT:    vmov.32 q7[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[7]
+; CHECK-NEXT:    vmov.u8 r0, q4[15]
 ; CHECK-NEXT:    vmov.32 q7[3], r0
-; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vand q7, q7, q2
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmult.i32 q6, q7, q0
-; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov.u16 r0, q4[0]
-; CHECK-NEXT:    vadd.i32 q0, q6, q0
-; CHECK-NEXT:    vmov.32 q6[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[1]
-; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vmov.32 q6[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[2]
-; CHECK-NEXT:    vmov.32 q6[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[3]
-; CHECK-NEXT:    vmov.32 q6[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[8]
-; CHECK-NEXT:    vmov.32 q4[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[9]
-; CHECK-NEXT:    vmov.32 q4[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[10]
-; CHECK-NEXT:    vmov.32 q4[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[11]
-; CHECK-NEXT:    vmov.32 q4[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[8]
-; CHECK-NEXT:    vcmp.i32 ne, q6, zr
-; CHECK-NEXT:    vand q6, q4, q2
-; CHECK-NEXT:    vmov.32 q4[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[9]
-; CHECK-NEXT:    vmov.32 q4[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[10]
-; CHECK-NEXT:    vmov.32 q4[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[11]
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vmov.32 q4[3], r0
-; CHECK-NEXT:    vand q7, q4, q2
+; CHECK-NEXT:    vand q1, q1, q3
+; CHECK-NEXT:    vand q7, q7, q3
 ; CHECK-NEXT:    vmov.u16 r0, q5[0]
-; CHECK-NEXT:    vmov q4, q0
+; CHECK-NEXT:    vmul.i32 q1, q7, q1
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmult.i32 q4, q7, q6
-; CHECK-NEXT:    vmov.32 q6[0], r0
+; CHECK-NEXT:    vaddt.i32 q6, q6, q1
+; CHECK-NEXT:    vmov.32 q1[0], r0
 ; CHECK-NEXT:    vmov.u16 r0, q5[1]
-; CHECK-NEXT:    vmov.32 q6[1], r0
+; CHECK-NEXT:    vmov.32 q1[1], r0
 ; CHECK-NEXT:    vmov.u16 r0, q5[2]
-; CHECK-NEXT:    vmov.32 q6[2], r0
+; CHECK-NEXT:    vmov.32 q1[2], r0
 ; CHECK-NEXT:    vmov.u16 r0, q5[3]
-; CHECK-NEXT:    vmov.32 q6[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[0]
-; CHECK-NEXT:    vmov.32 q5[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[1]
-; CHECK-NEXT:    vmov.32 q5[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[2]
-; CHECK-NEXT:    vmov.32 q5[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[3]
-; CHECK-NEXT:    vmov.32 q5[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[0]
-; CHECK-NEXT:    vand q1, q5, q2
+; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[0]
+; CHECK-NEXT:    vcmp.i32 ne, q1, zr
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[1]
+; CHECK-NEXT:    vldrw.u32 q7, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[2]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[3]
+; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[0]
 ; CHECK-NEXT:    vmov.32 q5[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[1]
+; CHECK-NEXT:    vmov.u8 r0, q4[1]
 ; CHECK-NEXT:    vmov.32 q5[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[2]
+; CHECK-NEXT:    vmov.u8 r0, q4[2]
 ; CHECK-NEXT:    vmov.32 q5[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[3]
+; CHECK-NEXT:    vmov.u8 r0, q4[3]
 ; CHECK-NEXT:    vmov.32 q5[3], r0
-; CHECK-NEXT:    vand q2, q5, q2
-; CHECK-NEXT:    vpt.i32 ne, q6, zr
-; CHECK-NEXT:    vmult.i32 q0, q2, q1
-; CHECK-NEXT:    vadd.i32 q1, q0, q4
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vand q1, q1, q3
+; CHECK-NEXT:    vmov.u16 r0, q0[0]
+; CHECK-NEXT:    vand q5, q5, q3
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmult.i32 q7, q5, q1
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[2]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q0[3]
+; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[8]
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[9]
+; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[10]
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[11]
+; CHECK-NEXT:    vmov.32 q0[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[8]
+; CHECK-NEXT:    vcmp.i32 ne, q1, zr
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[9]
+; CHECK-NEXT:    vand q0, q0, q3
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[10]
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q4[11]
+; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vand q1, q1, q3
+; CHECK-NEXT:    vmul.i32 q0, q1, q0
+; CHECK-NEXT:    vmov q1, q7
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vaddt.i32 q1, q7, q0
+; CHECK-NEXT:    vadd.i32 q0, q1, q6
 ; CHECK-NEXT:    vaddv.u32 r0, q0
 ; CHECK-NEXT:    add sp, #64
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
@@ -1101,191 +1095,185 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #64
-; CHECK-NEXT:    sub sp, #64
+; CHECK-NEXT:    .pad #32
+; CHECK-NEXT:    sub sp, #32
 ; CHECK-NEXT:    vmov q3, q0
 ; CHECK-NEXT:    vcmp.i8 eq, q2, zr
-; CHECK-NEXT:    vmov.i8 q2, #0x0
-; CHECK-NEXT:    vmov.i8 q0, #0xff
-; CHECK-NEXT:    vpsel q6, q0, q2
+; CHECK-NEXT:    vmov.i8 q2, #0xff
+; CHECK-NEXT:    vmov.i8 q0, #0x0
+; CHECK-NEXT:    vpsel q7, q2, q0
 ; CHECK-NEXT:    vmov q4, q2
-; CHECK-NEXT:    vmov.u8 r0, q6[8]
-; CHECK-NEXT:    vstrw.32 q2, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov.u8 r0, q7[0]
+; CHECK-NEXT:    vstrw.32 q2, [sp] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[9]
+; CHECK-NEXT:    vmov.u8 r0, q7[1]
 ; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[10]
+; CHECK-NEXT:    vmov.u8 r0, q7[2]
 ; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[11]
+; CHECK-NEXT:    vmov.u8 r0, q7[3]
 ; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[12]
+; CHECK-NEXT:    vmov.u8 r0, q7[4]
 ; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[13]
+; CHECK-NEXT:    vmov.u8 r0, q7[5]
 ; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[14]
+; CHECK-NEXT:    vmov.u8 r0, q7[6]
 ; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[15]
+; CHECK-NEXT:    vmov.u8 r0, q7[7]
 ; CHECK-NEXT:    vmov.16 q2[7], r0
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT:    vcmp.i16 ne, q2, zr
-; CHECK-NEXT:    vpsel q7, q0, q4
-; CHECK-NEXT:    vmov.u16 r0, q7[4]
+; CHECK-NEXT:    vpsel q4, q4, q0
+; CHECK-NEXT:    vmov.u16 r0, q4[4]
 ; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q7[5]
+; CHECK-NEXT:    vmov.u16 r0, q4[5]
 ; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q7[6]
+; CHECK-NEXT:    vmov.u16 r0, q4[6]
 ; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q7[7]
+; CHECK-NEXT:    vmov.u16 r0, q4[7]
 ; CHECK-NEXT:    vmov.32 q0[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[12]
+; CHECK-NEXT:    vmov.u8 r0, q1[4]
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
 ; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[13]
+; CHECK-NEXT:    vmov.u8 r0, q1[5]
 ; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[14]
+; CHECK-NEXT:    vmov.u8 r0, q1[6]
 ; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[15]
+; CHECK-NEXT:    vmov.u8 r0, q1[7]
 ; CHECK-NEXT:    vmov.32 q0[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[12]
-; CHECK-NEXT:    vmov.32 q5[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[13]
-; CHECK-NEXT:    vmov.32 q5[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[14]
-; CHECK-NEXT:    vmov.32 q5[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[15]
-; CHECK-NEXT:    vmov.32 q5[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[4]
+; CHECK-NEXT:    vmov.32 q2[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[5]
+; CHECK-NEXT:    vmov.32 q2[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[6]
+; CHECK-NEXT:    vmov.32 q2[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[7]
+; CHECK-NEXT:    vmov.32 q2[3], r0
 ; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    vmovlb.s16 q4, q0
-; CHECK-NEXT:    vmovlb.s8 q5, q5
-; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    vmovlb.s16 q2, q5
-; CHECK-NEXT:    vstrw.32 q0, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vmovlb.s8 q2, q2
+; CHECK-NEXT:    vmovlb.s16 q5, q0
+; CHECK-NEXT:    vmovlb.s16 q0, q2
+; CHECK-NEXT:    vmov.i32 q2, #0x0
+; CHECK-NEXT:    vmov q6, q2
+; CHECK-NEXT:    vmov.u8 r0, q7[8]
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmult.i32 q0, q2, q4
-; CHECK-NEXT:    vmov.u8 r0, q6[0]
-; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmult.i32 q6, q0, q5
 ; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[1]
+; CHECK-NEXT:    vmov.u8 r0, q7[9]
+; CHECK-NEXT:    vldrw.u32 q5, [sp] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.16 q0[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[2]
+; CHECK-NEXT:    vmov.u8 r0, q7[10]
 ; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[3]
+; CHECK-NEXT:    vmov.u8 r0, q7[11]
 ; CHECK-NEXT:    vmov.16 q0[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[4]
+; CHECK-NEXT:    vmov.u8 r0, q7[12]
 ; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[5]
+; CHECK-NEXT:    vmov.u8 r0, q7[13]
 ; CHECK-NEXT:    vmov.16 q0[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[6]
+; CHECK-NEXT:    vmov.u8 r0, q7[14]
 ; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q6[7]
+; CHECK-NEXT:    vmov.u8 r0, q7[15]
 ; CHECK-NEXT:    vmov.16 q0[7], r0
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT:    vcmp.i16 ne, q0, zr
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q5, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vpsel q4, q2, q0
-; CHECK-NEXT:    vmov.u16 r0, q4[4]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vpsel q5, q5, q0
+; CHECK-NEXT:    vmov.u16 r0, q5[4]
 ; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[5]
+; CHECK-NEXT:    vmov.u16 r0, q5[5]
 ; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[6]
+; CHECK-NEXT:    vmov.u16 r0, q5[6]
 ; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[7]
+; CHECK-NEXT:    vmov.u16 r0, q5[7]
 ; CHECK-NEXT:    vmov.32 q0[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[4]
+; CHECK-NEXT:    vmov.u8 r0, q1[12]
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
 ; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[5]
+; CHECK-NEXT:    vmov.u8 r0, q1[13]
 ; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[6]
+; CHECK-NEXT:    vmov.u8 r0, q1[14]
 ; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[7]
+; CHECK-NEXT:    vmov.u8 r0, q1[15]
 ; CHECK-NEXT:    vmov.32 q0[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[4]
-; CHECK-NEXT:    vmov.32 q6[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[5]
-; CHECK-NEXT:    vmov.32 q6[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[6]
-; CHECK-NEXT:    vmov.32 q6[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[7]
-; CHECK-NEXT:    vmov.32 q6[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[12]
+; CHECK-NEXT:    vmov.32 q7[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[13]
+; CHECK-NEXT:    vmov.32 q7[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[14]
+; CHECK-NEXT:    vmov.32 q7[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[15]
+; CHECK-NEXT:    vmov.32 q7[3], r0
 ; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    vmovlb.s8 q6, q6
+; CHECK-NEXT:    vmovlb.s8 q7, q7
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vmovlb.s16 q6, q6
+; CHECK-NEXT:    vmovlb.s16 q7, q7
+; CHECK-NEXT:    vmov.u16 r0, q4[0]
+; CHECK-NEXT:    vmul.i32 q0, q7, q0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmult.i32 q5, q6, q0
-; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov.u16 r0, q7[0]
-; CHECK-NEXT:    vadd.i32 q6, q5, q0
+; CHECK-NEXT:    vaddt.i32 q6, q6, q0
 ; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q7[1]
+; CHECK-NEXT:    vmov.u16 r0, q4[1]
 ; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q7[2]
+; CHECK-NEXT:    vmov.u16 r0, q4[2]
 ; CHECK-NEXT:    vmov.32 q0[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q7[3]
+; CHECK-NEXT:    vmov.u16 r0, q4[3]
+; CHECK-NEXT:    vmov.32 q0[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[0]
+; CHECK-NEXT:    vcmp.i32 ne, q0, zr
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[1]
+; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[2]
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[3]
+; CHECK-NEXT:    vmov.32 q0[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[0]
+; CHECK-NEXT:    vmov.32 q4[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[1]
+; CHECK-NEXT:    vmov.32 q4[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[2]
+; CHECK-NEXT:    vmov.32 q4[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q3[3]
+; CHECK-NEXT:    vmov.32 q4[3], r0
+; CHECK-NEXT:    vmovlb.s8 q0, q0
+; CHECK-NEXT:    vmovlb.s8 q4, q4
+; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    vmov.u16 r0, q5[0]
+; CHECK-NEXT:    vmovlb.s16 q4, q4
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmult.i32 q2, q4, q0
+; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[1]
+; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[2]
+; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov.u16 r0, q5[3]
 ; CHECK-NEXT:    vmov.32 q0[3], r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[8]
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
 ; CHECK-NEXT:    vmov.32 q0[0], r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[9]
-; CHECK-NEXT:    vldrw.u32 q7, [sp, #48] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.32 q0[1], r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[10]
 ; CHECK-NEXT:    vmov.32 q0[2], r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[11]
 ; CHECK-NEXT:    vmov.32 q0[3], r0
 ; CHECK-NEXT:    vmov.u8 r0, q3[8]
-; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    vmovlb.s16 q2, q0
-; CHECK-NEXT:    vmov.32 q0[0], r0
+; CHECK-NEXT:    vmov.32 q1[0], r0
 ; CHECK-NEXT:    vmov.u8 r0, q3[9]
-; CHECK-NEXT:    vmov.32 q0[1], r0
+; CHECK-NEXT:    vmov.32 q1[1], r0
 ; CHECK-NEXT:    vmov.u8 r0, q3[10]
-; CHECK-NEXT:    vmov.32 q0[2], r0
+; CHECK-NEXT:    vmov.32 q1[2], r0
 ; CHECK-NEXT:    vmov.u8 r0, q3[11]
-; CHECK-NEXT:    vmov.32 q0[3], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[0]
+; CHECK-NEXT:    vmov.32 q1[3], r0
 ; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    vmovlb.s16 q5, q0
-; CHECK-NEXT:    vmov q0, q7
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmult.i32 q0, q5, q2
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[1]
-; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[2]
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.u16 r0, q4[3]
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[0]
-; CHECK-NEXT:    vcmp.i32 ne, q2, zr
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[1]
-; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[2]
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[3]
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[0]
-; CHECK-NEXT:    vmovlb.s8 q1, q2
-; CHECK-NEXT:    vmov.32 q2[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[1]
+; CHECK-NEXT:    vmovlb.s8 q1, q1
+; CHECK-NEXT:    vmovlb.s16 q0, q0
 ; CHECK-NEXT:    vmovlb.s16 q1, q1
-; CHECK-NEXT:    vmov.32 q2[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[2]
-; CHECK-NEXT:    vmov.32 q2[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q3[3]
-; CHECK-NEXT:    vmov.32 q2[3], r0
-; CHECK-NEXT:    vmov q3, q7
-; CHECK-NEXT:    vmovlb.s8 q2, q2
-; CHECK-NEXT:    vmovlb.s16 q2, q2
+; CHECK-NEXT:    vmul.i32 q0, q1, q0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmult.i32 q3, q2, q1
-; CHECK-NEXT:    vadd.i32 q0, q3, q0
-; CHECK-NEXT:    vadd.i32 q0, q0, q6
+; CHECK-NEXT:    vaddt.i32 q2, q2, q0
+; CHECK-NEXT:    vadd.i32 q0, q2, q6
 ; CHECK-NEXT:    vaddv.u32 r0, q0
-; CHECK-NEXT:    add sp, #64
+; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1349,123 +1337,120 @@ entry:
 define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
 ; CHECK-LABEL: add_v16i8_v16i16_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vcmp.i8 eq, q2, zr
 ; CHECK-NEXT:    vmov.i8 q2, #0x0
 ; CHECK-NEXT:    vmov.i8 q3, #0xff
 ; CHECK-NEXT:    vpsel q2, q3, q2
-; CHECK-NEXT:    vmov.u8 r0, q2[8]
+; CHECK-NEXT:    vmov.u8 r0, q2[0]
 ; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[9]
+; CHECK-NEXT:    vmov.u8 r0, q2[1]
 ; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[10]
+; CHECK-NEXT:    vmov.u8 r0, q2[2]
 ; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[11]
+; CHECK-NEXT:    vmov.u8 r0, q2[3]
 ; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[12]
+; CHECK-NEXT:    vmov.u8 r0, q2[4]
 ; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[13]
+; CHECK-NEXT:    vmov.u8 r0, q2[5]
 ; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[14]
+; CHECK-NEXT:    vmov.u8 r0, q2[6]
 ; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[15]
+; CHECK-NEXT:    vmov.u8 r0, q2[7]
 ; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[8]
+; CHECK-NEXT:    vmov.u8 r0, q1[0]
 ; CHECK-NEXT:    vcmp.i16 ne, q3, zr
 ; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[9]
+; CHECK-NEXT:    vmov.u8 r0, q1[1]
 ; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[10]
+; CHECK-NEXT:    vmov.u8 r0, q1[2]
 ; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[11]
+; CHECK-NEXT:    vmov.u8 r0, q1[3]
 ; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[12]
+; CHECK-NEXT:    vmov.u8 r0, q1[4]
 ; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[13]
+; CHECK-NEXT:    vmov.u8 r0, q1[5]
 ; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[14]
+; CHECK-NEXT:    vmov.u8 r0, q1[6]
 ; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[15]
+; CHECK-NEXT:    vmov.u8 r0, q1[7]
 ; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[8]
-; CHECK-NEXT:    vmovlb.u8 q5, q3
+; CHECK-NEXT:    vmov.u8 r0, q0[0]
+; CHECK-NEXT:    vmovlb.u8 q4, q3
 ; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[9]
+; CHECK-NEXT:    vmov.u8 r0, q0[1]
 ; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[10]
+; CHECK-NEXT:    vmov.u8 r0, q0[2]
 ; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[11]
+; CHECK-NEXT:    vmov.u8 r0, q0[3]
 ; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[12]
+; CHECK-NEXT:    vmov.u8 r0, q0[4]
 ; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[13]
+; CHECK-NEXT:    vmov.u8 r0, q0[5]
 ; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[14]
+; CHECK-NEXT:    vmov.u8 r0, q0[6]
 ; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[15]
+; CHECK-NEXT:    vmov.u8 r0, q0[7]
 ; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[0]
-; CHECK-NEXT:    vmovlb.u8 q6, q3
+; CHECK-NEXT:    vmov.u8 r0, q2[8]
+; CHECK-NEXT:    vmovlb.u8 q5, q3
 ; CHECK-NEXT:    vmov.i32 q3, #0x0
-; CHECK-NEXT:    vmov q4, q3
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmult.i16 q4, q6, q5
-; CHECK-NEXT:    vmov.16 q5[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[1]
-; CHECK-NEXT:    vmov.16 q5[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[2]
-; CHECK-NEXT:    vmov.16 q5[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[3]
-; CHECK-NEXT:    vmov.16 q5[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[4]
-; CHECK-NEXT:    vmov.16 q5[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[5]
-; CHECK-NEXT:    vmov.16 q5[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[6]
-; CHECK-NEXT:    vmov.16 q5[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[7]
-; CHECK-NEXT:    vmov.16 q5[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[0]
-; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[1]
-; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[2]
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[3]
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[4]
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[5]
-; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[6]
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[7]
-; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[0]
-; CHECK-NEXT:    vmovlb.u8 q1, q2
+; CHECK-NEXT:    vmult.i16 q3, q5, q4
+; CHECK-NEXT:    vmov.16 q4[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[9]
+; CHECK-NEXT:    vmov.16 q4[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[10]
+; CHECK-NEXT:    vmov.16 q4[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[11]
+; CHECK-NEXT:    vmov.16 q4[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[12]
+; CHECK-NEXT:    vmov.16 q4[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[13]
+; CHECK-NEXT:    vmov.16 q4[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[14]
+; CHECK-NEXT:    vmov.16 q4[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[15]
+; CHECK-NEXT:    vmov.16 q4[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[8]
 ; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[1]
+; CHECK-NEXT:    vmov.u8 r0, q1[9]
 ; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[2]
+; CHECK-NEXT:    vmov.u8 r0, q1[10]
 ; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[3]
+; CHECK-NEXT:    vmov.u8 r0, q1[11]
 ; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[4]
+; CHECK-NEXT:    vmov.u8 r0, q1[12]
 ; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[5]
+; CHECK-NEXT:    vmov.u8 r0, q1[13]
 ; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[6]
+; CHECK-NEXT:    vmov.u8 r0, q1[14]
 ; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[7]
+; CHECK-NEXT:    vmov.u8 r0, q1[15]
 ; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmovlb.u8 q0, q2
-; CHECK-NEXT:    vpt.i16 ne, q5, zr
-; CHECK-NEXT:    vmult.i16 q3, q0, q1
-; CHECK-NEXT:    vadd.i16 q0, q3, q4
-; CHECK-NEXT:    vaddv.u16 r0, q0
+; CHECK-NEXT:    vmov.u8 r0, q0[8]
+; CHECK-NEXT:    vmov.16 q1[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[9]
+; CHECK-NEXT:    vmov.16 q1[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[10]
+; CHECK-NEXT:    vmov.16 q1[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[11]
+; CHECK-NEXT:    vmov.16 q1[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[12]
+; CHECK-NEXT:    vmov.16 q1[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[13]
+; CHECK-NEXT:    vmov.16 q1[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[14]
+; CHECK-NEXT:    vmov.16 q1[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[15]
+; CHECK-NEXT:    vmov.16 q1[7], r0
+; CHECK-NEXT:    vmullb.u8 q0, q1, q2
+; CHECK-NEXT:    vpt.i16 ne, q4, zr
+; CHECK-NEXT:    vaddt.i16 q3, q3, q0
+; CHECK-NEXT:    vaddv.u16 r0, q3
 ; CHECK-NEXT:    uxth r0, r0
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
@@ -1480,123 +1465,120 @@ entry:
 define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
 ; CHECK-LABEL: add_v16i8_v16i16_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vcmp.i8 eq, q2, zr
 ; CHECK-NEXT:    vmov.i8 q2, #0x0
 ; CHECK-NEXT:    vmov.i8 q3, #0xff
 ; CHECK-NEXT:    vpsel q2, q3, q2
-; CHECK-NEXT:    vmov.u8 r0, q2[8]
+; CHECK-NEXT:    vmov.u8 r0, q2[0]
 ; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[9]
+; CHECK-NEXT:    vmov.u8 r0, q2[1]
 ; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[10]
+; CHECK-NEXT:    vmov.u8 r0, q2[2]
 ; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[11]
+; CHECK-NEXT:    vmov.u8 r0, q2[3]
 ; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[12]
+; CHECK-NEXT:    vmov.u8 r0, q2[4]
 ; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[13]
+; CHECK-NEXT:    vmov.u8 r0, q2[5]
 ; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[14]
+; CHECK-NEXT:    vmov.u8 r0, q2[6]
 ; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[15]
+; CHECK-NEXT:    vmov.u8 r0, q2[7]
 ; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[8]
+; CHECK-NEXT:    vmov.u8 r0, q1[0]
 ; CHECK-NEXT:    vcmp.i16 ne, q3, zr
 ; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[9]
+; CHECK-NEXT:    vmov.u8 r0, q1[1]
 ; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[10]
+; CHECK-NEXT:    vmov.u8 r0, q1[2]
 ; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[11]
+; CHECK-NEXT:    vmov.u8 r0, q1[3]
 ; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[12]
+; CHECK-NEXT:    vmov.u8 r0, q1[4]
 ; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[13]
+; CHECK-NEXT:    vmov.u8 r0, q1[5]
 ; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[14]
+; CHECK-NEXT:    vmov.u8 r0, q1[6]
 ; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[15]
+; CHECK-NEXT:    vmov.u8 r0, q1[7]
 ; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[8]
-; CHECK-NEXT:    vmovlb.s8 q5, q3
+; CHECK-NEXT:    vmov.u8 r0, q0[0]
+; CHECK-NEXT:    vmovlb.s8 q4, q3
 ; CHECK-NEXT:    vmov.16 q3[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[9]
+; CHECK-NEXT:    vmov.u8 r0, q0[1]
 ; CHECK-NEXT:    vmov.16 q3[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[10]
+; CHECK-NEXT:    vmov.u8 r0, q0[2]
 ; CHECK-NEXT:    vmov.16 q3[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[11]
+; CHECK-NEXT:    vmov.u8 r0, q0[3]
 ; CHECK-NEXT:    vmov.16 q3[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[12]
-; CHECK-NEXT:    vmov.16 q3[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[13]
-; CHECK-NEXT:    vmov.16 q3[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[14]
-; CHECK-NEXT:    vmov.16 q3[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[15]
-; CHECK-NEXT:    vmov.16 q3[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[0]
-; CHECK-NEXT:    vmovlb.s8 q6, q3
-; CHECK-NEXT:    vmov.i32 q3, #0x0
-; CHECK-NEXT:    vmov q4, q3
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmult.i16 q4, q6, q5
-; CHECK-NEXT:    vmov.16 q5[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[1]
-; CHECK-NEXT:    vmov.16 q5[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[2]
-; CHECK-NEXT:    vmov.16 q5[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[3]
-; CHECK-NEXT:    vmov.16 q5[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[4]
-; CHECK-NEXT:    vmov.16 q5[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[5]
-; CHECK-NEXT:    vmov.16 q5[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[6]
-; CHECK-NEXT:    vmov.16 q5[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q2[7]
-; CHECK-NEXT:    vmov.16 q5[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[0]
-; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[1]
-; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[2]
-; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[3]
-; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[4]
-; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[5]
-; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[6]
-; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q1[7]
-; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[0]
-; CHECK-NEXT:    vmovlb.s8 q1, q2
+; CHECK-NEXT:    vmov.u8 r0, q0[4]
+; CHECK-NEXT:    vmov.16 q3[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[5]
+; CHECK-NEXT:    vmov.16 q3[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[6]
+; CHECK-NEXT:    vmov.16 q3[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[7]
+; CHECK-NEXT:    vmov.16 q3[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[8]
+; CHECK-NEXT:    vmovlb.s8 q5, q3
+; CHECK-NEXT:    vmov.i32 q3, #0x0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmult.i16 q3, q5, q4
+; CHECK-NEXT:    vmov.16 q4[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[9]
+; CHECK-NEXT:    vmov.16 q4[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[10]
+; CHECK-NEXT:    vmov.16 q4[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[11]
+; CHECK-NEXT:    vmov.16 q4[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[12]
+; CHECK-NEXT:    vmov.16 q4[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[13]
+; CHECK-NEXT:    vmov.16 q4[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[14]
+; CHECK-NEXT:    vmov.16 q4[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[15]
+; CHECK-NEXT:    vmov.16 q4[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q1[8]
 ; CHECK-NEXT:    vmov.16 q2[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[1]
+; CHECK-NEXT:    vmov.u8 r0, q1[9]
 ; CHECK-NEXT:    vmov.16 q2[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[2]
+; CHECK-NEXT:    vmov.u8 r0, q1[10]
 ; CHECK-NEXT:    vmov.16 q2[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[3]
+; CHECK-NEXT:    vmov.u8 r0, q1[11]
 ; CHECK-NEXT:    vmov.16 q2[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[4]
+; CHECK-NEXT:    vmov.u8 r0, q1[12]
 ; CHECK-NEXT:    vmov.16 q2[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[5]
+; CHECK-NEXT:    vmov.u8 r0, q1[13]
 ; CHECK-NEXT:    vmov.16 q2[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[6]
+; CHECK-NEXT:    vmov.u8 r0, q1[14]
 ; CHECK-NEXT:    vmov.16 q2[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[7]
+; CHECK-NEXT:    vmov.u8 r0, q1[15]
 ; CHECK-NEXT:    vmov.16 q2[7], r0
-; CHECK-NEXT:    vmovlb.s8 q0, q2
-; CHECK-NEXT:    vpt.i16 ne, q5, zr
-; CHECK-NEXT:    vmult.i16 q3, q0, q1
-; CHECK-NEXT:    vadd.i16 q0, q3, q4
-; CHECK-NEXT:    vaddv.u16 r0, q0
+; CHECK-NEXT:    vmov.u8 r0, q0[8]
+; CHECK-NEXT:    vmov.16 q1[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[9]
+; CHECK-NEXT:    vmov.16 q1[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[10]
+; CHECK-NEXT:    vmov.16 q1[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[11]
+; CHECK-NEXT:    vmov.16 q1[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[12]
+; CHECK-NEXT:    vmov.16 q1[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[13]
+; CHECK-NEXT:    vmov.16 q1[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[14]
+; CHECK-NEXT:    vmov.16 q1[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q0[15]
+; CHECK-NEXT:    vmov.16 q1[7], r0
+; CHECK-NEXT:    vmullb.s8 q0, q1, q2
+; CHECK-NEXT:    vpt.i16 ne, q4, zr
+; CHECK-NEXT:    vaddt.i16 q3, q3, q0
+; CHECK-NEXT:    vaddv.u16 r0, q3
 ; CHECK-NEXT:    sxth r0, r0
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
@@ -2825,74 +2807,71 @@ entry:
 define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i32 %a) {
 ; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov.i8 q3, #0x0
 ; CHECK-NEXT:    vmov.i8 q4, #0xff
 ; CHECK-NEXT:    vcmp.i16 eq, q2, zr
 ; CHECK-NEXT:    vpsel q2, q4, q3
-; CHECK-NEXT:    vmov.u16 r1, q2[4]
+; CHECK-NEXT:    vmov.u16 r1, q2[0]
 ; CHECK-NEXT:    vmov.32 q3[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q2[5]
+; CHECK-NEXT:    vmov.u16 r1, q2[1]
 ; CHECK-NEXT:    vmov.32 q3[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q2[6]
+; CHECK-NEXT:    vmov.u16 r1, q2[2]
 ; CHECK-NEXT:    vmov.32 q3[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q2[7]
+; CHECK-NEXT:    vmov.u16 r1, q2[3]
 ; CHECK-NEXT:    vmov.32 q3[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[4]
+; CHECK-NEXT:    vmov.u16 r1, q1[0]
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
 ; CHECK-NEXT:    vmov.32 q3[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[5]
+; CHECK-NEXT:    vmov.u16 r1, q1[1]
 ; CHECK-NEXT:    vmov.32 q3[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[6]
+; CHECK-NEXT:    vmov.u16 r1, q1[2]
 ; CHECK-NEXT:    vmov.32 q3[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[7]
+; CHECK-NEXT:    vmov.u16 r1, q1[3]
 ; CHECK-NEXT:    vmov.32 q3[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    vmovlb.u16 q5, q3
+; CHECK-NEXT:    vmov.u16 r1, q0[0]
+; CHECK-NEXT:    vmovlb.u16 q4, q3
 ; CHECK-NEXT:    vmov.32 q3[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
+; CHECK-NEXT:    vmov.u16 r1, q0[1]
 ; CHECK-NEXT:    vmov.32 q3[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
+; CHECK-NEXT:    vmov.u16 r1, q0[2]
 ; CHECK-NEXT:    vmov.32 q3[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    vmov.u16 r1, q0[3]
 ; CHECK-NEXT:    vmov.32 q3[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q2[0]
-; CHECK-NEXT:    vmovlb.u16 q6, q3
+; CHECK-NEXT:    vmov.u16 r1, q2[4]
+; CHECK-NEXT:    vmovlb.u16 q5, q3
 ; CHECK-NEXT:    vmov.i32 q3, #0x0
-; CHECK-NEXT:    vmov q4, q3
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmult.i32 q4, q6, q5
-; CHECK-NEXT:    vmov.32 q5[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q2[1]
-; CHECK-NEXT:    vmov.32 q5[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q2[2]
-; CHECK-NEXT:    vmov.32 q5[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q2[3]
-; CHECK-NEXT:    vmov.32 q5[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[0]
-; CHECK-NEXT:    vmov.32 q2[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[1]
-; CHECK-NEXT:    vmov.32 q2[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[2]
-; CHECK-NEXT:    vmov.32 q2[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[3]
-; CHECK-NEXT:    vmov.32 q2[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmovlb.u16 q1, q2
+; CHECK-NEXT:    vmult.i32 q3, q5, q4
+; CHECK-NEXT:    vmov.32 q4[0], r1
+; CHECK-NEXT:    vmov.u16 r1, q2[5]
+; CHECK-NEXT:    vmov.32 q4[1], r1
+; CHECK-NEXT:    vmov.u16 r1, q2[6]
+; CHECK-NEXT:    vmov.32 q4[2], r1
+; CHECK-NEXT:    vmov.u16 r1, q2[7]
+; CHECK-NEXT:    vmov.32 q4[3], r1
+; CHECK-NEXT:    vmov.u16 r1, q1[4]
 ; CHECK-NEXT:    vmov.32 q2[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
+; CHECK-NEXT:    vmov.u16 r1, q1[5]
 ; CHECK-NEXT:    vmov.32 q2[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
+; CHECK-NEXT:    vmov.u16 r1, q1[6]
 ; CHECK-NEXT:    vmov.32 q2[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
+; CHECK-NEXT:    vmov.u16 r1, q1[7]
 ; CHECK-NEXT:    vmov.32 q2[3], r1
-; CHECK-NEXT:    vmovlb.u16 q0, q2
-; CHECK-NEXT:    vpt.i32 ne, q5, zr
-; CHECK-NEXT:    vmult.i32 q3, q0, q1
-; CHECK-NEXT:    vadd.i32 q0, q3, q4
-; CHECK-NEXT:    vaddva.u32 r0, q0
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vmov.u16 r1, q0[4]
+; CHECK-NEXT:    vmov.32 q1[0], r1
+; CHECK-NEXT:    vmov.u16 r1, q0[5]
+; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    vmov.u16 r1, q0[6]
+; CHECK-NEXT:    vmov.32 q1[2], r1
+; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    vmov.32 q1[3], r1
+; CHECK-NEXT:    vmullb.u16 q0, q1, q2
+; CHECK-NEXT:    vpt.i32 ne, q4, zr
+; CHECK-NEXT:    vaddt.i32 q3, q3, q0
+; CHECK-NEXT:    vaddva.u32 r0, q3
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <8 x i16> %b, zeroinitializer
@@ -2908,74 +2887,71 @@ entry:
 define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i32 %a) {
 ; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vmov.i8 q3, #0x0
 ; CHECK-NEXT:    vmov.i8 q4, #0xff
 ; CHECK-NEXT:    vcmp.i16 eq, q2, zr
 ; CHECK-NEXT:    vpsel q2, q4, q3
-; CHECK-NEXT:    vmov.u16 r1, q2[4]
+; CHECK-NEXT:    vmov.u16 r1, q2[0]
 ; CHECK-NEXT:    vmov.32 q3[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q2[5]
+; CHECK-NEXT:    vmov.u16 r1, q2[1]
 ; CHECK-NEXT:    vmov.32 q3[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q2[6]
+; CHECK-NEXT:    vmov.u16 r1, q2[2]
 ; CHECK-NEXT:    vmov.32 q3[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q2[7]
+; CHECK-NEXT:    vmov.u16 r1, q2[3]
 ; CHECK-NEXT:    vmov.32 q3[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[4]
+; CHECK-NEXT:    vmov.u16 r1, q1[0]
 ; CHECK-NEXT:    vcmp.i32 ne, q3, zr
 ; CHECK-NEXT:    vmov.32 q3[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[5]
+; CHECK-NEXT:    vmov.u16 r1, q1[1]
 ; CHECK-NEXT:    vmov.32 q3[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[6]
+; CHECK-NEXT:    vmov.u16 r1, q1[2]
 ; CHECK-NEXT:    vmov.32 q3[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[7]
+; CHECK-NEXT:    vmov.u16 r1, q1[3]
 ; CHECK-NEXT:    vmov.32 q3[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    vmovlb.s16 q5, q3
+; CHECK-NEXT:    vmov.u16 r1, q0[0]
+; CHECK-NEXT:    vmovlb.s16 q4, q3
 ; CHECK-NEXT:    vmov.32 q3[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
+; CHECK-NEXT:    vmov.u16 r1, q0[1]
 ; CHECK-NEXT:    vmov.32 q3[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
+; CHECK-NEXT:    vmov.u16 r1, q0[2]
 ; CHECK-NEXT:    vmov.32 q3[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    vmov.u16 r1, q0[3]
 ; CHECK-NEXT:    vmov.32 q3[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q2[0]
-; CHECK-NEXT:    vmovlb.s16 q6, q3
+; CHECK-NEXT:    vmov.u16 r1, q2[4]
+; CHECK-NEXT:    vmovlb.s16 q5, q3
 ; CHECK-NEXT:    vmov.i32 q3, #0x0
-; CHECK-NEXT:    vmov q4, q3
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmult.i32 q4, q6, q5
-; CHECK-NEXT:    vmov.32 q5[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q2[1]
-; CHECK-NEXT:    vmov.32 q5[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q2[2]
-; CHECK-NEXT:    vmov.32 q5[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q2[3]
-; CHECK-NEXT:    vmov.32 q5[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[0]
-; CHECK-NEXT:    vmov.32 q2[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[1]
-; CHECK-NEXT:    vmov.32 q2[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[2]
-; CHECK-NEXT:    vmov.32 q2[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q1[3]
-; CHECK-NEXT:    vmov.32 q2[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vmovlb.s16 q1, q2
+; CHECK-NEXT:    vmult.i32 q3, q5, q4
+; CHECK-NEXT:    vmov.32 q4[0], r1
+; CHECK-NEXT:    vmov.u16 r1, q2[5]
+; CHECK-NEXT:    vmov.32 q4[1], r1
+; CHECK-NEXT:    vmov.u16 r1, q2[6]
+; CHECK-NEXT:    vmov.32 q4[2], r1
+; CHECK-NEXT:    vmov.u16 r1, q2[7]
+; CHECK-NEXT:    vmov.32 q4[3], r1
+; CHECK-NEXT:    vmov.u16 r1, q1[4]
 ; CHECK-NEXT:    vmov.32 q2[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
+; CHECK-NEXT:    vmov.u16 r1, q1[5]
 ; CHECK-NEXT:    vmov.32 q2[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
+; CHECK-NEXT:    vmov.u16 r1, q1[6]
 ; CHECK-NEXT:    vmov.32 q2[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
+; CHECK-NEXT:    vmov.u16 r1, q1[7]
 ; CHECK-NEXT:    vmov.32 q2[3], r1
-; CHECK-NEXT:    vmovlb.s16 q0, q2
-; CHECK-NEXT:    vpt.i32 ne, q5, zr
-; CHECK-NEXT:    vmult.i32 q3, q0, q1
-; CHECK-NEXT:    vadd.i32 q0, q3, q4
-; CHECK-NEXT:    vaddva.u32 r0, q0
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vmov.u16 r1, q0[4]
+; CHECK-NEXT:    vmov.32 q1[0], r1
+; CHECK-NEXT:    vmov.u16 r1, q0[5]
+; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    vmov.u16 r1, q0[6]
+; CHECK-NEXT:    vmov.32 q1[2], r1
+; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    vmov.32 q1[3], r1
+; CHECK-NEXT:    vmullb.s16 q0, q1, q2
+; CHECK-NEXT:    vpt.i32 ne, q4, zr
+; CHECK-NEXT:    vaddt.i32 q3, q3, q0
+; CHECK-NEXT:    vaddva.u32 r0, q3
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <8 x i16> %b, zeroinitializer
@@ -3533,86 +3509,36 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    .pad #64
 ; CHECK-NEXT:    sub sp, #64
-; CHECK-NEXT:    vmov q3, q0
-; CHECK-NEXT:    vcmp.i8 eq, q2, zr
-; CHECK-NEXT:    vmov.i8 q2, #0xff
-; CHECK-NEXT:    vmov.i8 q0, #0x0
-; CHECK-NEXT:    vpsel q7, q2, q0
-; CHECK-NEXT:    vmov q4, q2
-; CHECK-NEXT:    vmov.u8 r1, q7[8]
-; CHECK-NEXT:    vstrw.32 q2, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vmov.16 q2[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q7[9]
-; CHECK-NEXT:    vmov.16 q2[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q7[10]
-; CHECK-NEXT:    vmov.16 q2[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q7[11]
-; CHECK-NEXT:    vmov.16 q2[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q7[12]
-; CHECK-NEXT:    vmov.16 q2[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q7[13]
-; CHECK-NEXT:    vmov.16 q2[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q7[14]
-; CHECK-NEXT:    vmov.16 q2[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q7[15]
-; CHECK-NEXT:    vmov.16 q2[7], r1
-; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vcmp.i16 ne, q2, zr
-; CHECK-NEXT:    vmov.i32 q2, #0xff
-; CHECK-NEXT:    vpsel q4, q4, q0
-; CHECK-NEXT:    vmov.u16 r1, q4[4]
-; CHECK-NEXT:    vmov.32 q0[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q4[5]
-; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q4[6]
-; CHECK-NEXT:    vmov.32 q0[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q4[7]
-; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[12]
-; CHECK-NEXT:    vcmp.i32 ne, q0, zr
-; CHECK-NEXT:    vmov.32 q0[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[13]
-; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[14]
-; CHECK-NEXT:    vmov.32 q0[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[15]
-; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[12]
-; CHECK-NEXT:    vmov.32 q6[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[13]
-; CHECK-NEXT:    vmov.32 q6[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[14]
-; CHECK-NEXT:    vmov.32 q6[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[15]
-; CHECK-NEXT:    vmov.32 q6[3], r1
-; CHECK-NEXT:    vand q5, q0, q2
-; CHECK-NEXT:    vand q0, q6, q2
-; CHECK-NEXT:    vmov.i32 q6, #0x0
-; CHECK-NEXT:    vmov.u8 r1, q7[0]
-; CHECK-NEXT:    vstrw.32 q6, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmult.i32 q6, q0, q5
-; CHECK-NEXT:    vmov.16 q0[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q7[1]
-; CHECK-NEXT:    vldrw.u32 q5, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vmov.16 q0[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q7[2]
-; CHECK-NEXT:    vmov.16 q0[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q7[3]
-; CHECK-NEXT:    vmov.16 q0[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q7[4]
-; CHECK-NEXT:    vmov.16 q0[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q7[5]
-; CHECK-NEXT:    vmov.16 q0[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q7[6]
-; CHECK-NEXT:    vmov.16 q0[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q7[7]
-; CHECK-NEXT:    vmov.16 q0[7], r1
-; CHECK-NEXT:    vstrw.32 q6, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vcmp.i16 ne, q0, zr
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q6, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vmov q3, q2
+; CHECK-NEXT:    vmov q4, q0
+; CHECK-NEXT:    vcmp.i8 eq, q3, zr
+; CHECK-NEXT:    vmov.i8 q3, #0xff
+; CHECK-NEXT:    vmov.i8 q0, #0x0
+; CHECK-NEXT:    vmov q2, q1
+; CHECK-NEXT:    vpsel q1, q3, q0
+; CHECK-NEXT:    vmov q5, q3
+; CHECK-NEXT:    vmov.u8 r1, q1[0]
+; CHECK-NEXT:    vstrw.32 q3, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.16 q3[0], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[1]
+; CHECK-NEXT:    vmov.16 q3[1], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[2]
+; CHECK-NEXT:    vmov.16 q3[2], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[3]
+; CHECK-NEXT:    vmov.16 q3[3], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[4]
+; CHECK-NEXT:    vmov.16 q3[4], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[5]
+; CHECK-NEXT:    vmov.16 q3[5], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[6]
+; CHECK-NEXT:    vmov.16 q3[6], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[7]
+; CHECK-NEXT:    vmov.16 q3[7], r1
+; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vcmp.i16 ne, q3, zr
+; CHECK-NEXT:    vmov.i32 q3, #0xff
 ; CHECK-NEXT:    vpsel q5, q5, q0
+; CHECK-NEXT:    vstrw.32 q3, [sp] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.u16 r1, q5[4]
 ; CHECK-NEXT:    vmov.32 q0[0], r1
 ; CHECK-NEXT:    vmov.u16 r1, q5[5]
@@ -3621,92 +3547,142 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov.32 q0[2], r1
 ; CHECK-NEXT:    vmov.u16 r1, q5[7]
 ; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[4]
+; CHECK-NEXT:    vmov.u8 r1, q2[4]
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
 ; CHECK-NEXT:    vmov.32 q0[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[5]
+; CHECK-NEXT:    vmov.u8 r1, q2[5]
 ; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[6]
+; CHECK-NEXT:    vmov.u8 r1, q2[6]
 ; CHECK-NEXT:    vmov.32 q0[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[7]
+; CHECK-NEXT:    vmov.u8 r1, q2[7]
 ; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[4]
-; CHECK-NEXT:    vmov.32 q7[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[5]
-; CHECK-NEXT:    vmov.32 q7[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[6]
-; CHECK-NEXT:    vmov.32 q7[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[7]
-; CHECK-NEXT:    vmov.32 q7[3], r1
-; CHECK-NEXT:    vand q0, q0, q2
-; CHECK-NEXT:    vand q7, q7, q2
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmult.i32 q6, q7, q0
-; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov.u16 r1, q4[0]
-; CHECK-NEXT:    vadd.i32 q0, q6, q0
+; CHECK-NEXT:    vmov.u8 r1, q4[4]
 ; CHECK-NEXT:    vmov.32 q6[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q4[1]
-; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov.u8 r1, q4[5]
 ; CHECK-NEXT:    vmov.32 q6[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q4[2]
+; CHECK-NEXT:    vmov.u8 r1, q4[6]
 ; CHECK-NEXT:    vmov.32 q6[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q4[3]
+; CHECK-NEXT:    vmov.u8 r1, q4[7]
 ; CHECK-NEXT:    vmov.32 q6[3], r1
 ; CHECK-NEXT:    vmov.u8 r1, q1[8]
-; CHECK-NEXT:    vmov.32 q4[0], r1
+; CHECK-NEXT:    vand q7, q0, q3
+; CHECK-NEXT:    vmov.16 q0[0], r1
 ; CHECK-NEXT:    vmov.u8 r1, q1[9]
-; CHECK-NEXT:    vmov.32 q4[1], r1
+; CHECK-NEXT:    vand q3, q6, q3
+; CHECK-NEXT:    vmov.16 q0[1], r1
 ; CHECK-NEXT:    vmov.u8 r1, q1[10]
-; CHECK-NEXT:    vmov.32 q4[2], r1
+; CHECK-NEXT:    vmov.16 q0[2], r1
 ; CHECK-NEXT:    vmov.u8 r1, q1[11]
-; CHECK-NEXT:    vmov.32 q4[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[8]
-; CHECK-NEXT:    vcmp.i32 ne, q6, zr
-; CHECK-NEXT:    vand q6, q4, q2
-; CHECK-NEXT:    vmov.32 q4[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[9]
-; CHECK-NEXT:    vmov.32 q4[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[10]
-; CHECK-NEXT:    vmov.32 q4[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[11]
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vmov.32 q4[3], r1
-; CHECK-NEXT:    vand q7, q4, q2
+; CHECK-NEXT:    vmov.16 q0[3], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[12]
+; CHECK-NEXT:    vmov.16 q0[4], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[13]
+; CHECK-NEXT:    vmov.16 q0[5], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[14]
+; CHECK-NEXT:    vmov.16 q0[6], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[15]
+; CHECK-NEXT:    vmov.i32 q6, #0x0
+; CHECK-NEXT:    vmov.16 q0[7], r1
+; CHECK-NEXT:    vstrw.32 q6, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmult.i32 q6, q3, q7
+; CHECK-NEXT:    vcmp.i16 ne, q0, zr
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q3, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vpsel q0, q1, q0
+; CHECK-NEXT:    vmov.u16 r1, q0[4]
+; CHECK-NEXT:    vmov.32 q1[0], r1
+; CHECK-NEXT:    vmov.u16 r1, q0[5]
+; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    vmov.u16 r1, q0[6]
+; CHECK-NEXT:    vmov.32 q1[2], r1
+; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    vmov.32 q1[3], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[12]
+; CHECK-NEXT:    vcmp.i32 ne, q1, zr
+; CHECK-NEXT:    vmov.32 q1[0], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[13]
+; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[14]
+; CHECK-NEXT:    vmov.32 q1[2], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[15]
+; CHECK-NEXT:    vmov.32 q1[3], r1
+; CHECK-NEXT:    vmov.u8 r1, q4[12]
+; CHECK-NEXT:    vmov.32 q7[0], r1
+; CHECK-NEXT:    vmov.u8 r1, q4[13]
+; CHECK-NEXT:    vmov.32 q7[1], r1
+; CHECK-NEXT:    vmov.u8 r1, q4[14]
+; CHECK-NEXT:    vmov.32 q7[2], r1
+; CHECK-NEXT:    vmov.u8 r1, q4[15]
+; CHECK-NEXT:    vmov.32 q7[3], r1
+; CHECK-NEXT:    vand q1, q1, q3
+; CHECK-NEXT:    vand q7, q7, q3
 ; CHECK-NEXT:    vmov.u16 r1, q5[0]
-; CHECK-NEXT:    vmov q4, q0
+; CHECK-NEXT:    vmul.i32 q1, q7, q1
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmult.i32 q4, q7, q6
-; CHECK-NEXT:    vmov.32 q6[0], r1
+; CHECK-NEXT:    vaddt.i32 q6, q6, q1
+; CHECK-NEXT:    vmov.32 q1[0], r1
 ; CHECK-NEXT:    vmov.u16 r1, q5[1]
-; CHECK-NEXT:    vmov.32 q6[1], r1
+; CHECK-NEXT:    vmov.32 q1[1], r1
 ; CHECK-NEXT:    vmov.u16 r1, q5[2]
-; CHECK-NEXT:    vmov.32 q6[2], r1
+; CHECK-NEXT:    vmov.32 q1[2], r1
 ; CHECK-NEXT:    vmov.u16 r1, q5[3]
-; CHECK-NEXT:    vmov.32 q6[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[0]
-; CHECK-NEXT:    vmov.32 q5[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[1]
-; CHECK-NEXT:    vmov.32 q5[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[2]
-; CHECK-NEXT:    vmov.32 q5[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[3]
-; CHECK-NEXT:    vmov.32 q5[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[0]
-; CHECK-NEXT:    vand q1, q5, q2
+; CHECK-NEXT:    vmov.32 q1[3], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[0]
+; CHECK-NEXT:    vcmp.i32 ne, q1, zr
+; CHECK-NEXT:    vmov.32 q1[0], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[1]
+; CHECK-NEXT:    vldrw.u32 q7, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[2]
+; CHECK-NEXT:    vmov.32 q1[2], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[3]
+; CHECK-NEXT:    vmov.32 q1[3], r1
+; CHECK-NEXT:    vmov.u8 r1, q4[0]
 ; CHECK-NEXT:    vmov.32 q5[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[1]
+; CHECK-NEXT:    vmov.u8 r1, q4[1]
 ; CHECK-NEXT:    vmov.32 q5[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[2]
+; CHECK-NEXT:    vmov.u8 r1, q4[2]
 ; CHECK-NEXT:    vmov.32 q5[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[3]
+; CHECK-NEXT:    vmov.u8 r1, q4[3]
 ; CHECK-NEXT:    vmov.32 q5[3], r1
-; CHECK-NEXT:    vand q2, q5, q2
-; CHECK-NEXT:    vpt.i32 ne, q6, zr
-; CHECK-NEXT:    vmult.i32 q0, q2, q1
-; CHECK-NEXT:    vadd.i32 q1, q0, q4
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vand q1, q1, q3
+; CHECK-NEXT:    vmov.u16 r1, q0[0]
+; CHECK-NEXT:    vand q5, q5, q3
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmult.i32 q7, q5, q1
+; CHECK-NEXT:    vmov.32 q1[0], r1
+; CHECK-NEXT:    vmov.u16 r1, q0[1]
+; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    vmov.u16 r1, q0[2]
+; CHECK-NEXT:    vmov.32 q1[2], r1
+; CHECK-NEXT:    vmov.u16 r1, q0[3]
+; CHECK-NEXT:    vmov.32 q1[3], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[8]
+; CHECK-NEXT:    vmov.32 q0[0], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[9]
+; CHECK-NEXT:    vmov.32 q0[1], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[10]
+; CHECK-NEXT:    vmov.32 q0[2], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[11]
+; CHECK-NEXT:    vmov.32 q0[3], r1
+; CHECK-NEXT:    vmov.u8 r1, q4[8]
+; CHECK-NEXT:    vcmp.i32 ne, q1, zr
+; CHECK-NEXT:    vmov.32 q1[0], r1
+; CHECK-NEXT:    vmov.u8 r1, q4[9]
+; CHECK-NEXT:    vand q0, q0, q3
+; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    vmov.u8 r1, q4[10]
+; CHECK-NEXT:    vmov.32 q1[2], r1
+; CHECK-NEXT:    vmov.u8 r1, q4[11]
+; CHECK-NEXT:    vmov.32 q1[3], r1
+; CHECK-NEXT:    vand q1, q1, q3
+; CHECK-NEXT:    vmul.i32 q0, q1, q0
+; CHECK-NEXT:    vmov q1, q7
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vaddt.i32 q1, q7, q0
+; CHECK-NEXT:    vadd.i32 q0, q1, q6
 ; CHECK-NEXT:    vaddva.u32 r0, q0
 ; CHECK-NEXT:    add sp, #64
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
@@ -3727,191 +3703,185 @@ define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %y
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #64
-; CHECK-NEXT:    sub sp, #64
+; CHECK-NEXT:    .pad #32
+; CHECK-NEXT:    sub sp, #32
 ; CHECK-NEXT:    vmov q3, q0
 ; CHECK-NEXT:    vcmp.i8 eq, q2, zr
-; CHECK-NEXT:    vmov.i8 q2, #0x0
-; CHECK-NEXT:    vmov.i8 q0, #0xff
-; CHECK-NEXT:    vpsel q6, q0, q2
+; CHECK-NEXT:    vmov.i8 q2, #0xff
+; CHECK-NEXT:    vmov.i8 q0, #0x0
+; CHECK-NEXT:    vpsel q7, q2, q0
 ; CHECK-NEXT:    vmov q4, q2
-; CHECK-NEXT:    vmov.u8 r1, q6[8]
-; CHECK-NEXT:    vstrw.32 q2, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov.u8 r1, q7[0]
+; CHECK-NEXT:    vstrw.32 q2, [sp] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.16 q2[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[9]
+; CHECK-NEXT:    vmov.u8 r1, q7[1]
 ; CHECK-NEXT:    vmov.16 q2[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[10]
+; CHECK-NEXT:    vmov.u8 r1, q7[2]
 ; CHECK-NEXT:    vmov.16 q2[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[11]
+; CHECK-NEXT:    vmov.u8 r1, q7[3]
 ; CHECK-NEXT:    vmov.16 q2[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[12]
+; CHECK-NEXT:    vmov.u8 r1, q7[4]
 ; CHECK-NEXT:    vmov.16 q2[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[13]
+; CHECK-NEXT:    vmov.u8 r1, q7[5]
 ; CHECK-NEXT:    vmov.16 q2[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[14]
+; CHECK-NEXT:    vmov.u8 r1, q7[6]
 ; CHECK-NEXT:    vmov.16 q2[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[15]
+; CHECK-NEXT:    vmov.u8 r1, q7[7]
 ; CHECK-NEXT:    vmov.16 q2[7], r1
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT:    vcmp.i16 ne, q2, zr
-; CHECK-NEXT:    vpsel q7, q0, q4
-; CHECK-NEXT:    vmov.u16 r1, q7[4]
+; CHECK-NEXT:    vpsel q4, q4, q0
+; CHECK-NEXT:    vmov.u16 r1, q4[4]
 ; CHECK-NEXT:    vmov.32 q0[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q7[5]
+; CHECK-NEXT:    vmov.u16 r1, q4[5]
 ; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q7[6]
+; CHECK-NEXT:    vmov.u16 r1, q4[6]
 ; CHECK-NEXT:    vmov.32 q0[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q7[7]
+; CHECK-NEXT:    vmov.u16 r1, q4[7]
 ; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[12]
+; CHECK-NEXT:    vmov.u8 r1, q1[4]
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
 ; CHECK-NEXT:    vmov.32 q0[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[13]
+; CHECK-NEXT:    vmov.u8 r1, q1[5]
 ; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[14]
+; CHECK-NEXT:    vmov.u8 r1, q1[6]
 ; CHECK-NEXT:    vmov.32 q0[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[15]
+; CHECK-NEXT:    vmov.u8 r1, q1[7]
 ; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[12]
-; CHECK-NEXT:    vmov.32 q5[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[13]
-; CHECK-NEXT:    vmov.32 q5[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[14]
-; CHECK-NEXT:    vmov.32 q5[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[15]
-; CHECK-NEXT:    vmov.32 q5[3], r1
+; CHECK-NEXT:    vmov.u8 r1, q3[4]
+; CHECK-NEXT:    vmov.32 q2[0], r1
+; CHECK-NEXT:    vmov.u8 r1, q3[5]
+; CHECK-NEXT:    vmov.32 q2[1], r1
+; CHECK-NEXT:    vmov.u8 r1, q3[6]
+; CHECK-NEXT:    vmov.32 q2[2], r1
+; CHECK-NEXT:    vmov.u8 r1, q3[7]
+; CHECK-NEXT:    vmov.32 q2[3], r1
 ; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    vmovlb.s16 q4, q0
-; CHECK-NEXT:    vmovlb.s8 q5, q5
-; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    vmovlb.s16 q2, q5
-; CHECK-NEXT:    vstrw.32 q0, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vmovlb.s8 q2, q2
+; CHECK-NEXT:    vmovlb.s16 q5, q0
+; CHECK-NEXT:    vmovlb.s16 q0, q2
+; CHECK-NEXT:    vmov.i32 q2, #0x0
+; CHECK-NEXT:    vmov q6, q2
+; CHECK-NEXT:    vmov.u8 r1, q7[8]
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmult.i32 q0, q2, q4
-; CHECK-NEXT:    vmov.u8 r1, q6[0]
-; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmult.i32 q6, q0, q5
 ; CHECK-NEXT:    vmov.16 q0[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[1]
+; CHECK-NEXT:    vmov.u8 r1, q7[9]
+; CHECK-NEXT:    vldrw.u32 q5, [sp] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.16 q0[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[2]
+; CHECK-NEXT:    vmov.u8 r1, q7[10]
 ; CHECK-NEXT:    vmov.16 q0[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[3]
+; CHECK-NEXT:    vmov.u8 r1, q7[11]
 ; CHECK-NEXT:    vmov.16 q0[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[4]
+; CHECK-NEXT:    vmov.u8 r1, q7[12]
 ; CHECK-NEXT:    vmov.16 q0[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[5]
+; CHECK-NEXT:    vmov.u8 r1, q7[13]
 ; CHECK-NEXT:    vmov.16 q0[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[6]
+; CHECK-NEXT:    vmov.u8 r1, q7[14]
 ; CHECK-NEXT:    vmov.16 q0[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q6[7]
+; CHECK-NEXT:    vmov.u8 r1, q7[15]
 ; CHECK-NEXT:    vmov.16 q0[7], r1
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT:    vcmp.i16 ne, q0, zr
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q5, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vpsel q4, q2, q0
-; CHECK-NEXT:    vmov.u16 r1, q4[4]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vpsel q5, q5, q0
+; CHECK-NEXT:    vmov.u16 r1, q5[4]
 ; CHECK-NEXT:    vmov.32 q0[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q4[5]
+; CHECK-NEXT:    vmov.u16 r1, q5[5]
 ; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q4[6]
+; CHECK-NEXT:    vmov.u16 r1, q5[6]
 ; CHECK-NEXT:    vmov.32 q0[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q4[7]
+; CHECK-NEXT:    vmov.u16 r1, q5[7]
 ; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[4]
+; CHECK-NEXT:    vmov.u8 r1, q1[12]
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
 ; CHECK-NEXT:    vmov.32 q0[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[5]
+; CHECK-NEXT:    vmov.u8 r1, q1[13]
 ; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[6]
+; CHECK-NEXT:    vmov.u8 r1, q1[14]
 ; CHECK-NEXT:    vmov.32 q0[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[7]
+; CHECK-NEXT:    vmov.u8 r1, q1[15]
 ; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[4]
-; CHECK-NEXT:    vmov.32 q6[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[5]
-; CHECK-NEXT:    vmov.32 q6[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[6]
-; CHECK-NEXT:    vmov.32 q6[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[7]
-; CHECK-NEXT:    vmov.32 q6[3], r1
+; CHECK-NEXT:    vmov.u8 r1, q3[12]
+; CHECK-NEXT:    vmov.32 q7[0], r1
+; CHECK-NEXT:    vmov.u8 r1, q3[13]
+; CHECK-NEXT:    vmov.32 q7[1], r1
+; CHECK-NEXT:    vmov.u8 r1, q3[14]
+; CHECK-NEXT:    vmov.32 q7[2], r1
+; CHECK-NEXT:    vmov.u8 r1, q3[15]
+; CHECK-NEXT:    vmov.32 q7[3], r1
+; CHECK-NEXT:    vmovlb.s8 q0, q0
+; CHECK-NEXT:    vmovlb.s8 q7, q7
+; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    vmovlb.s16 q7, q7
+; CHECK-NEXT:    vmov.u16 r1, q4[0]
+; CHECK-NEXT:    vmul.i32 q0, q7, q0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vaddt.i32 q6, q6, q0
+; CHECK-NEXT:    vmov.32 q0[0], r1
+; CHECK-NEXT:    vmov.u16 r1, q4[1]
+; CHECK-NEXT:    vmov.32 q0[1], r1
+; CHECK-NEXT:    vmov.u16 r1, q4[2]
+; CHECK-NEXT:    vmov.32 q0[2], r1
+; CHECK-NEXT:    vmov.u16 r1, q4[3]
+; CHECK-NEXT:    vmov.32 q0[3], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[0]
+; CHECK-NEXT:    vcmp.i32 ne, q0, zr
+; CHECK-NEXT:    vmov.32 q0[0], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[1]
+; CHECK-NEXT:    vmov.32 q0[1], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[2]
+; CHECK-NEXT:    vmov.32 q0[2], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[3]
+; CHECK-NEXT:    vmov.32 q0[3], r1
+; CHECK-NEXT:    vmov.u8 r1, q3[0]
+; CHECK-NEXT:    vmov.32 q4[0], r1
+; CHECK-NEXT:    vmov.u8 r1, q3[1]
+; CHECK-NEXT:    vmov.32 q4[1], r1
+; CHECK-NEXT:    vmov.u8 r1, q3[2]
+; CHECK-NEXT:    vmov.32 q4[2], r1
+; CHECK-NEXT:    vmov.u8 r1, q3[3]
+; CHECK-NEXT:    vmov.32 q4[3], r1
 ; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    vmovlb.s8 q6, q6
+; CHECK-NEXT:    vmovlb.s8 q4, q4
 ; CHECK-NEXT:    vmovlb.s16 q0, q0
-; CHECK-NEXT:    vmovlb.s16 q6, q6
+; CHECK-NEXT:    vmov.u16 r1, q5[0]
+; CHECK-NEXT:    vmovlb.s16 q4, q4
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmult.i32 q5, q6, q0
-; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov.u16 r1, q7[0]
-; CHECK-NEXT:    vadd.i32 q6, q5, q0
+; CHECK-NEXT:    vmult.i32 q2, q4, q0
 ; CHECK-NEXT:    vmov.32 q0[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q7[1]
+; CHECK-NEXT:    vmov.u16 r1, q5[1]
 ; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q7[2]
+; CHECK-NEXT:    vmov.u16 r1, q5[2]
 ; CHECK-NEXT:    vmov.32 q0[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q7[3]
+; CHECK-NEXT:    vmov.u16 r1, q5[3]
 ; CHECK-NEXT:    vmov.32 q0[3], r1
 ; CHECK-NEXT:    vmov.u8 r1, q1[8]
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
 ; CHECK-NEXT:    vmov.32 q0[0], r1
 ; CHECK-NEXT:    vmov.u8 r1, q1[9]
-; CHECK-NEXT:    vldrw.u32 q7, [sp, #48] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.32 q0[1], r1
 ; CHECK-NEXT:    vmov.u8 r1, q1[10]
 ; CHECK-NEXT:    vmov.32 q0[2], r1
 ; CHECK-NEXT:    vmov.u8 r1, q1[11]
 ; CHECK-NEXT:    vmov.32 q0[3], r1
 ; CHECK-NEXT:    vmov.u8 r1, q3[8]
-; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    vmovlb.s16 q2, q0
-; CHECK-NEXT:    vmov.32 q0[0], r1
+; CHECK-NEXT:    vmov.32 q1[0], r1
 ; CHECK-NEXT:    vmov.u8 r1, q3[9]
-; CHECK-NEXT:    vmov.32 q0[1], r1
+; CHECK-NEXT:    vmov.32 q1[1], r1
 ; CHECK-NEXT:    vmov.u8 r1, q3[10]
-; CHECK-NEXT:    vmov.32 q0[2], r1
+; CHECK-NEXT:    vmov.32 q1[2], r1
 ; CHECK-NEXT:    vmov.u8 r1, q3[11]
-; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmov.u16 r1, q4[0]
+; CHECK-NEXT:    vmov.32 q1[3], r1
 ; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    vmovlb.s16 q5, q0
-; CHECK-NEXT:    vmov q0, q7
-; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmult.i32 q0, q5, q2
-; CHECK-NEXT:    vmov.32 q2[0], r1
-; CHECK-NEXT:    vmov.u16 r1, q4[1]
-; CHECK-NEXT:    vmov.32 q2[1], r1
-; CHECK-NEXT:    vmov.u16 r1, q4[2]
-; CHECK-NEXT:    vmov.32 q2[2], r1
-; CHECK-NEXT:    vmov.u16 r1, q4[3]
-; CHECK-NEXT:    vmov.32 q2[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[0]
-; CHECK-NEXT:    vcmp.i32 ne, q2, zr
-; CHECK-NEXT:    vmov.32 q2[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[1]
-; CHECK-NEXT:    vmov.32 q2[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[2]
-; CHECK-NEXT:    vmov.32 q2[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[3]
-; CHECK-NEXT:    vmov.32 q2[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[0]
-; CHECK-NEXT:    vmovlb.s8 q1, q2
-; CHECK-NEXT:    vmov.32 q2[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[1]
+; CHECK-NEXT:    vmovlb.s8 q1, q1
+; CHECK-NEXT:    vmovlb.s16 q0, q0
 ; CHECK-NEXT:    vmovlb.s16 q1, q1
-; CHECK-NEXT:    vmov.32 q2[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[2]
-; CHECK-NEXT:    vmov.32 q2[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q3[3]
-; CHECK-NEXT:    vmov.32 q2[3], r1
-; CHECK-NEXT:    vmov q3, q7
-; CHECK-NEXT:    vmovlb.s8 q2, q2
-; CHECK-NEXT:    vmovlb.s16 q2, q2
+; CHECK-NEXT:    vmul.i32 q0, q1, q0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmult.i32 q3, q2, q1
-; CHECK-NEXT:    vadd.i32 q0, q3, q0
-; CHECK-NEXT:    vadd.i32 q0, q0, q6
+; CHECK-NEXT:    vaddt.i32 q2, q2, q0
+; CHECK-NEXT:    vadd.i32 q0, q2, q6
 ; CHECK-NEXT:    vaddva.u32 r0, q0
-; CHECK-NEXT:    add sp, #64
+; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 entry:
@@ -3978,123 +3948,120 @@ entry:
 define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i16 %a) {
 ; CHECK-LABEL: add_v16i8_v16i16_acc_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vcmp.i8 eq, q2, zr
 ; CHECK-NEXT:    vmov.i8 q2, #0x0
 ; CHECK-NEXT:    vmov.i8 q3, #0xff
 ; CHECK-NEXT:    vpsel q2, q3, q2
-; CHECK-NEXT:    vmov.u8 r1, q2[8]
+; CHECK-NEXT:    vmov.u8 r1, q2[0]
 ; CHECK-NEXT:    vmov.16 q3[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[9]
+; CHECK-NEXT:    vmov.u8 r1, q2[1]
 ; CHECK-NEXT:    vmov.16 q3[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[10]
+; CHECK-NEXT:    vmov.u8 r1, q2[2]
 ; CHECK-NEXT:    vmov.16 q3[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[11]
+; CHECK-NEXT:    vmov.u8 r1, q2[3]
 ; CHECK-NEXT:    vmov.16 q3[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[12]
+; CHECK-NEXT:    vmov.u8 r1, q2[4]
 ; CHECK-NEXT:    vmov.16 q3[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[13]
+; CHECK-NEXT:    vmov.u8 r1, q2[5]
 ; CHECK-NEXT:    vmov.16 q3[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[14]
+; CHECK-NEXT:    vmov.u8 r1, q2[6]
 ; CHECK-NEXT:    vmov.16 q3[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[15]
+; CHECK-NEXT:    vmov.u8 r1, q2[7]
 ; CHECK-NEXT:    vmov.16 q3[7], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[8]
+; CHECK-NEXT:    vmov.u8 r1, q1[0]
 ; CHECK-NEXT:    vcmp.i16 ne, q3, zr
 ; CHECK-NEXT:    vmov.16 q3[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[9]
+; CHECK-NEXT:    vmov.u8 r1, q1[1]
 ; CHECK-NEXT:    vmov.16 q3[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[10]
+; CHECK-NEXT:    vmov.u8 r1, q1[2]
 ; CHECK-NEXT:    vmov.16 q3[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[11]
+; CHECK-NEXT:    vmov.u8 r1, q1[3]
 ; CHECK-NEXT:    vmov.16 q3[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[12]
+; CHECK-NEXT:    vmov.u8 r1, q1[4]
 ; CHECK-NEXT:    vmov.16 q3[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[13]
+; CHECK-NEXT:    vmov.u8 r1, q1[5]
 ; CHECK-NEXT:    vmov.16 q3[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[14]
+; CHECK-NEXT:    vmov.u8 r1, q1[6]
 ; CHECK-NEXT:    vmov.16 q3[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[15]
+; CHECK-NEXT:    vmov.u8 r1, q1[7]
 ; CHECK-NEXT:    vmov.16 q3[7], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[8]
-; CHECK-NEXT:    vmovlb.u8 q5, q3
+; CHECK-NEXT:    vmov.u8 r1, q0[0]
+; CHECK-NEXT:    vmovlb.u8 q4, q3
 ; CHECK-NEXT:    vmov.16 q3[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
+; CHECK-NEXT:    vmov.u8 r1, q0[1]
 ; CHECK-NEXT:    vmov.16 q3[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
+; CHECK-NEXT:    vmov.u8 r1, q0[2]
 ; CHECK-NEXT:    vmov.16 q3[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
+; CHECK-NEXT:    vmov.u8 r1, q0[3]
 ; CHECK-NEXT:    vmov.16 q3[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[12]
+; CHECK-NEXT:    vmov.u8 r1, q0[4]
 ; CHECK-NEXT:    vmov.16 q3[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[13]
+; CHECK-NEXT:    vmov.u8 r1, q0[5]
 ; CHECK-NEXT:    vmov.16 q3[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[14]
+; CHECK-NEXT:    vmov.u8 r1, q0[6]
 ; CHECK-NEXT:    vmov.16 q3[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
+; CHECK-NEXT:    vmov.u8 r1, q0[7]
 ; CHECK-NEXT:    vmov.16 q3[7], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[0]
-; CHECK-NEXT:    vmovlb.u8 q6, q3
+; CHECK-NEXT:    vmov.u8 r1, q2[8]
+; CHECK-NEXT:    vmovlb.u8 q5, q3
 ; CHECK-NEXT:    vmov.i32 q3, #0x0
-; CHECK-NEXT:    vmov q4, q3
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmult.i16 q4, q6, q5
-; CHECK-NEXT:    vmov.16 q5[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[1]
-; CHECK-NEXT:    vmov.16 q5[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[2]
-; CHECK-NEXT:    vmov.16 q5[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[3]
-; CHECK-NEXT:    vmov.16 q5[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[4]
-; CHECK-NEXT:    vmov.16 q5[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[5]
-; CHECK-NEXT:    vmov.16 q5[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[6]
-; CHECK-NEXT:    vmov.16 q5[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[7]
-; CHECK-NEXT:    vmov.16 q5[7], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[0]
-; CHECK-NEXT:    vmov.16 q2[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[1]
-; CHECK-NEXT:    vmov.16 q2[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[2]
-; CHECK-NEXT:    vmov.16 q2[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[3]
-; CHECK-NEXT:    vmov.16 q2[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[4]
-; CHECK-NEXT:    vmov.16 q2[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[5]
-; CHECK-NEXT:    vmov.16 q2[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[6]
-; CHECK-NEXT:    vmov.16 q2[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[7]
-; CHECK-NEXT:    vmov.16 q2[7], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    vmovlb.u8 q1, q2
+; CHECK-NEXT:    vmult.i16 q3, q5, q4
+; CHECK-NEXT:    vmov.16 q4[0], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[9]
+; CHECK-NEXT:    vmov.16 q4[1], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[10]
+; CHECK-NEXT:    vmov.16 q4[2], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[11]
+; CHECK-NEXT:    vmov.16 q4[3], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[12]
+; CHECK-NEXT:    vmov.16 q4[4], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[13]
+; CHECK-NEXT:    vmov.16 q4[5], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[14]
+; CHECK-NEXT:    vmov.16 q4[6], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[15]
+; CHECK-NEXT:    vmov.16 q4[7], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[8]
 ; CHECK-NEXT:    vmov.16 q2[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
+; CHECK-NEXT:    vmov.u8 r1, q1[9]
 ; CHECK-NEXT:    vmov.16 q2[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
+; CHECK-NEXT:    vmov.u8 r1, q1[10]
 ; CHECK-NEXT:    vmov.16 q2[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
+; CHECK-NEXT:    vmov.u8 r1, q1[11]
 ; CHECK-NEXT:    vmov.16 q2[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
+; CHECK-NEXT:    vmov.u8 r1, q1[12]
 ; CHECK-NEXT:    vmov.16 q2[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
+; CHECK-NEXT:    vmov.u8 r1, q1[13]
 ; CHECK-NEXT:    vmov.16 q2[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
+; CHECK-NEXT:    vmov.u8 r1, q1[14]
 ; CHECK-NEXT:    vmov.16 q2[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
+; CHECK-NEXT:    vmov.u8 r1, q1[15]
 ; CHECK-NEXT:    vmov.16 q2[7], r1
-; CHECK-NEXT:    vmovlb.u8 q0, q2
-; CHECK-NEXT:    vpt.i16 ne, q5, zr
-; CHECK-NEXT:    vmult.i16 q3, q0, q1
-; CHECK-NEXT:    vadd.i16 q0, q3, q4
-; CHECK-NEXT:    vaddva.u16 r0, q0
+; CHECK-NEXT:    vmov.u8 r1, q0[8]
+; CHECK-NEXT:    vmov.16 q1[0], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[9]
+; CHECK-NEXT:    vmov.16 q1[1], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[10]
+; CHECK-NEXT:    vmov.16 q1[2], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[11]
+; CHECK-NEXT:    vmov.16 q1[3], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[12]
+; CHECK-NEXT:    vmov.16 q1[4], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[13]
+; CHECK-NEXT:    vmov.16 q1[5], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[14]
+; CHECK-NEXT:    vmov.16 q1[6], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[15]
+; CHECK-NEXT:    vmov.16 q1[7], r1
+; CHECK-NEXT:    vmullb.u8 q0, q1, q2
+; CHECK-NEXT:    vpt.i16 ne, q4, zr
+; CHECK-NEXT:    vaddt.i16 q3, q3, q0
+; CHECK-NEXT:    vaddva.u16 r0, q3
 ; CHECK-NEXT:    uxth r0, r0
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
@@ -4110,123 +4077,120 @@ entry:
 define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i16 %a) {
 ; CHECK-LABEL: add_v16i8_v16i16_acc_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    vcmp.i8 eq, q2, zr
 ; CHECK-NEXT:    vmov.i8 q2, #0x0
 ; CHECK-NEXT:    vmov.i8 q3, #0xff
 ; CHECK-NEXT:    vpsel q2, q3, q2
-; CHECK-NEXT:    vmov.u8 r1, q2[8]
+; CHECK-NEXT:    vmov.u8 r1, q2[0]
 ; CHECK-NEXT:    vmov.16 q3[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[9]
+; CHECK-NEXT:    vmov.u8 r1, q2[1]
 ; CHECK-NEXT:    vmov.16 q3[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[10]
+; CHECK-NEXT:    vmov.u8 r1, q2[2]
 ; CHECK-NEXT:    vmov.16 q3[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[11]
+; CHECK-NEXT:    vmov.u8 r1, q2[3]
 ; CHECK-NEXT:    vmov.16 q3[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[12]
+; CHECK-NEXT:    vmov.u8 r1, q2[4]
 ; CHECK-NEXT:    vmov.16 q3[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[13]
+; CHECK-NEXT:    vmov.u8 r1, q2[5]
 ; CHECK-NEXT:    vmov.16 q3[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[14]
+; CHECK-NEXT:    vmov.u8 r1, q2[6]
 ; CHECK-NEXT:    vmov.16 q3[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[15]
+; CHECK-NEXT:    vmov.u8 r1, q2[7]
 ; CHECK-NEXT:    vmov.16 q3[7], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[8]
+; CHECK-NEXT:    vmov.u8 r1, q1[0]
 ; CHECK-NEXT:    vcmp.i16 ne, q3, zr
 ; CHECK-NEXT:    vmov.16 q3[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[9]
+; CHECK-NEXT:    vmov.u8 r1, q1[1]
 ; CHECK-NEXT:    vmov.16 q3[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[10]
+; CHECK-NEXT:    vmov.u8 r1, q1[2]
 ; CHECK-NEXT:    vmov.16 q3[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[11]
+; CHECK-NEXT:    vmov.u8 r1, q1[3]
 ; CHECK-NEXT:    vmov.16 q3[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[12]
+; CHECK-NEXT:    vmov.u8 r1, q1[4]
 ; CHECK-NEXT:    vmov.16 q3[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[13]
+; CHECK-NEXT:    vmov.u8 r1, q1[5]
 ; CHECK-NEXT:    vmov.16 q3[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[14]
+; CHECK-NEXT:    vmov.u8 r1, q1[6]
 ; CHECK-NEXT:    vmov.16 q3[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[15]
+; CHECK-NEXT:    vmov.u8 r1, q1[7]
 ; CHECK-NEXT:    vmov.16 q3[7], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[8]
-; CHECK-NEXT:    vmovlb.s8 q5, q3
+; CHECK-NEXT:    vmov.u8 r1, q0[0]
+; CHECK-NEXT:    vmovlb.s8 q4, q3
 ; CHECK-NEXT:    vmov.16 q3[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
+; CHECK-NEXT:    vmov.u8 r1, q0[1]
 ; CHECK-NEXT:    vmov.16 q3[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
+; CHECK-NEXT:    vmov.u8 r1, q0[2]
 ; CHECK-NEXT:    vmov.16 q3[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
+; CHECK-NEXT:    vmov.u8 r1, q0[3]
 ; CHECK-NEXT:    vmov.16 q3[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[12]
+; CHECK-NEXT:    vmov.u8 r1, q0[4]
 ; CHECK-NEXT:    vmov.16 q3[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[13]
+; CHECK-NEXT:    vmov.u8 r1, q0[5]
 ; CHECK-NEXT:    vmov.16 q3[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[14]
+; CHECK-NEXT:    vmov.u8 r1, q0[6]
 ; CHECK-NEXT:    vmov.16 q3[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
+; CHECK-NEXT:    vmov.u8 r1, q0[7]
 ; CHECK-NEXT:    vmov.16 q3[7], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[0]
-; CHECK-NEXT:    vmovlb.s8 q6, q3
+; CHECK-NEXT:    vmov.u8 r1, q2[8]
+; CHECK-NEXT:    vmovlb.s8 q5, q3
 ; CHECK-NEXT:    vmov.i32 q3, #0x0
-; CHECK-NEXT:    vmov q4, q3
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmult.i16 q4, q6, q5
-; CHECK-NEXT:    vmov.16 q5[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[1]
-; CHECK-NEXT:    vmov.16 q5[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[2]
-; CHECK-NEXT:    vmov.16 q5[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[3]
-; CHECK-NEXT:    vmov.16 q5[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[4]
-; CHECK-NEXT:    vmov.16 q5[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[5]
-; CHECK-NEXT:    vmov.16 q5[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[6]
-; CHECK-NEXT:    vmov.16 q5[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q2[7]
-; CHECK-NEXT:    vmov.16 q5[7], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[0]
-; CHECK-NEXT:    vmov.16 q2[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[1]
-; CHECK-NEXT:    vmov.16 q2[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[2]
-; CHECK-NEXT:    vmov.16 q2[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[3]
-; CHECK-NEXT:    vmov.16 q2[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[4]
-; CHECK-NEXT:    vmov.16 q2[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[5]
-; CHECK-NEXT:    vmov.16 q2[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[6]
-; CHECK-NEXT:    vmov.16 q2[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q1[7]
-; CHECK-NEXT:    vmov.16 q2[7], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    vmovlb.s8 q1, q2
+; CHECK-NEXT:    vmult.i16 q3, q5, q4
+; CHECK-NEXT:    vmov.16 q4[0], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[9]
+; CHECK-NEXT:    vmov.16 q4[1], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[10]
+; CHECK-NEXT:    vmov.16 q4[2], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[11]
+; CHECK-NEXT:    vmov.16 q4[3], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[12]
+; CHECK-NEXT:    vmov.16 q4[4], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[13]
+; CHECK-NEXT:    vmov.16 q4[5], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[14]
+; CHECK-NEXT:    vmov.16 q4[6], r1
+; CHECK-NEXT:    vmov.u8 r1, q2[15]
+; CHECK-NEXT:    vmov.16 q4[7], r1
+; CHECK-NEXT:    vmov.u8 r1, q1[8]
 ; CHECK-NEXT:    vmov.16 q2[0], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
+; CHECK-NEXT:    vmov.u8 r1, q1[9]
 ; CHECK-NEXT:    vmov.16 q2[1], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
+; CHECK-NEXT:    vmov.u8 r1, q1[10]
 ; CHECK-NEXT:    vmov.16 q2[2], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
+; CHECK-NEXT:    vmov.u8 r1, q1[11]
 ; CHECK-NEXT:    vmov.16 q2[3], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
+; CHECK-NEXT:    vmov.u8 r1, q1[12]
 ; CHECK-NEXT:    vmov.16 q2[4], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
+; CHECK-NEXT:    vmov.u8 r1, q1[13]
 ; CHECK-NEXT:    vmov.16 q2[5], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
+; CHECK-NEXT:    vmov.u8 r1, q1[14]
 ; CHECK-NEXT:    vmov.16 q2[6], r1
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
+; CHECK-NEXT:    vmov.u8 r1, q1[15]
 ; CHECK-NEXT:    vmov.16 q2[7], r1
-; CHECK-NEXT:    vmovlb.s8 q0, q2
-; CHECK-NEXT:    vpt.i16 ne, q5, zr
-; CHECK-NEXT:    vmult.i16 q3, q0, q1
-; CHECK-NEXT:    vadd.i16 q0, q3, q4
-; CHECK-NEXT:    vaddva.u16 r0, q0
+; CHECK-NEXT:    vmov.u8 r1, q0[8]
+; CHECK-NEXT:    vmov.16 q1[0], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[9]
+; CHECK-NEXT:    vmov.16 q1[1], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[10]
+; CHECK-NEXT:    vmov.16 q1[2], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[11]
+; CHECK-NEXT:    vmov.16 q1[3], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[12]
+; CHECK-NEXT:    vmov.16 q1[4], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[13]
+; CHECK-NEXT:    vmov.16 q1[5], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[14]
+; CHECK-NEXT:    vmov.16 q1[6], r1
+; CHECK-NEXT:    vmov.u8 r1, q0[15]
+; CHECK-NEXT:    vmov.16 q1[7], r1
+; CHECK-NEXT:    vmullb.s8 q0, q1, q2
+; CHECK-NEXT:    vpt.i16 ne, q4, zr
+; CHECK-NEXT:    vaddt.i16 q3, q3, q0
+; CHECK-NEXT:    vaddva.u16 r0, q3
 ; CHECK-NEXT:    sxth r0, r0
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer

diff  --git a/llvm/utils/TableGen/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/CodeGenDAGPatterns.cpp
index 6fdc116721f3..d8588c4dc6ed 100644
--- a/llvm/utils/TableGen/CodeGenDAGPatterns.cpp
+++ b/llvm/utils/TableGen/CodeGenDAGPatterns.cpp
@@ -3589,6 +3589,9 @@ static bool hasNullFragReference(DagInit *DI) {
   if (Operator->getName() == "null_frag") return true;
   // If any of the arguments reference the null fragment, return true.
   for (unsigned i = 0, e = DI->getNumArgs(); i != e; ++i) {
+    if (auto Arg = dyn_cast<DefInit>(DI->getArg(i)))
+      if (Arg->getDef()->getName() == "null_frag")
+        return true;
     DagInit *Arg = dyn_cast<DagInit>(DI->getArg(i));
     if (Arg && hasNullFragReference(Arg))
       return true;


        


More information about the llvm-commits mailing list