[llvm] deb72ce - [ARM] Better reductions

David Green via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 29 08:04:29 PDT 2020


Author: David Green
Date: 2020-06-29T16:04:13+01:00
New Revision: deb72ce29860f61fe91ddcf97e89abfc9544cf42

URL: https://github.com/llvm/llvm-project/commit/deb72ce29860f61fe91ddcf97e89abfc9544cf42
DIFF: https://github.com/llvm/llvm-project/commit/deb72ce29860f61fe91ddcf97e89abfc9544cf42.diff

LOG: [ARM] Better reductions

MVE has native reductions for integer add and min/max. The others need
to be expanded to a series of extracts and scalar operations to reduce
the vector into a single scalar. The default codegen for that expands
the reduction into a series of in-order operations.

This modifies that to something more suitable for MVE. The basic idea is
to use vector operations until there are 4 remaining items, then switch
to pairwise operations. For example, a v8f16 fadd reduction would become:
Y = VREV X
Z = ADD(X, Y)
z0 = Z[0] + Z[1]
z1 = Z[2] + Z[3]
return z0 + z1

The awkwardness (there is always some) comes in from something like a
v4f16, which is first legalized by adding identity values to the extra
lanes of the reduction, and which then cannot be optimized away through
the vrev/fadd combo — the inserts remain. I've made sure these custom
lower so that we can produce the pairwise additions before the extra
values are added.

Differential Revision: https://reviews.llvm.org/D81397

Added: 
    

Modified: 
    llvm/lib/Target/ARM/ARMISelLowering.cpp
    llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 9c4cd10a9a13..56551a62f797 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -296,6 +296,10 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
     setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal);
     setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal);
     setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal);
+    setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
+    setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
+    setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+    setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
 
     if (!HasMVEFP) {
       setOperationAction(ISD::SINT_TO_FP, VT, Expand);
@@ -345,6 +349,10 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
       setOperationAction(ISD::FMINNUM, VT, Legal);
       setOperationAction(ISD::FMAXNUM, VT, Legal);
       setOperationAction(ISD::FROUND, VT, Legal);
+      setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
 
       // No native support for these.
       setOperationAction(ISD::FDIV, VT, Expand);
@@ -362,6 +370,17 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
     }
   }
 
+  // Custom Expand smaller than legal vector reductions to prevent false zero
+  // items being added.
+  setOperationAction(ISD::VECREDUCE_FADD, MVT::v4f16, Custom);
+  setOperationAction(ISD::VECREDUCE_FMUL, MVT::v4f16, Custom);
+  setOperationAction(ISD::VECREDUCE_FMIN, MVT::v4f16, Custom);
+  setOperationAction(ISD::VECREDUCE_FMAX, MVT::v4f16, Custom);
+  setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom);
+  setOperationAction(ISD::VECREDUCE_FMUL, MVT::v2f16, Custom);
+  setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom);
+  setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom);
+
   // We 'support' these types up to bitcast/load/store level, regardless of
   // MVE integer-only / float support. Only doing FP data processing on the FP
   // vector types is inhibited at integer-only level.
@@ -9498,6 +9517,79 @@ static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
   return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
 }
 
+static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG,
+                              const ARMSubtarget *ST) {
+  if (!ST->hasMVEIntegerOps())
+    return SDValue();
+
+  SDLoc dl(Op);
+  unsigned BaseOpcode = 0;
+  switch (Op->getOpcode()) {
+  default: llvm_unreachable("Expected VECREDUCE opcode");
+  case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
+  case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
+  case ISD::VECREDUCE_MUL:  BaseOpcode = ISD::MUL; break;
+  case ISD::VECREDUCE_AND:  BaseOpcode = ISD::AND; break;
+  case ISD::VECREDUCE_OR:   BaseOpcode = ISD::OR; break;
+  case ISD::VECREDUCE_XOR:  BaseOpcode = ISD::XOR; break;
+  case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
+  case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
+  }
+
+  SDValue Op0 = Op->getOperand(0);
+  EVT VT = Op0.getValueType();
+  EVT EltVT = VT.getVectorElementType();
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumActiveLanes = NumElts;
+
+  assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
+          NumActiveLanes == 2) &&
+         "Only expected a power 2 vector size");
+
+  // Use Mul(X, Rev(X)) until 4 items remain. Going down to 4 vector elements
+  // allows us to easily extract vector elements from the lanes.
+  while (NumActiveLanes > 4) {
+    unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32;
+    SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0);
+    Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev);
+    NumActiveLanes /= 2;
+  }
+
+  SDValue Res;
+  if (NumActiveLanes == 4) {
+    // The remaining 4 elements are summed sequentially
+    SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+                              DAG.getConstant(0 * NumElts / 4, dl, MVT::i32));
+    SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+                              DAG.getConstant(1 * NumElts / 4, dl, MVT::i32));
+    SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+                              DAG.getConstant(2 * NumElts / 4, dl, MVT::i32));
+    SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+                              DAG.getConstant(3 * NumElts / 4, dl, MVT::i32));
+    SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
+    SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags());
+    Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags());
+  } else {
+    SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+                              DAG.getConstant(0, dl, MVT::i32));
+    SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+                              DAG.getConstant(1, dl, MVT::i32));
+    Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
+  }
+
+  // Result type may be wider than element type.
+  if (EltVT != Op->getValueType(0))
+    Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res);
+  return Res;
+}
+
+static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG,
+                               const ARMSubtarget *ST) {
+  if (!ST->hasMVEFloatOps())
+    return SDValue();
+  return LowerVecReduce(Op, DAG, ST);
+}
+
 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
   if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
     // Acquire/Release load/store is not legal for targets without a dmb or
@@ -9702,6 +9794,16 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return LowerSTORE(Op, DAG, Subtarget);
   case ISD::MLOAD:
     return LowerMLOAD(Op, DAG);
+  case ISD::VECREDUCE_MUL:
+  case ISD::VECREDUCE_AND:
+  case ISD::VECREDUCE_OR:
+  case ISD::VECREDUCE_XOR:
+    return LowerVecReduce(Op, DAG, Subtarget);
+  case ISD::VECREDUCE_FADD:
+  case ISD::VECREDUCE_FMUL:
+  case ISD::VECREDUCE_FMIN:
+  case ISD::VECREDUCE_FMAX:
+    return LowerVecReduceF(Op, DAG, Subtarget);
   case ISD::ATOMIC_LOAD:
   case ISD::ATOMIC_STORE:  return LowerAtomicLoadStore(Op, DAG);
   case ISD::FSINCOS:       return LowerFSINCOS(Op, DAG);

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll
index 650db38d0089..fc06181978b7 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-bit.ll
@@ -16,12 +16,12 @@ entry:
 define arm_aapcs_vfpcc i32 @and_v4i32(<4 x i32> %x) {
 ; CHECK-LABEL: and_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    ands r0, r1
+; CHECK-NEXT:    vmov r0, s3
 ; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    ands r1, r2
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -33,12 +33,12 @@ define arm_aapcs_vfpcc i32 @and_v8i32(<8 x i32> %x) {
 ; CHECK-LABEL: and_v8i32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    ands r0, r1
+; CHECK-NEXT:    vmov r0, s3
 ; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    ands r1, r2
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -49,12 +49,12 @@ entry:
 define arm_aapcs_vfpcc i16 @and_v4i16(<4 x i16> %x) {
 ; CHECK-LABEL: and_v4i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    ands r0, r1
+; CHECK-NEXT:    vmov r0, s3
 ; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    ands r1, r2
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -65,20 +65,14 @@ entry:
 define arm_aapcs_vfpcc i16 @and_v8i16(<8 x i16> %x) {
 ; CHECK-LABEL: and_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    ands r0, r1
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
 ; CHECK-NEXT:    vmov.u16 r1, q0[4]
 ; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    vmov.u16 r1, q0[2]
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    ands r1, r2
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -90,20 +84,14 @@ define arm_aapcs_vfpcc i16 @and_v16i16(<16 x i16> %x) {
 ; CHECK-LABEL: and_v16i16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    ands r0, r1
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
 ; CHECK-NEXT:    vmov.u16 r1, q0[4]
 ; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    vmov.u16 r1, q0[2]
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    ands r1, r2
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -114,20 +102,14 @@ entry:
 define arm_aapcs_vfpcc i8 @and_v8i8(<8 x i8> %x) {
 ; CHECK-LABEL: and_v8i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    ands r0, r1
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
 ; CHECK-NEXT:    vmov.u16 r1, q0[4]
 ; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    vmov.u16 r1, q0[2]
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    ands r1, r2
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -138,36 +120,16 @@ entry:
 define arm_aapcs_vfpcc i8 @and_v16i8(<16 x i8> %x) {
 ; CHECK-LABEL: and_v16i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u8 r0, q0[1]
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    ands r0, r1
+; CHECK-NEXT:    vrev16.8 q1, q0
+; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vrev32.8 q1, q0
+; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.u8 r0, q0[12]
 ; CHECK-NEXT:    vmov.u8 r1, q0[8]
 ; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[12]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[13]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[14]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
+; CHECK-NEXT:    vmov.u8 r1, q0[4]
+; CHECK-NEXT:    vmov.u8 r2, q0[0]
+; CHECK-NEXT:    ands r1, r2
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -179,36 +141,16 @@ define arm_aapcs_vfpcc i8 @and_v32i8(<32 x i8> %x) {
 ; CHECK-LABEL: and_v32i8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov.u8 r0, q0[1]
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    ands r0, r1
+; CHECK-NEXT:    vrev16.8 q1, q0
+; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vrev32.8 q1, q0
+; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.u8 r0, q0[12]
 ; CHECK-NEXT:    vmov.u8 r1, q0[8]
 ; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[12]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[13]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[14]
-; CHECK-NEXT:    ands r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
+; CHECK-NEXT:    vmov.u8 r1, q0[4]
+; CHECK-NEXT:    vmov.u8 r2, q0[0]
+; CHECK-NEXT:    ands r1, r2
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -273,12 +215,12 @@ entry:
 define arm_aapcs_vfpcc i32 @and_v4i32_acc(<4 x i32> %x, i32 %y) {
 ; CHECK-LABEL: and_v4i32_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    ands r1, r2
+; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    ands r2, r3
 ; CHECK-NEXT:    ands r1, r2
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
@@ -292,12 +234,12 @@ define arm_aapcs_vfpcc i32 @and_v8i32_acc(<8 x i32> %x, i32 %y) {
 ; CHECK-LABEL: and_v8i32_acc:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    ands r1, r2
+; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    ands r2, r3
 ; CHECK-NEXT:    ands r1, r2
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
@@ -310,12 +252,12 @@ entry:
 define arm_aapcs_vfpcc i16 @and_v4i16_acc(<4 x i16> %x, i16 %y) {
 ; CHECK-LABEL: and_v4i16_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    ands r1, r2
+; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    ands r2, r3
 ; CHECK-NEXT:    ands r1, r2
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
@@ -328,20 +270,14 @@ entry:
 define arm_aapcs_vfpcc i16 @and_v8i16_acc(<8 x i16> %x, i16 %y) {
 ; CHECK-LABEL: and_v8i16_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov.u16 r2, q0[0]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[3]
-; CHECK-NEXT:    ands r1, r2
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.u16 r1, q0[6]
 ; CHECK-NEXT:    vmov.u16 r2, q0[4]
 ; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[5]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[6]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    vmov.u16 r3, q0[0]
+; CHECK-NEXT:    ands r2, r3
 ; CHECK-NEXT:    ands r1, r2
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
@@ -355,20 +291,14 @@ define arm_aapcs_vfpcc i16 @and_v16i16_acc(<16 x i16> %x, i16 %y) {
 ; CHECK-LABEL: and_v16i16_acc:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov.u16 r2, q0[0]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[3]
-; CHECK-NEXT:    ands r1, r2
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.u16 r1, q0[6]
 ; CHECK-NEXT:    vmov.u16 r2, q0[4]
 ; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[5]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[6]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    vmov.u16 r3, q0[0]
+; CHECK-NEXT:    ands r2, r3
 ; CHECK-NEXT:    ands r1, r2
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
@@ -381,20 +311,14 @@ entry:
 define arm_aapcs_vfpcc i8 @and_v8i8_acc(<8 x i8> %x, i8 %y) {
 ; CHECK-LABEL: and_v8i8_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov.u16 r2, q0[0]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[3]
-; CHECK-NEXT:    ands r1, r2
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.u16 r1, q0[6]
 ; CHECK-NEXT:    vmov.u16 r2, q0[4]
 ; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[5]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[6]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    vmov.u16 r3, q0[0]
+; CHECK-NEXT:    ands r2, r3
 ; CHECK-NEXT:    ands r1, r2
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
@@ -407,36 +331,16 @@ entry:
 define arm_aapcs_vfpcc i8 @and_v16i8_acc(<16 x i8> %x, i8 %y) {
 ; CHECK-LABEL: and_v16i8_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
-; CHECK-NEXT:    vmov.u8 r2, q0[0]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[2]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[3]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[4]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[5]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[6]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[7]
-; CHECK-NEXT:    ands r1, r2
+; CHECK-NEXT:    vrev16.8 q1, q0
+; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vrev32.8 q1, q0
+; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.u8 r1, q0[12]
 ; CHECK-NEXT:    vmov.u8 r2, q0[8]
 ; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[9]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[10]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[11]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[12]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[13]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[14]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[15]
+; CHECK-NEXT:    vmov.u8 r2, q0[4]
+; CHECK-NEXT:    vmov.u8 r3, q0[0]
+; CHECK-NEXT:    ands r2, r3
 ; CHECK-NEXT:    ands r1, r2
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
@@ -450,36 +354,16 @@ define arm_aapcs_vfpcc i8 @and_v32i8_acc(<32 x i8> %x, i8 %y) {
 ; CHECK-LABEL: and_v32i8_acc:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
-; CHECK-NEXT:    vmov.u8 r2, q0[0]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[2]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[3]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[4]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[5]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[6]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[7]
-; CHECK-NEXT:    ands r1, r2
+; CHECK-NEXT:    vrev16.8 q1, q0
+; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vrev32.8 q1, q0
+; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vmov.u8 r1, q0[12]
 ; CHECK-NEXT:    vmov.u8 r2, q0[8]
 ; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[9]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[10]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[11]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[12]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[13]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[14]
-; CHECK-NEXT:    ands r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[15]
+; CHECK-NEXT:    vmov.u8 r2, q0[4]
+; CHECK-NEXT:    vmov.u8 r3, q0[0]
+; CHECK-NEXT:    ands r2, r3
 ; CHECK-NEXT:    ands r1, r2
 ; CHECK-NEXT:    ands r0, r1
 ; CHECK-NEXT:    bx lr
@@ -553,12 +437,12 @@ entry:
 define arm_aapcs_vfpcc i32 @or_v4i32(<4 x i32> %x) {
 ; CHECK-LABEL: or_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vmov r0, s3
 ; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -570,12 +454,12 @@ define arm_aapcs_vfpcc i32 @or_v8i32(<8 x i32> %x) {
 ; CHECK-LABEL: or_v8i32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vorr q0, q0, q1
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vmov r0, s3
 ; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -586,12 +470,12 @@ entry:
 define arm_aapcs_vfpcc i16 @or_v4i16(<4 x i16> %x) {
 ; CHECK-LABEL: or_v4i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vmov r0, s3
 ; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -602,20 +486,14 @@ entry:
 define arm_aapcs_vfpcc i16 @or_v8i16(<8 x i16> %x) {
 ; CHECK-LABEL: or_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    vorr q0, q0, q1
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
 ; CHECK-NEXT:    vmov.u16 r1, q0[4]
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    vmov.u16 r1, q0[2]
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -627,20 +505,14 @@ define arm_aapcs_vfpcc i16 @or_v16i16(<16 x i16> %x) {
 ; CHECK-LABEL: or_v16i16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vorr q0, q0, q1
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    vorr q0, q0, q1
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
 ; CHECK-NEXT:    vmov.u16 r1, q0[4]
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    vmov.u16 r1, q0[2]
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -651,20 +523,14 @@ entry:
 define arm_aapcs_vfpcc i8 @or_v8i8(<8 x i8> %x) {
 ; CHECK-LABEL: or_v8i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    vorr q0, q0, q1
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
 ; CHECK-NEXT:    vmov.u16 r1, q0[4]
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    vmov.u16 r1, q0[2]
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -675,36 +541,16 @@ entry:
 define arm_aapcs_vfpcc i8 @or_v16i8(<16 x i8> %x) {
 ; CHECK-LABEL: or_v16i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u8 r0, q0[1]
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vrev16.8 q1, q0
+; CHECK-NEXT:    vorr q0, q0, q1
+; CHECK-NEXT:    vrev32.8 q1, q0
+; CHECK-NEXT:    vorr q0, q0, q1
+; CHECK-NEXT:    vmov.u8 r0, q0[12]
 ; CHECK-NEXT:    vmov.u8 r1, q0[8]
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[12]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[13]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[14]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
+; CHECK-NEXT:    vmov.u8 r1, q0[4]
+; CHECK-NEXT:    vmov.u8 r2, q0[0]
+; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -716,36 +562,16 @@ define arm_aapcs_vfpcc i8 @or_v32i8(<32 x i8> %x) {
 ; CHECK-LABEL: or_v32i8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vorr q0, q0, q1
-; CHECK-NEXT:    vmov.u8 r0, q0[1]
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    orrs r0, r1
+; CHECK-NEXT:    vrev16.8 q1, q0
+; CHECK-NEXT:    vorr q0, q0, q1
+; CHECK-NEXT:    vrev32.8 q1, q0
+; CHECK-NEXT:    vorr q0, q0, q1
+; CHECK-NEXT:    vmov.u8 r0, q0[12]
 ; CHECK-NEXT:    vmov.u8 r1, q0[8]
 ; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[12]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[13]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[14]
-; CHECK-NEXT:    orrs r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
+; CHECK-NEXT:    vmov.u8 r1, q0[4]
+; CHECK-NEXT:    vmov.u8 r2, q0[0]
+; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -810,12 +636,12 @@ entry:
 define arm_aapcs_vfpcc i32 @or_v4i32_acc(<4 x i32> %x, i32 %y) {
 ; CHECK-LABEL: or_v4i32_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    orrs r2, r3
 ; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
@@ -829,12 +655,12 @@ define arm_aapcs_vfpcc i32 @or_v8i32_acc(<8 x i32> %x, i32 %y) {
 ; CHECK-LABEL: or_v8i32_acc:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vorr q0, q0, q1
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    orrs r2, r3
 ; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
@@ -847,12 +673,12 @@ entry:
 define arm_aapcs_vfpcc i16 @or_v4i16_acc(<4 x i16> %x, i16 %y) {
 ; CHECK-LABEL: or_v4i16_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    orrs r2, r3
 ; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
@@ -865,20 +691,14 @@ entry:
 define arm_aapcs_vfpcc i16 @or_v8i16_acc(<8 x i16> %x, i16 %y) {
 ; CHECK-LABEL: or_v8i16_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov.u16 r2, q0[0]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[3]
-; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    vorr q0, q0, q1
+; CHECK-NEXT:    vmov.u16 r1, q0[6]
 ; CHECK-NEXT:    vmov.u16 r2, q0[4]
 ; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[5]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[6]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    vmov.u16 r3, q0[0]
+; CHECK-NEXT:    orrs r2, r3
 ; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
@@ -892,20 +712,14 @@ define arm_aapcs_vfpcc i16 @or_v16i16_acc(<16 x i16> %x, i16 %y) {
 ; CHECK-LABEL: or_v16i16_acc:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vorr q0, q0, q1
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov.u16 r2, q0[0]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[3]
-; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    vorr q0, q0, q1
+; CHECK-NEXT:    vmov.u16 r1, q0[6]
 ; CHECK-NEXT:    vmov.u16 r2, q0[4]
 ; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[5]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[6]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    vmov.u16 r3, q0[0]
+; CHECK-NEXT:    orrs r2, r3
 ; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
@@ -918,20 +732,14 @@ entry:
 define arm_aapcs_vfpcc i8 @or_v8i8_acc(<8 x i8> %x, i8 %y) {
 ; CHECK-LABEL: or_v8i8_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov.u16 r2, q0[0]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[3]
-; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    vorr q0, q0, q1
+; CHECK-NEXT:    vmov.u16 r1, q0[6]
 ; CHECK-NEXT:    vmov.u16 r2, q0[4]
 ; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[5]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[6]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    vmov.u16 r3, q0[0]
+; CHECK-NEXT:    orrs r2, r3
 ; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
@@ -944,36 +752,16 @@ entry:
 define arm_aapcs_vfpcc i8 @or_v16i8_acc(<16 x i8> %x, i8 %y) {
 ; CHECK-LABEL: or_v16i8_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
-; CHECK-NEXT:    vmov.u8 r2, q0[0]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[2]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[3]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[4]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[5]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[6]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[7]
-; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    vrev16.8 q1, q0
+; CHECK-NEXT:    vorr q0, q0, q1
+; CHECK-NEXT:    vrev32.8 q1, q0
+; CHECK-NEXT:    vorr q0, q0, q1
+; CHECK-NEXT:    vmov.u8 r1, q0[12]
 ; CHECK-NEXT:    vmov.u8 r2, q0[8]
 ; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[9]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[10]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[11]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[12]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[13]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[14]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[15]
+; CHECK-NEXT:    vmov.u8 r2, q0[4]
+; CHECK-NEXT:    vmov.u8 r3, q0[0]
+; CHECK-NEXT:    orrs r2, r3
 ; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
@@ -987,36 +775,16 @@ define arm_aapcs_vfpcc i8 @or_v32i8_acc(<32 x i8> %x, i8 %y) {
 ; CHECK-LABEL: or_v32i8_acc:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vorr q0, q0, q1
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
-; CHECK-NEXT:    vmov.u8 r2, q0[0]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[2]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[3]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[4]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[5]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[6]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[7]
-; CHECK-NEXT:    orrs r1, r2
+; CHECK-NEXT:    vrev16.8 q1, q0
+; CHECK-NEXT:    vorr q0, q0, q1
+; CHECK-NEXT:    vrev32.8 q1, q0
+; CHECK-NEXT:    vorr q0, q0, q1
+; CHECK-NEXT:    vmov.u8 r1, q0[12]
 ; CHECK-NEXT:    vmov.u8 r2, q0[8]
 ; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[9]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[10]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[11]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[12]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[13]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[14]
-; CHECK-NEXT:    orrs r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[15]
+; CHECK-NEXT:    vmov.u8 r2, q0[4]
+; CHECK-NEXT:    vmov.u8 r3, q0[0]
+; CHECK-NEXT:    orrs r2, r3
 ; CHECK-NEXT:    orrs r1, r2
 ; CHECK-NEXT:    orrs r0, r1
 ; CHECK-NEXT:    bx lr
@@ -1090,12 +858,12 @@ entry:
 define arm_aapcs_vfpcc i32 @xor_v4i32(<4 x i32> %x) {
 ; CHECK-LABEL: xor_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    eors r0, r1
+; CHECK-NEXT:    vmov r0, s3
 ; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    eors r1, r2
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1107,12 +875,12 @@ define arm_aapcs_vfpcc i32 @xor_v8i32(<8 x i32> %x) {
 ; CHECK-LABEL: xor_v8i32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    veor q0, q0, q1
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    eors r0, r1
+; CHECK-NEXT:    vmov r0, s3
 ; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    eors r1, r2
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1123,12 +891,12 @@ entry:
 define arm_aapcs_vfpcc i16 @xor_v4i16(<4 x i16> %x) {
 ; CHECK-LABEL: xor_v4i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    eors r0, r1
+; CHECK-NEXT:    vmov r0, s3
 ; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    eors r1, r2
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1139,20 +907,14 @@ entry:
 define arm_aapcs_vfpcc i16 @xor_v8i16(<8 x i16> %x) {
 ; CHECK-LABEL: xor_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    eors r0, r1
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    veor q0, q0, q1
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
 ; CHECK-NEXT:    vmov.u16 r1, q0[4]
 ; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    vmov.u16 r1, q0[2]
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    eors r1, r2
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1164,20 +926,14 @@ define arm_aapcs_vfpcc i16 @xor_v16i16(<16 x i16> %x) {
 ; CHECK-LABEL: xor_v16i16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    veor q0, q0, q1
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    eors r0, r1
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    veor q0, q0, q1
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
 ; CHECK-NEXT:    vmov.u16 r1, q0[4]
 ; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    vmov.u16 r1, q0[2]
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    eors r1, r2
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1188,20 +944,14 @@ entry:
 define arm_aapcs_vfpcc i8 @xor_v8i8(<8 x i8> %x) {
 ; CHECK-LABEL: xor_v8i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    eors r0, r1
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    veor q0, q0, q1
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
 ; CHECK-NEXT:    vmov.u16 r1, q0[4]
 ; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    vmov.u16 r1, q0[2]
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    eors r1, r2
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1212,36 +962,16 @@ entry:
 define arm_aapcs_vfpcc i8 @xor_v16i8(<16 x i8> %x) {
 ; CHECK-LABEL: xor_v16i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u8 r0, q0[1]
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    eors r0, r1
+; CHECK-NEXT:    vrev16.8 q1, q0
+; CHECK-NEXT:    veor q0, q0, q1
+; CHECK-NEXT:    vrev32.8 q1, q0
+; CHECK-NEXT:    veor q0, q0, q1
+; CHECK-NEXT:    vmov.u8 r0, q0[12]
 ; CHECK-NEXT:    vmov.u8 r1, q0[8]
 ; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[12]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[13]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[14]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
+; CHECK-NEXT:    vmov.u8 r1, q0[4]
+; CHECK-NEXT:    vmov.u8 r2, q0[0]
+; CHECK-NEXT:    eors r1, r2
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1253,36 +983,16 @@ define arm_aapcs_vfpcc i8 @xor_v32i8(<32 x i8> %x) {
 ; CHECK-LABEL: xor_v32i8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    veor q0, q0, q1
-; CHECK-NEXT:    vmov.u8 r0, q0[1]
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    eors r0, r1
+; CHECK-NEXT:    vrev16.8 q1, q0
+; CHECK-NEXT:    veor q0, q0, q1
+; CHECK-NEXT:    vrev32.8 q1, q0
+; CHECK-NEXT:    veor q0, q0, q1
+; CHECK-NEXT:    vmov.u8 r0, q0[12]
 ; CHECK-NEXT:    vmov.u8 r1, q0[8]
 ; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[12]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[13]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[14]
-; CHECK-NEXT:    eors r0, r1
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
+; CHECK-NEXT:    vmov.u8 r1, q0[4]
+; CHECK-NEXT:    vmov.u8 r2, q0[0]
+; CHECK-NEXT:    eors r1, r2
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1347,12 +1057,12 @@ entry:
 define arm_aapcs_vfpcc i32 @xor_v4i32_acc(<4 x i32> %x, i32 %y) {
 ; CHECK-LABEL: xor_v4i32_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    eors r1, r2
+; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    eors r2, r3
 ; CHECK-NEXT:    eors r1, r2
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
@@ -1366,12 +1076,12 @@ define arm_aapcs_vfpcc i32 @xor_v8i32_acc(<8 x i32> %x, i32 %y) {
 ; CHECK-LABEL: xor_v8i32_acc:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    veor q0, q0, q1
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    eors r1, r2
+; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    eors r2, r3
 ; CHECK-NEXT:    eors r1, r2
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
@@ -1384,12 +1094,12 @@ entry:
 define arm_aapcs_vfpcc i16 @xor_v4i16_acc(<4 x i16> %x, i16 %y) {
 ; CHECK-LABEL: xor_v4i16_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    eors r1, r2
+; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    eors r2, r3
 ; CHECK-NEXT:    eors r1, r2
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
@@ -1402,20 +1112,14 @@ entry:
 define arm_aapcs_vfpcc i16 @xor_v8i16_acc(<8 x i16> %x, i16 %y) {
 ; CHECK-LABEL: xor_v8i16_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov.u16 r2, q0[0]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[3]
-; CHECK-NEXT:    eors r1, r2
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    veor q0, q0, q1
+; CHECK-NEXT:    vmov.u16 r1, q0[6]
 ; CHECK-NEXT:    vmov.u16 r2, q0[4]
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[5]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[6]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    vmov.u16 r3, q0[0]
+; CHECK-NEXT:    eors r2, r3
 ; CHECK-NEXT:    eors r1, r2
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
@@ -1429,20 +1133,14 @@ define arm_aapcs_vfpcc i16 @xor_v16i16_acc(<16 x i16> %x, i16 %y) {
 ; CHECK-LABEL: xor_v16i16_acc:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    veor q0, q0, q1
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov.u16 r2, q0[0]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[3]
-; CHECK-NEXT:    eors r1, r2
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    veor q0, q0, q1
+; CHECK-NEXT:    vmov.u16 r1, q0[6]
 ; CHECK-NEXT:    vmov.u16 r2, q0[4]
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[5]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[6]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    vmov.u16 r3, q0[0]
+; CHECK-NEXT:    eors r2, r3
 ; CHECK-NEXT:    eors r1, r2
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
@@ -1455,20 +1153,14 @@ entry:
 define arm_aapcs_vfpcc i8 @xor_v8i8_acc(<8 x i8> %x, i8 %y) {
 ; CHECK-LABEL: xor_v8i8_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov.u16 r2, q0[0]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[3]
-; CHECK-NEXT:    eors r1, r2
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    veor q0, q0, q1
+; CHECK-NEXT:    vmov.u16 r1, q0[6]
 ; CHECK-NEXT:    vmov.u16 r2, q0[4]
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[5]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[6]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    vmov.u16 r3, q0[0]
+; CHECK-NEXT:    eors r2, r3
 ; CHECK-NEXT:    eors r1, r2
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
@@ -1481,36 +1173,16 @@ entry:
 define arm_aapcs_vfpcc i8 @xor_v16i8_acc(<16 x i8> %x, i8 %y) {
 ; CHECK-LABEL: xor_v16i8_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
-; CHECK-NEXT:    vmov.u8 r2, q0[0]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[2]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[3]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[4]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[5]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[6]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[7]
-; CHECK-NEXT:    eors r1, r2
+; CHECK-NEXT:    vrev16.8 q1, q0
+; CHECK-NEXT:    veor q0, q0, q1
+; CHECK-NEXT:    vrev32.8 q1, q0
+; CHECK-NEXT:    veor q0, q0, q1
+; CHECK-NEXT:    vmov.u8 r1, q0[12]
 ; CHECK-NEXT:    vmov.u8 r2, q0[8]
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[9]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[10]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[11]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[12]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[13]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[14]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[15]
+; CHECK-NEXT:    vmov.u8 r2, q0[4]
+; CHECK-NEXT:    vmov.u8 r3, q0[0]
+; CHECK-NEXT:    eors r2, r3
 ; CHECK-NEXT:    eors r1, r2
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr
@@ -1524,36 +1196,16 @@ define arm_aapcs_vfpcc i8 @xor_v32i8_acc(<32 x i8> %x, i8 %y) {
 ; CHECK-LABEL: xor_v32i8_acc:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    veor q0, q0, q1
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
-; CHECK-NEXT:    vmov.u8 r2, q0[0]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[2]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[3]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[4]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[5]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[6]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[7]
-; CHECK-NEXT:    eors r1, r2
+; CHECK-NEXT:    vrev16.8 q1, q0
+; CHECK-NEXT:    veor q0, q0, q1
+; CHECK-NEXT:    vrev32.8 q1, q0
+; CHECK-NEXT:    veor q0, q0, q1
+; CHECK-NEXT:    vmov.u8 r1, q0[12]
 ; CHECK-NEXT:    vmov.u8 r2, q0[8]
 ; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[9]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[10]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[11]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[12]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[13]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[14]
-; CHECK-NEXT:    eors r1, r2
-; CHECK-NEXT:    vmov.u8 r2, q0[15]
+; CHECK-NEXT:    vmov.u8 r2, q0[4]
+; CHECK-NEXT:    vmov.u8 r3, q0[0]
+; CHECK-NEXT:    eors r2, r3
 ; CHECK-NEXT:    eors r1, r2
 ; CHECK-NEXT:    eors r0, r1
 ; CHECK-NEXT:    bx lr

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll
index d8d40e5b5186..a1f25e0f3334 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fadd.ll
@@ -3,31 +3,51 @@
 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16,+fp64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOFP
 
 define arm_aapcs_vfpcc float @fadd_v2f32(<2 x float> %x, float %y) {
-; CHECK-LABEL: fadd_v2f32:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vadd.f32 s0, s0, s1
-; CHECK-NEXT:    vldr s2, .LCPI0_0
-; CHECK-NEXT:    vadd.f32 s0, s0, s2
-; CHECK-NEXT:    vadd.f32 s0, s0, s2
-; CHECK-NEXT:    vadd.f32 s0, s4, s0
-; CHECK-NEXT:    bx lr
-; CHECK-NEXT:    .p2align 2
-; CHECK-NEXT:  @ %bb.1:
-; CHECK-NEXT:  .LCPI0_0:
-; CHECK-NEXT:    .long 0x00000000 @ float 0
+; CHECK-FP-LABEL: fadd_v2f32:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    vadd.f32 s0, s0, s1
+; CHECK-FP-NEXT:    vldr s2, .LCPI0_0
+; CHECK-FP-NEXT:    vadd.f32 s0, s0, s2
+; CHECK-FP-NEXT:    vadd.f32 s0, s4, s0
+; CHECK-FP-NEXT:    bx lr
+; CHECK-FP-NEXT:    .p2align 2
+; CHECK-FP-NEXT:  @ %bb.1:
+; CHECK-FP-NEXT:  .LCPI0_0:
+; CHECK-FP-NEXT:    .long 0x00000000 @ float 0
+;
+; CHECK-NOFP-LABEL: fadd_v2f32:
+; CHECK-NOFP:       @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT:    vadd.f32 s0, s0, s1
+; CHECK-NOFP-NEXT:    vldr s2, .LCPI0_0
+; CHECK-NOFP-NEXT:    vadd.f32 s0, s0, s2
+; CHECK-NOFP-NEXT:    vadd.f32 s0, s0, s2
+; CHECK-NOFP-NEXT:    vadd.f32 s0, s4, s0
+; CHECK-NOFP-NEXT:    bx lr
+; CHECK-NOFP-NEXT:    .p2align 2
+; CHECK-NOFP-NEXT:  @ %bb.1:
+; CHECK-NOFP-NEXT:  .LCPI0_0:
+; CHECK-NOFP-NEXT:    .long 0x00000000 @ float 0
 entry:
   %z = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float %y, <2 x float> %x)
   ret float %z
 }
 
 define arm_aapcs_vfpcc float @fadd_v4f32(<4 x float> %x, float %y) {
-; CHECK-LABEL: fadd_v4f32:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vadd.f32 s6, s0, s1
-; CHECK-NEXT:    vadd.f32 s6, s6, s2
-; CHECK-NEXT:    vadd.f32 s0, s6, s3
-; CHECK-NEXT:    vadd.f32 s0, s4, s0
-; CHECK-NEXT:    bx lr
+; CHECK-FP-LABEL: fadd_v4f32:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    vadd.f32 s6, s2, s3
+; CHECK-FP-NEXT:    vadd.f32 s0, s0, s1
+; CHECK-FP-NEXT:    vadd.f32 s0, s0, s6
+; CHECK-FP-NEXT:    vadd.f32 s0, s4, s0
+; CHECK-FP-NEXT:    bx lr
+;
+; CHECK-NOFP-LABEL: fadd_v4f32:
+; CHECK-NOFP:       @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT:    vadd.f32 s6, s0, s1
+; CHECK-NOFP-NEXT:    vadd.f32 s6, s6, s2
+; CHECK-NOFP-NEXT:    vadd.f32 s0, s6, s3
+; CHECK-NOFP-NEXT:    vadd.f32 s0, s4, s0
+; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %y, <4 x float> %x)
   ret float %z
@@ -37,9 +57,9 @@ define arm_aapcs_vfpcc float @fadd_v8f32(<8 x float> %x, float %y) {
 ; CHECK-FP-LABEL: fadd_v8f32:
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vadd.f32 q0, q0, q1
-; CHECK-FP-NEXT:    vadd.f32 s4, s0, s1
-; CHECK-FP-NEXT:    vadd.f32 s4, s4, s2
-; CHECK-FP-NEXT:    vadd.f32 s0, s4, s3
+; CHECK-FP-NEXT:    vadd.f32 s4, s2, s3
+; CHECK-FP-NEXT:    vadd.f32 s0, s0, s1
+; CHECK-FP-NEXT:    vadd.f32 s0, s0, s4
 ; CHECK-FP-NEXT:    vadd.f32 s0, s8, s0
 ; CHECK-FP-NEXT:    bx lr
 ;
@@ -59,27 +79,75 @@ entry:
   ret float %z
 }
 
+define arm_aapcs_vfpcc void @fadd_v2f16(<2 x half> %x, half* %yy) {
+; CHECK-FP-LABEL: fadd_v2f16:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    vmovx.f16 s4, s0
+; CHECK-FP-NEXT:    vadd.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vldr.16 s2, [r0]
+; CHECK-FP-NEXT:    vadd.f16 s0, s2, s0
+; CHECK-FP-NEXT:    vstr.16 s0, [r0]
+; CHECK-FP-NEXT:    bx lr
+;
+; CHECK-NOFP-LABEL: fadd_v2f16:
+; CHECK-NOFP:       @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vldr.16 s2, .LCPI3_0
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vldr.16 s2, [r0]
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s2, s0
+; CHECK-NOFP-NEXT:    vstr.16 s0, [r0]
+; CHECK-NOFP-NEXT:    bx lr
+; CHECK-NOFP-NEXT:    .p2align 1
+; CHECK-NOFP-NEXT:  @ %bb.1:
+; CHECK-NOFP-NEXT:  .LCPI3_0:
+; CHECK-NOFP-NEXT:    .short 0x0000 @ half 0
+entry:
+  %y = load half, half* %yy
+  %z = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v2f16(half %y, <2 x half> %x)
+  store half %z, half* %yy
+  ret void
+}
+
 define arm_aapcs_vfpcc void @fadd_v4f16(<4 x half> %x, half* %yy) {
-; CHECK-LABEL: fadd_v4f16:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vadd.f16 s4, s0, s4
-; CHECK-NEXT:    vmovx.f16 s0, s1
-; CHECK-NEXT:    vadd.f16 s4, s4, s1
-; CHECK-NEXT:    vldr.16 s2, .LCPI3_0
-; CHECK-NEXT:    vadd.f16 s0, s4, s0
-; CHECK-NEXT:    vadd.f16 s0, s0, s2
-; CHECK-NEXT:    vadd.f16 s0, s0, s2
-; CHECK-NEXT:    vadd.f16 s0, s0, s2
-; CHECK-NEXT:    vadd.f16 s0, s0, s2
-; CHECK-NEXT:    vldr.16 s2, [r0]
-; CHECK-NEXT:    vadd.f16 s0, s2, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
-; CHECK-NEXT:    bx lr
-; CHECK-NEXT:    .p2align 1
-; CHECK-NEXT:  @ %bb.1:
-; CHECK-NEXT:  .LCPI3_0:
-; CHECK-NEXT:    .short 0x0000 @ half 0
+; CHECK-FP-LABEL: fadd_v4f16:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    vmovx.f16 s4, s1
+; CHECK-FP-NEXT:    vmovx.f16 s6, s0
+; CHECK-FP-NEXT:    vadd.f16 s0, s0, s6
+; CHECK-FP-NEXT:    vadd.f16 s4, s1, s4
+; CHECK-FP-NEXT:    vldr.16 s2, [r0]
+; CHECK-FP-NEXT:    vadd.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vadd.f16 s0, s2, s0
+; CHECK-FP-NEXT:    vstr.16 s0, [r0]
+; CHECK-FP-NEXT:    bx lr
+;
+; CHECK-NOFP-LABEL: fadd_v4f16:
+; CHECK-NOFP:       @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT:    vadd.f16 s4, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s0, s1
+; CHECK-NOFP-NEXT:    vadd.f16 s4, s4, s1
+; CHECK-NOFP-NEXT:    vldr.16 s2, .LCPI4_0
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s4, s0
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vldr.16 s2, [r0]
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s2, s0
+; CHECK-NOFP-NEXT:    vstr.16 s0, [r0]
+; CHECK-NOFP-NEXT:    bx lr
+; CHECK-NOFP-NEXT:    .p2align 1
+; CHECK-NOFP-NEXT:  @ %bb.1:
+; CHECK-NOFP-NEXT:  .LCPI4_0:
+; CHECK-NOFP-NEXT:    .short 0x0000 @ half 0
 entry:
   %y = load half, half* %yy
   %z = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half %y, <4 x half> %x)
@@ -88,23 +156,35 @@ entry:
 }
 
 define arm_aapcs_vfpcc void @fadd_v8f16(<8 x half> %x, half* %yy) {
-; CHECK-LABEL: fadd_v8f16:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vmovx.f16 s6, s1
-; CHECK-NEXT:    vadd.f16 s4, s0, s4
-; CHECK-NEXT:    vmovx.f16 s0, s3
-; CHECK-NEXT:    vadd.f16 s4, s4, s1
-; CHECK-NEXT:    vadd.f16 s4, s4, s6
-; CHECK-NEXT:    vmovx.f16 s6, s2
-; CHECK-NEXT:    vadd.f16 s4, s4, s2
-; CHECK-NEXT:    vldr.16 s2, [r0]
-; CHECK-NEXT:    vadd.f16 s4, s4, s6
-; CHECK-NEXT:    vadd.f16 s4, s4, s3
-; CHECK-NEXT:    vadd.f16 s0, s4, s0
-; CHECK-NEXT:    vadd.f16 s0, s2, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-FP-LABEL: fadd_v8f16:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    vrev32.16 q1, q0
+; CHECK-FP-NEXT:    vadd.f16 q0, q0, q1
+; CHECK-FP-NEXT:    vadd.f16 s4, s2, s3
+; CHECK-FP-NEXT:    vadd.f16 s0, s0, s1
+; CHECK-FP-NEXT:    vldr.16 s2, [r0]
+; CHECK-FP-NEXT:    vadd.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vadd.f16 s0, s2, s0
+; CHECK-FP-NEXT:    vstr.16 s0, [r0]
+; CHECK-FP-NEXT:    bx lr
+;
+; CHECK-NOFP-LABEL: fadd_v8f16:
+; CHECK-NOFP:       @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT:    vmovx.f16 s6, s1
+; CHECK-NOFP-NEXT:    vadd.f16 s4, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
+; CHECK-NOFP-NEXT:    vadd.f16 s4, s4, s1
+; CHECK-NOFP-NEXT:    vadd.f16 s4, s4, s6
+; CHECK-NOFP-NEXT:    vmovx.f16 s6, s2
+; CHECK-NOFP-NEXT:    vadd.f16 s4, s4, s2
+; CHECK-NOFP-NEXT:    vldr.16 s2, [r0]
+; CHECK-NOFP-NEXT:    vadd.f16 s4, s4, s6
+; CHECK-NOFP-NEXT:    vadd.f16 s4, s4, s3
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s4, s0
+; CHECK-NOFP-NEXT:    vadd.f16 s0, s2, s0
+; CHECK-NOFP-NEXT:    vstr.16 s0, [r0]
+; CHECK-NOFP-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
   %z = call fast half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half %y, <8 x half> %x)
@@ -116,18 +196,12 @@ define arm_aapcs_vfpcc void @fadd_v16f16(<16 x half> %x, half* %yy) {
 ; CHECK-FP-LABEL: fadd_v16f16:
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vadd.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vmovx.f16 s4, s0
-; CHECK-FP-NEXT:    vmovx.f16 s6, s1
-; CHECK-FP-NEXT:    vadd.f16 s4, s0, s4
-; CHECK-FP-NEXT:    vmovx.f16 s0, s3
-; CHECK-FP-NEXT:    vadd.f16 s4, s4, s1
-; CHECK-FP-NEXT:    vadd.f16 s4, s4, s6
-; CHECK-FP-NEXT:    vmovx.f16 s6, s2
-; CHECK-FP-NEXT:    vadd.f16 s4, s4, s2
+; CHECK-FP-NEXT:    vrev32.16 q1, q0
+; CHECK-FP-NEXT:    vadd.f16 q0, q0, q1
+; CHECK-FP-NEXT:    vadd.f16 s4, s2, s3
+; CHECK-FP-NEXT:    vadd.f16 s0, s0, s1
 ; CHECK-FP-NEXT:    vldr.16 s2, [r0]
-; CHECK-FP-NEXT:    vadd.f16 s4, s4, s6
-; CHECK-FP-NEXT:    vadd.f16 s4, s4, s3
-; CHECK-FP-NEXT:    vadd.f16 s0, s4, s0
+; CHECK-FP-NEXT:    vadd.f16 s0, s0, s4
 ; CHECK-FP-NEXT:    vadd.f16 s0, s2, s0
 ; CHECK-FP-NEXT:    vstr.16 s0, [r0]
 ; CHECK-FP-NEXT:    bx lr
@@ -365,5 +439,6 @@ declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v2f32(float, <2 x flo
 declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float, <4 x float>)
 declare float @llvm.experimental.vector.reduce.v2.fadd.f32.v8f32(float, <8 x float>)
 declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v16f16(half, <16 x half>)
+declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v2f16(half, <2 x half>)
 declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v4f16(half, <4 x half>)
 declare half @llvm.experimental.vector.reduce.v2.fadd.f16.v8f16(half, <8 x half>)

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll
index 72636ad037db..6936b7ea3ad1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fminmax.ll
@@ -4,29 +4,47 @@
 
 ; FIXME minnum nonan X, +Inf -> X   ?
 define arm_aapcs_vfpcc float @fmin_v2f32(<2 x float> %x) {
-; CHECK-LABEL: fmin_v2f32:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldr s4, .LCPI0_0
-; CHECK-NEXT:    vminnm.f32 s0, s0, s1
-; CHECK-NEXT:    vminnm.f32 s0, s0, s4
-; CHECK-NEXT:    vminnm.f32 s0, s0, s4
-; CHECK-NEXT:    bx lr
-; CHECK-NEXT:    .p2align 2
-; CHECK-NEXT:  @ %bb.1:
-; CHECK-NEXT:  .LCPI0_0:
-; CHECK-NEXT:    .long 0x7f800000 @ float +Inf
+; CHECK-FP-LABEL: fmin_v2f32:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    vldr s4, .LCPI0_0
+; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s1
+; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s4
+; CHECK-FP-NEXT:    bx lr
+; CHECK-FP-NEXT:    .p2align 2
+; CHECK-FP-NEXT:  @ %bb.1:
+; CHECK-FP-NEXT:  .LCPI0_0:
+; CHECK-FP-NEXT:    .long 0x7f800000 @ float +Inf
+;
+; CHECK-NOFP-LABEL: fmin_v2f32:
+; CHECK-NOFP:       @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT:    vldr s4, .LCPI0_0
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s1
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s4
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s4
+; CHECK-NOFP-NEXT:    bx lr
+; CHECK-NOFP-NEXT:    .p2align 2
+; CHECK-NOFP-NEXT:  @ %bb.1:
+; CHECK-NOFP-NEXT:  .LCPI0_0:
+; CHECK-NOFP-NEXT:    .long 0x7f800000 @ float +Inf
 entry:
   %z = call fast float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x)
   ret float %z
 }
 
 define arm_aapcs_vfpcc float @fmin_v4f32(<4 x float> %x) {
-; CHECK-LABEL: fmin_v4f32:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vminnm.f32 s4, s0, s1
-; CHECK-NEXT:    vminnm.f32 s4, s4, s2
-; CHECK-NEXT:    vminnm.f32 s0, s4, s3
-; CHECK-NEXT:    bx lr
+; CHECK-FP-LABEL: fmin_v4f32:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    vminnm.f32 s4, s2, s3
+; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s1
+; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s4
+; CHECK-FP-NEXT:    bx lr
+;
+; CHECK-NOFP-LABEL: fmin_v4f32:
+; CHECK-NOFP:       @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT:    vminnm.f32 s4, s0, s1
+; CHECK-NOFP-NEXT:    vminnm.f32 s4, s4, s2
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s4, s3
+; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call fast float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %x)
   ret float %z
@@ -37,9 +55,9 @@ define arm_aapcs_vfpcc float @fmin_v8f32(<8 x float> %x) {
 ; CHECK-FP-LABEL: fmin_v8f32:
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vminnm.f32 q0, q0, q1
-; CHECK-FP-NEXT:    vminnm.f32 s4, s0, s1
-; CHECK-FP-NEXT:    vminnm.f32 s4, s4, s2
-; CHECK-FP-NEXT:    vminnm.f32 s0, s4, s3
+; CHECK-FP-NEXT:    vminnm.f32 s4, s2, s3
+; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s1
+; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s4
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmin_v8f32:
@@ -66,43 +84,61 @@ entry:
 }
 
 define arm_aapcs_vfpcc half @fmin_v4f16(<4 x half> %x) {
-; CHECK-LABEL: fmin_v4f16:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vminnm.f16 s4, s0, s4
-; CHECK-NEXT:    vmovx.f16 s0, s1
-; CHECK-NEXT:    vminnm.f16 s4, s4, s1
-; CHECK-NEXT:    vldr.16 s2, .LCPI3_0
-; CHECK-NEXT:    vminnm.f16 s0, s4, s0
-; CHECK-NEXT:    vminnm.f16 s0, s0, s2
-; CHECK-NEXT:    vminnm.f16 s0, s0, s2
-; CHECK-NEXT:    vminnm.f16 s0, s0, s2
-; CHECK-NEXT:    vminnm.f16 s0, s0, s2
-; CHECK-NEXT:    bx lr
-; CHECK-NEXT:    .p2align 1
-; CHECK-NEXT:  @ %bb.1:
-; CHECK-NEXT:  .LCPI3_0:
-; CHECK-NEXT:    .short 0x7c00 @ half +Inf
+; CHECK-FP-LABEL: fmin_v4f16:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    vmovx.f16 s4, s1
+; CHECK-FP-NEXT:    vmovx.f16 s6, s0
+; CHECK-FP-NEXT:    vminnm.f16 s4, s1, s4
+; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s6
+; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-FP-NEXT:    bx lr
+;
+; CHECK-NOFP-LABEL: fmin_v4f16:
+; CHECK-NOFP:       @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT:    vminnm.f16 s4, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s0, s1
+; CHECK-NOFP-NEXT:    vminnm.f16 s4, s4, s1
+; CHECK-NOFP-NEXT:    vldr.16 s2, .LCPI3_0
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s4, s0
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    bx lr
+; CHECK-NOFP-NEXT:    .p2align 1
+; CHECK-NOFP-NEXT:  @ %bb.1:
+; CHECK-NOFP-NEXT:  .LCPI3_0:
+; CHECK-NOFP-NEXT:    .short 0x7c00 @ half +Inf
 entry:
   %z = call fast half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x)
   ret half %z
 }
 
 define arm_aapcs_vfpcc half @fmin_v8f16(<8 x half> %x) {
-; CHECK-LABEL: fmin_v8f16:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vmovx.f16 s6, s1
-; CHECK-NEXT:    vminnm.f16 s4, s0, s4
-; CHECK-NEXT:    vmovx.f16 s0, s3
-; CHECK-NEXT:    vminnm.f16 s4, s4, s1
-; CHECK-NEXT:    vminnm.f16 s4, s4, s6
-; CHECK-NEXT:    vmovx.f16 s6, s2
-; CHECK-NEXT:    vminnm.f16 s4, s4, s2
-; CHECK-NEXT:    vminnm.f16 s4, s4, s6
-; CHECK-NEXT:    vminnm.f16 s4, s4, s3
-; CHECK-NEXT:    vminnm.f16 s0, s4, s0
-; CHECK-NEXT:    bx lr
+; CHECK-FP-LABEL: fmin_v8f16:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    vrev32.16 q1, q0
+; CHECK-FP-NEXT:    vminnm.f16 q0, q0, q1
+; CHECK-FP-NEXT:    vminnm.f16 s4, s2, s3
+; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s1
+; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-FP-NEXT:    bx lr
+;
+; CHECK-NOFP-LABEL: fmin_v8f16:
+; CHECK-NOFP:       @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT:    vmovx.f16 s6, s1
+; CHECK-NOFP-NEXT:    vminnm.f16 s4, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
+; CHECK-NOFP-NEXT:    vminnm.f16 s4, s4, s1
+; CHECK-NOFP-NEXT:    vminnm.f16 s4, s4, s6
+; CHECK-NOFP-NEXT:    vmovx.f16 s6, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s4, s4, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s4, s4, s6
+; CHECK-NOFP-NEXT:    vminnm.f16 s4, s4, s3
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s4, s0
+; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call fast half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x)
   ret half %z
@@ -112,17 +148,11 @@ define arm_aapcs_vfpcc half @fmin_v16f16(<16 x half> %x) {
 ; CHECK-FP-LABEL: fmin_v16f16:
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vminnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vmovx.f16 s4, s0
-; CHECK-FP-NEXT:    vmovx.f16 s6, s1
-; CHECK-FP-NEXT:    vminnm.f16 s4, s0, s4
-; CHECK-FP-NEXT:    vmovx.f16 s0, s3
-; CHECK-FP-NEXT:    vminnm.f16 s4, s4, s1
-; CHECK-FP-NEXT:    vminnm.f16 s4, s4, s6
-; CHECK-FP-NEXT:    vmovx.f16 s6, s2
-; CHECK-FP-NEXT:    vminnm.f16 s4, s4, s2
-; CHECK-FP-NEXT:    vminnm.f16 s4, s4, s6
-; CHECK-FP-NEXT:    vminnm.f16 s4, s4, s3
-; CHECK-FP-NEXT:    vminnm.f16 s0, s4, s0
+; CHECK-FP-NEXT:    vrev32.16 q1, q0
+; CHECK-FP-NEXT:    vminnm.f16 q0, q0, q1
+; CHECK-FP-NEXT:    vminnm.f16 s4, s2, s3
+; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s1
+; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s4
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmin_v16f16:
@@ -502,18 +532,30 @@ entry:
 }
 
 define arm_aapcs_vfpcc float @fmin_v2f32_acc(<2 x float> %x, float %y) {
-; CHECK-LABEL: fmin_v2f32_acc:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldr s6, .LCPI18_0
-; CHECK-NEXT:    vminnm.f32 s0, s0, s1
-; CHECK-NEXT:    vminnm.f32 s0, s0, s6
-; CHECK-NEXT:    vminnm.f32 s0, s0, s6
-; CHECK-NEXT:    vminnm.f32 s0, s4, s0
-; CHECK-NEXT:    bx lr
-; CHECK-NEXT:    .p2align 2
-; CHECK-NEXT:  @ %bb.1:
-; CHECK-NEXT:  .LCPI18_0:
-; CHECK-NEXT:    .long 0x7f800000 @ float +Inf
+; CHECK-FP-LABEL: fmin_v2f32_acc:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    vldr s6, .LCPI18_0
+; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s1
+; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s6
+; CHECK-FP-NEXT:    vminnm.f32 s0, s4, s0
+; CHECK-FP-NEXT:    bx lr
+; CHECK-FP-NEXT:    .p2align 2
+; CHECK-FP-NEXT:  @ %bb.1:
+; CHECK-FP-NEXT:  .LCPI18_0:
+; CHECK-FP-NEXT:    .long 0x7f800000 @ float +Inf
+;
+; CHECK-NOFP-LABEL: fmin_v2f32_acc:
+; CHECK-NOFP:       @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT:    vldr s6, .LCPI18_0
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s1
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s6
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s0, s6
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s4, s0
+; CHECK-NOFP-NEXT:    bx lr
+; CHECK-NOFP-NEXT:    .p2align 2
+; CHECK-NOFP-NEXT:  @ %bb.1:
+; CHECK-NOFP-NEXT:  .LCPI18_0:
+; CHECK-NOFP-NEXT:    .long 0x7f800000 @ float +Inf
 entry:
   %z = call fast float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float> %x)
   %c = fcmp fast olt float %y, %z
@@ -522,13 +564,21 @@ entry:
 }
 
 define arm_aapcs_vfpcc float @fmin_v4f32_acc(<4 x float> %x, float %y) {
-; CHECK-LABEL: fmin_v4f32_acc:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vminnm.f32 s6, s0, s1
-; CHECK-NEXT:    vminnm.f32 s6, s6, s2
-; CHECK-NEXT:    vminnm.f32 s0, s6, s3
-; CHECK-NEXT:    vminnm.f32 s0, s4, s0
-; CHECK-NEXT:    bx lr
+; CHECK-FP-LABEL: fmin_v4f32_acc:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    vminnm.f32 s6, s2, s3
+; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s1
+; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s6
+; CHECK-FP-NEXT:    vminnm.f32 s0, s4, s0
+; CHECK-FP-NEXT:    bx lr
+;
+; CHECK-NOFP-LABEL: fmin_v4f32_acc:
+; CHECK-NOFP:       @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT:    vminnm.f32 s6, s0, s1
+; CHECK-NOFP-NEXT:    vminnm.f32 s6, s6, s2
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s6, s3
+; CHECK-NOFP-NEXT:    vminnm.f32 s0, s4, s0
+; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call fast float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float> %x)
   %c = fcmp fast olt float %y, %z
@@ -540,9 +590,9 @@ define arm_aapcs_vfpcc float @fmin_v8f32_acc(<8 x float> %x, float %y) {
 ; CHECK-FP-LABEL: fmin_v8f32_acc:
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vminnm.f32 q0, q0, q1
-; CHECK-FP-NEXT:    vminnm.f32 s4, s0, s1
-; CHECK-FP-NEXT:    vminnm.f32 s4, s4, s2
-; CHECK-FP-NEXT:    vminnm.f32 s0, s4, s3
+; CHECK-FP-NEXT:    vminnm.f32 s4, s2, s3
+; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s1
+; CHECK-FP-NEXT:    vminnm.f32 s0, s0, s4
 ; CHECK-FP-NEXT:    vminnm.f32 s0, s8, s0
 ; CHECK-FP-NEXT:    bx lr
 ;
@@ -573,26 +623,38 @@ entry:
 }
 
 define arm_aapcs_vfpcc void @fmin_v4f16_acc(<4 x half> %x, half* %yy) {
-; CHECK-LABEL: fmin_v4f16_acc:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vminnm.f16 s4, s0, s4
-; CHECK-NEXT:    vmovx.f16 s0, s1
-; CHECK-NEXT:    vminnm.f16 s4, s4, s1
-; CHECK-NEXT:    vldr.16 s2, .LCPI21_0
-; CHECK-NEXT:    vminnm.f16 s0, s4, s0
-; CHECK-NEXT:    vminnm.f16 s0, s0, s2
-; CHECK-NEXT:    vminnm.f16 s0, s0, s2
-; CHECK-NEXT:    vminnm.f16 s0, s0, s2
-; CHECK-NEXT:    vminnm.f16 s0, s0, s2
-; CHECK-NEXT:    vldr.16 s2, [r0]
-; CHECK-NEXT:    vminnm.f16 s0, s2, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
-; CHECK-NEXT:    bx lr
-; CHECK-NEXT:    .p2align 1
-; CHECK-NEXT:  @ %bb.1:
-; CHECK-NEXT:  .LCPI21_0:
-; CHECK-NEXT:    .short 0x7c00 @ half +Inf
+; CHECK-FP-LABEL: fmin_v4f16_acc:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    vmovx.f16 s4, s1
+; CHECK-FP-NEXT:    vmovx.f16 s6, s0
+; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s6
+; CHECK-FP-NEXT:    vminnm.f16 s4, s1, s4
+; CHECK-FP-NEXT:    vldr.16 s2, [r0]
+; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vminnm.f16 s0, s2, s0
+; CHECK-FP-NEXT:    vstr.16 s0, [r0]
+; CHECK-FP-NEXT:    bx lr
+;
+; CHECK-NOFP-LABEL: fmin_v4f16_acc:
+; CHECK-NOFP:       @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT:    vminnm.f16 s4, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s0, s1
+; CHECK-NOFP-NEXT:    vminnm.f16 s4, s4, s1
+; CHECK-NOFP-NEXT:    vldr.16 s2, .LCPI21_0
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s4, s0
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vldr.16 s2, [r0]
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s2, s0
+; CHECK-NOFP-NEXT:    vstr.16 s0, [r0]
+; CHECK-NOFP-NEXT:    bx lr
+; CHECK-NOFP-NEXT:    .p2align 1
+; CHECK-NOFP-NEXT:  @ %bb.1:
+; CHECK-NOFP-NEXT:  .LCPI21_0:
+; CHECK-NOFP-NEXT:    .short 0x7c00 @ half +Inf
 entry:
   %y = load half, half* %yy
   %z = call fast half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half> %x)
@@ -602,24 +664,74 @@ entry:
   ret void
 }
 
+define arm_aapcs_vfpcc void @fmin_v2f16_acc(<2 x half> %x, half* %yy) {
+; CHECK-FP-LABEL: fmin_v2f16_acc:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    vmovx.f16 s4, s0
+; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vldr.16 s2, [r0]
+; CHECK-FP-NEXT:    vminnm.f16 s0, s2, s0
+; CHECK-FP-NEXT:    vstr.16 s0, [r0]
+; CHECK-FP-NEXT:    bx lr
+;
+; CHECK-NOFP-LABEL: fmin_v2f16_acc:
+; CHECK-NOFP:       @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vldr.16 s2, .LCPI22_0
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vldr.16 s2, [r0]
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s2, s0
+; CHECK-NOFP-NEXT:    vstr.16 s0, [r0]
+; CHECK-NOFP-NEXT:    bx lr
+; CHECK-NOFP-NEXT:    .p2align 1
+; CHECK-NOFP-NEXT:  @ %bb.1:
+; CHECK-NOFP-NEXT:  .LCPI22_0:
+; CHECK-NOFP-NEXT:    .short 0x7c00 @ half +Inf
+entry:
+  %y = load half, half* %yy
+  %z = call fast half @llvm.experimental.vector.reduce.fmin.v2f16(<2 x half> %x)
+  %c = fcmp fast olt half %y, %z
+  %r = select i1 %c, half %y, half %z
+  store half %r, half* %yy
+  ret void
+}
+
 define arm_aapcs_vfpcc void @fmin_v8f16_acc(<8 x half> %x, half* %yy) {
-; CHECK-LABEL: fmin_v8f16_acc:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vmovx.f16 s6, s1
-; CHECK-NEXT:    vminnm.f16 s4, s0, s4
-; CHECK-NEXT:    vmovx.f16 s0, s3
-; CHECK-NEXT:    vminnm.f16 s4, s4, s1
-; CHECK-NEXT:    vminnm.f16 s4, s4, s6
-; CHECK-NEXT:    vmovx.f16 s6, s2
-; CHECK-NEXT:    vminnm.f16 s4, s4, s2
-; CHECK-NEXT:    vldr.16 s2, [r0]
-; CHECK-NEXT:    vminnm.f16 s4, s4, s6
-; CHECK-NEXT:    vminnm.f16 s4, s4, s3
-; CHECK-NEXT:    vminnm.f16 s0, s4, s0
-; CHECK-NEXT:    vminnm.f16 s0, s2, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-FP-LABEL: fmin_v8f16_acc:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    vrev32.16 q1, q0
+; CHECK-FP-NEXT:    vminnm.f16 q0, q0, q1
+; CHECK-FP-NEXT:    vminnm.f16 s4, s2, s3
+; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s1
+; CHECK-FP-NEXT:    vldr.16 s2, [r0]
+; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vminnm.f16 s0, s2, s0
+; CHECK-FP-NEXT:    vstr.16 s0, [r0]
+; CHECK-FP-NEXT:    bx lr
+;
+; CHECK-NOFP-LABEL: fmin_v8f16_acc:
+; CHECK-NOFP:       @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT:    vmovx.f16 s6, s1
+; CHECK-NOFP-NEXT:    vminnm.f16 s4, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
+; CHECK-NOFP-NEXT:    vminnm.f16 s4, s4, s1
+; CHECK-NOFP-NEXT:    vminnm.f16 s4, s4, s6
+; CHECK-NOFP-NEXT:    vmovx.f16 s6, s2
+; CHECK-NOFP-NEXT:    vminnm.f16 s4, s4, s2
+; CHECK-NOFP-NEXT:    vldr.16 s2, [r0]
+; CHECK-NOFP-NEXT:    vminnm.f16 s4, s4, s6
+; CHECK-NOFP-NEXT:    vminnm.f16 s4, s4, s3
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s4, s0
+; CHECK-NOFP-NEXT:    vminnm.f16 s0, s2, s0
+; CHECK-NOFP-NEXT:    vstr.16 s0, [r0]
+; CHECK-NOFP-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
   %z = call fast half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half> %x)
@@ -633,18 +745,12 @@ define arm_aapcs_vfpcc void @fmin_v16f16_acc(<16 x half> %x, half* %yy) {
 ; CHECK-FP-LABEL: fmin_v16f16_acc:
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vminnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vmovx.f16 s4, s0
-; CHECK-FP-NEXT:    vmovx.f16 s6, s1
-; CHECK-FP-NEXT:    vminnm.f16 s4, s0, s4
-; CHECK-FP-NEXT:    vmovx.f16 s0, s3
-; CHECK-FP-NEXT:    vminnm.f16 s4, s4, s1
-; CHECK-FP-NEXT:    vminnm.f16 s4, s4, s6
-; CHECK-FP-NEXT:    vmovx.f16 s6, s2
-; CHECK-FP-NEXT:    vminnm.f16 s4, s4, s2
+; CHECK-FP-NEXT:    vrev32.16 q1, q0
+; CHECK-FP-NEXT:    vminnm.f16 q0, q0, q1
+; CHECK-FP-NEXT:    vminnm.f16 s4, s2, s3
+; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s1
 ; CHECK-FP-NEXT:    vldr.16 s2, [r0]
-; CHECK-FP-NEXT:    vminnm.f16 s4, s4, s6
-; CHECK-FP-NEXT:    vminnm.f16 s4, s4, s3
-; CHECK-FP-NEXT:    vminnm.f16 s0, s4, s0
+; CHECK-FP-NEXT:    vminnm.f16 s0, s0, s4
 ; CHECK-FP-NEXT:    vminnm.f16 s0, s2, s0
 ; CHECK-FP-NEXT:    vstr.16 s0, [r0]
 ; CHECK-FP-NEXT:    bx lr
@@ -1115,29 +1221,47 @@ entry:
 }
 
 define arm_aapcs_vfpcc float @fmax_v2f32(<2 x float> %x) {
-; CHECK-LABEL: fmax_v2f32:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldr s4, .LCPI36_0
-; CHECK-NEXT:    vmaxnm.f32 s0, s0, s1
-; CHECK-NEXT:    vmaxnm.f32 s0, s0, s4
-; CHECK-NEXT:    vmaxnm.f32 s0, s0, s4
-; CHECK-NEXT:    bx lr
-; CHECK-NEXT:    .p2align 2
-; CHECK-NEXT:  @ %bb.1:
-; CHECK-NEXT:  .LCPI36_0:
-; CHECK-NEXT:    .long 0xff800000 @ float -Inf
+; CHECK-FP-LABEL: fmax_v2f32:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    vldr s4, .LCPI37_0
+; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s1
+; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s4
+; CHECK-FP-NEXT:    bx lr
+; CHECK-FP-NEXT:    .p2align 2
+; CHECK-FP-NEXT:  @ %bb.1:
+; CHECK-FP-NEXT:  .LCPI37_0:
+; CHECK-FP-NEXT:    .long 0xff800000 @ float -Inf
+;
+; CHECK-NOFP-LABEL: fmax_v2f32:
+; CHECK-NOFP:       @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT:    vldr s4, .LCPI37_0
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s4
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s4
+; CHECK-NOFP-NEXT:    bx lr
+; CHECK-NOFP-NEXT:    .p2align 2
+; CHECK-NOFP-NEXT:  @ %bb.1:
+; CHECK-NOFP-NEXT:  .LCPI37_0:
+; CHECK-NOFP-NEXT:    .long 0xff800000 @ float -Inf
 entry:
   %z = call fast float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x)
   ret float %z
 }
 
 define arm_aapcs_vfpcc float @fmax_v4f32(<4 x float> %x) {
-; CHECK-LABEL: fmax_v4f32:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmaxnm.f32 s4, s0, s1
-; CHECK-NEXT:    vmaxnm.f32 s4, s4, s2
-; CHECK-NEXT:    vmaxnm.f32 s0, s4, s3
-; CHECK-NEXT:    bx lr
+; CHECK-FP-LABEL: fmax_v4f32:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    vmaxnm.f32 s4, s2, s3
+; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s1
+; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s4
+; CHECK-FP-NEXT:    bx lr
+;
+; CHECK-NOFP-LABEL: fmax_v4f32:
+; CHECK-NOFP:       @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s4, s0, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s4, s4, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s4, s3
+; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %x)
   ret float %z
@@ -1147,9 +1271,9 @@ define arm_aapcs_vfpcc float @fmax_v8f32(<8 x float> %x) {
 ; CHECK-FP-LABEL: fmax_v8f32:
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vmaxnm.f32 q0, q0, q1
-; CHECK-FP-NEXT:    vmaxnm.f32 s4, s0, s1
-; CHECK-FP-NEXT:    vmaxnm.f32 s4, s4, s2
-; CHECK-FP-NEXT:    vmaxnm.f32 s0, s4, s3
+; CHECK-FP-NEXT:    vmaxnm.f32 s4, s2, s3
+; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s1
+; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s4
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmax_v8f32:
@@ -1176,43 +1300,61 @@ entry:
 }
 
 define arm_aapcs_vfpcc half @fmax_v4f16(<4 x half> %x) {
-; CHECK-LABEL: fmax_v4f16:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vmaxnm.f16 s4, s0, s4
-; CHECK-NEXT:    vmovx.f16 s0, s1
-; CHECK-NEXT:    vmaxnm.f16 s4, s4, s1
-; CHECK-NEXT:    vldr.16 s2, .LCPI39_0
-; CHECK-NEXT:    vmaxnm.f16 s0, s4, s0
-; CHECK-NEXT:    vmaxnm.f16 s0, s0, s2
-; CHECK-NEXT:    vmaxnm.f16 s0, s0, s2
-; CHECK-NEXT:    vmaxnm.f16 s0, s0, s2
-; CHECK-NEXT:    vmaxnm.f16 s0, s0, s2
-; CHECK-NEXT:    bx lr
-; CHECK-NEXT:    .p2align 1
-; CHECK-NEXT:  @ %bb.1:
-; CHECK-NEXT:  .LCPI39_0:
-; CHECK-NEXT:    .short 0xfc00 @ half -Inf
+; CHECK-FP-LABEL: fmax_v4f16:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    vmovx.f16 s4, s1
+; CHECK-FP-NEXT:    vmovx.f16 s6, s0
+; CHECK-FP-NEXT:    vmaxnm.f16 s4, s1, s4
+; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s6
+; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s4
+; CHECK-FP-NEXT:    bx lr
+;
+; CHECK-NOFP-LABEL: fmax_v4f16:
+; CHECK-NOFP:       @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s0, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s4, s1
+; CHECK-NOFP-NEXT:    vldr.16 s2, .LCPI40_0
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s4, s0
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    bx lr
+; CHECK-NOFP-NEXT:    .p2align 1
+; CHECK-NOFP-NEXT:  @ %bb.1:
+; CHECK-NOFP-NEXT:  .LCPI40_0:
+; CHECK-NOFP-NEXT:    .short 0xfc00 @ half -Inf
 entry:
   %z = call fast half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x)
   ret half %z
 }
 
 define arm_aapcs_vfpcc half @fmax_v8f16(<8 x half> %x) {
-; CHECK-LABEL: fmax_v8f16:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vmovx.f16 s6, s1
-; CHECK-NEXT:    vmaxnm.f16 s4, s0, s4
-; CHECK-NEXT:    vmovx.f16 s0, s3
-; CHECK-NEXT:    vmaxnm.f16 s4, s4, s1
-; CHECK-NEXT:    vmaxnm.f16 s4, s4, s6
-; CHECK-NEXT:    vmovx.f16 s6, s2
-; CHECK-NEXT:    vmaxnm.f16 s4, s4, s2
-; CHECK-NEXT:    vmaxnm.f16 s4, s4, s6
-; CHECK-NEXT:    vmaxnm.f16 s4, s4, s3
-; CHECK-NEXT:    vmaxnm.f16 s0, s4, s0
-; CHECK-NEXT:    bx lr
+; CHECK-FP-LABEL: fmax_v8f16:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    vrev32.16 q1, q0
+; CHECK-FP-NEXT:    vmaxnm.f16 q0, q0, q1
+; CHECK-FP-NEXT:    vmaxnm.f16 s4, s2, s3
+; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s1
+; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s4
+; CHECK-FP-NEXT:    bx lr
+;
+; CHECK-NOFP-LABEL: fmax_v8f16:
+; CHECK-NOFP:       @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT:    vmovx.f16 s6, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s4, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s4, s6
+; CHECK-NOFP-NEXT:    vmovx.f16 s6, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s4, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s4, s6
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s4, s3
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s4, s0
+; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call fast half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x)
   ret half %z
@@ -1222,17 +1364,11 @@ define arm_aapcs_vfpcc half @fmax_v16f16(<16 x half> %x) {
 ; CHECK-FP-LABEL: fmax_v16f16:
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vmaxnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vmovx.f16 s4, s0
-; CHECK-FP-NEXT:    vmovx.f16 s6, s1
-; CHECK-FP-NEXT:    vmaxnm.f16 s4, s0, s4
-; CHECK-FP-NEXT:    vmovx.f16 s0, s3
-; CHECK-FP-NEXT:    vmaxnm.f16 s4, s4, s1
-; CHECK-FP-NEXT:    vmaxnm.f16 s4, s4, s6
-; CHECK-FP-NEXT:    vmovx.f16 s6, s2
-; CHECK-FP-NEXT:    vmaxnm.f16 s4, s4, s2
-; CHECK-FP-NEXT:    vmaxnm.f16 s4, s4, s6
-; CHECK-FP-NEXT:    vmaxnm.f16 s4, s4, s3
-; CHECK-FP-NEXT:    vmaxnm.f16 s0, s4, s0
+; CHECK-FP-NEXT:    vrev32.16 q1, q0
+; CHECK-FP-NEXT:    vmaxnm.f16 q0, q0, q1
+; CHECK-FP-NEXT:    vmaxnm.f16 s4, s2, s3
+; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s1
+; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s4
 ; CHECK-FP-NEXT:    bx lr
 ;
 ; CHECK-NOFP-LABEL: fmax_v16f16:
@@ -1610,18 +1746,30 @@ entry:
 }
 
 define arm_aapcs_vfpcc float @fmax_v2f32_acc(<2 x float> %x, float %y) {
-; CHECK-LABEL: fmax_v2f32_acc:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldr s6, .LCPI54_0
-; CHECK-NEXT:    vmaxnm.f32 s0, s0, s1
-; CHECK-NEXT:    vmaxnm.f32 s0, s0, s6
-; CHECK-NEXT:    vmaxnm.f32 s0, s0, s6
-; CHECK-NEXT:    vmaxnm.f32 s0, s4, s0
-; CHECK-NEXT:    bx lr
-; CHECK-NEXT:    .p2align 2
-; CHECK-NEXT:  @ %bb.1:
-; CHECK-NEXT:  .LCPI54_0:
-; CHECK-NEXT:    .long 0xff800000 @ float -Inf
+; CHECK-FP-LABEL: fmax_v2f32_acc:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    vldr s6, .LCPI55_0
+; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s1
+; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s6
+; CHECK-FP-NEXT:    vmaxnm.f32 s0, s4, s0
+; CHECK-FP-NEXT:    bx lr
+; CHECK-FP-NEXT:    .p2align 2
+; CHECK-FP-NEXT:  @ %bb.1:
+; CHECK-FP-NEXT:  .LCPI55_0:
+; CHECK-FP-NEXT:    .long 0xff800000 @ float -Inf
+;
+; CHECK-NOFP-LABEL: fmax_v2f32_acc:
+; CHECK-NOFP:       @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT:    vldr s6, .LCPI55_0
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s6
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s0, s6
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s4, s0
+; CHECK-NOFP-NEXT:    bx lr
+; CHECK-NOFP-NEXT:    .p2align 2
+; CHECK-NOFP-NEXT:  @ %bb.1:
+; CHECK-NOFP-NEXT:  .LCPI55_0:
+; CHECK-NOFP-NEXT:    .long 0xff800000 @ float -Inf
 entry:
   %z = call fast float @llvm.experimental.vector.reduce.fmax.v2f32(<2 x float> %x)
   %c = fcmp fast ogt float %y, %z
@@ -1630,13 +1778,21 @@ entry:
 }
 
 define arm_aapcs_vfpcc float @fmax_v4f32_acc(<4 x float> %x, float %y) {
-; CHECK-LABEL: fmax_v4f32_acc:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmaxnm.f32 s6, s0, s1
-; CHECK-NEXT:    vmaxnm.f32 s6, s6, s2
-; CHECK-NEXT:    vmaxnm.f32 s0, s6, s3
-; CHECK-NEXT:    vmaxnm.f32 s0, s4, s0
-; CHECK-NEXT:    bx lr
+; CHECK-FP-LABEL: fmax_v4f32_acc:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    vmaxnm.f32 s6, s2, s3
+; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s1
+; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s6
+; CHECK-FP-NEXT:    vmaxnm.f32 s0, s4, s0
+; CHECK-FP-NEXT:    bx lr
+;
+; CHECK-NOFP-LABEL: fmax_v4f32_acc:
+; CHECK-NOFP:       @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s6, s0, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s6, s6, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s6, s3
+; CHECK-NOFP-NEXT:    vmaxnm.f32 s0, s4, s0
+; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call fast float @llvm.experimental.vector.reduce.fmax.v4f32(<4 x float> %x)
   %c = fcmp fast ogt float %y, %z
@@ -1648,9 +1804,9 @@ define arm_aapcs_vfpcc float @fmax_v8f32_acc(<8 x float> %x, float %y) {
 ; CHECK-FP-LABEL: fmax_v8f32_acc:
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vmaxnm.f32 q0, q0, q1
-; CHECK-FP-NEXT:    vmaxnm.f32 s4, s0, s1
-; CHECK-FP-NEXT:    vmaxnm.f32 s4, s4, s2
-; CHECK-FP-NEXT:    vmaxnm.f32 s0, s4, s3
+; CHECK-FP-NEXT:    vmaxnm.f32 s4, s2, s3
+; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s1
+; CHECK-FP-NEXT:    vmaxnm.f32 s0, s0, s4
 ; CHECK-FP-NEXT:    vmaxnm.f32 s0, s8, s0
 ; CHECK-FP-NEXT:    bx lr
 ;
@@ -1680,27 +1836,77 @@ entry:
   ret float %r
 }
 
+define arm_aapcs_vfpcc void @fmax_v2f16_acc(<2 x half> %x, half* %yy) {
+; CHECK-FP-LABEL: fmax_v2f16_acc:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    vmovx.f16 s4, s0
+; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vldr.16 s2, [r0]
+; CHECK-FP-NEXT:    vmaxnm.f16 s0, s2, s0
+; CHECK-FP-NEXT:    vstr.16 s0, [r0]
+; CHECK-FP-NEXT:    bx lr
+;
+; CHECK-NOFP-LABEL: fmax_v2f16_acc:
+; CHECK-NOFP:       @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s4
+; CHECK-NOFP-NEXT:    vldr.16 s2, .LCPI58_0
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vldr.16 s2, [r0]
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s2, s0
+; CHECK-NOFP-NEXT:    vstr.16 s0, [r0]
+; CHECK-NOFP-NEXT:    bx lr
+; CHECK-NOFP-NEXT:    .p2align 1
+; CHECK-NOFP-NEXT:  @ %bb.1:
+; CHECK-NOFP-NEXT:  .LCPI58_0:
+; CHECK-NOFP-NEXT:    .short 0xfc00 @ half -Inf
+entry:
+  %y = load half, half* %yy
+  %z = call fast half @llvm.experimental.vector.reduce.fmax.v2f16(<2 x half> %x)
+  %c = fcmp fast ogt half %y, %z
+  %r = select i1 %c, half %y, half %z
+  store half %r, half* %yy
+  ret void
+}
+
 define arm_aapcs_vfpcc void @fmax_v4f16_acc(<4 x half> %x, half* %yy) {
-; CHECK-LABEL: fmax_v4f16_acc:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vmaxnm.f16 s4, s0, s4
-; CHECK-NEXT:    vmovx.f16 s0, s1
-; CHECK-NEXT:    vmaxnm.f16 s4, s4, s1
-; CHECK-NEXT:    vldr.16 s2, .LCPI57_0
-; CHECK-NEXT:    vmaxnm.f16 s0, s4, s0
-; CHECK-NEXT:    vmaxnm.f16 s0, s0, s2
-; CHECK-NEXT:    vmaxnm.f16 s0, s0, s2
-; CHECK-NEXT:    vmaxnm.f16 s0, s0, s2
-; CHECK-NEXT:    vmaxnm.f16 s0, s0, s2
-; CHECK-NEXT:    vldr.16 s2, [r0]
-; CHECK-NEXT:    vmaxnm.f16 s0, s2, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
-; CHECK-NEXT:    bx lr
-; CHECK-NEXT:    .p2align 1
-; CHECK-NEXT:  @ %bb.1:
-; CHECK-NEXT:  .LCPI57_0:
-; CHECK-NEXT:    .short 0xfc00 @ half -Inf
+; CHECK-FP-LABEL: fmax_v4f16_acc:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    vmovx.f16 s4, s1
+; CHECK-FP-NEXT:    vmovx.f16 s6, s0
+; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s6
+; CHECK-FP-NEXT:    vmaxnm.f16 s4, s1, s4
+; CHECK-FP-NEXT:    vldr.16 s2, [r0]
+; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vmaxnm.f16 s0, s2, s0
+; CHECK-FP-NEXT:    vstr.16 s0, [r0]
+; CHECK-FP-NEXT:    bx lr
+;
+; CHECK-NOFP-LABEL: fmax_v4f16_acc:
+; CHECK-NOFP:       @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s0, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s4, s1
+; CHECK-NOFP-NEXT:    vldr.16 s2, .LCPI59_0
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s4, s0
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s0, s2
+; CHECK-NOFP-NEXT:    vldr.16 s2, [r0]
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s2, s0
+; CHECK-NOFP-NEXT:    vstr.16 s0, [r0]
+; CHECK-NOFP-NEXT:    bx lr
+; CHECK-NOFP-NEXT:    .p2align 1
+; CHECK-NOFP-NEXT:  @ %bb.1:
+; CHECK-NOFP-NEXT:  .LCPI59_0:
+; CHECK-NOFP-NEXT:    .short 0xfc00 @ half -Inf
 entry:
   %y = load half, half* %yy
   %z = call fast half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half> %x)
@@ -1711,23 +1917,35 @@ entry:
 }
 
 define arm_aapcs_vfpcc void @fmax_v8f16_acc(<8 x half> %x, half* %yy) {
-; CHECK-LABEL: fmax_v8f16_acc:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vmovx.f16 s6, s1
-; CHECK-NEXT:    vmaxnm.f16 s4, s0, s4
-; CHECK-NEXT:    vmovx.f16 s0, s3
-; CHECK-NEXT:    vmaxnm.f16 s4, s4, s1
-; CHECK-NEXT:    vmaxnm.f16 s4, s4, s6
-; CHECK-NEXT:    vmovx.f16 s6, s2
-; CHECK-NEXT:    vmaxnm.f16 s4, s4, s2
-; CHECK-NEXT:    vldr.16 s2, [r0]
-; CHECK-NEXT:    vmaxnm.f16 s4, s4, s6
-; CHECK-NEXT:    vmaxnm.f16 s4, s4, s3
-; CHECK-NEXT:    vmaxnm.f16 s0, s4, s0
-; CHECK-NEXT:    vmaxnm.f16 s0, s2, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-FP-LABEL: fmax_v8f16_acc:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    vrev32.16 q1, q0
+; CHECK-FP-NEXT:    vmaxnm.f16 q0, q0, q1
+; CHECK-FP-NEXT:    vmaxnm.f16 s4, s2, s3
+; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s1
+; CHECK-FP-NEXT:    vldr.16 s2, [r0]
+; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vmaxnm.f16 s0, s2, s0
+; CHECK-FP-NEXT:    vstr.16 s0, [r0]
+; CHECK-FP-NEXT:    bx lr
+;
+; CHECK-NOFP-LABEL: fmax_v8f16_acc:
+; CHECK-NOFP:       @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT:    vmovx.f16 s6, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s4, s1
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s4, s6
+; CHECK-NOFP-NEXT:    vmovx.f16 s6, s2
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s4, s2
+; CHECK-NOFP-NEXT:    vldr.16 s2, [r0]
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s4, s6
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s4, s4, s3
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s4, s0
+; CHECK-NOFP-NEXT:    vmaxnm.f16 s0, s2, s0
+; CHECK-NOFP-NEXT:    vstr.16 s0, [r0]
+; CHECK-NOFP-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
   %z = call fast half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half> %x)
@@ -1741,18 +1959,12 @@ define arm_aapcs_vfpcc void @fmax_v16f16_acc(<16 x half> %x, half* %yy) {
 ; CHECK-FP-LABEL: fmax_v16f16_acc:
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vmaxnm.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vmovx.f16 s4, s0
-; CHECK-FP-NEXT:    vmovx.f16 s6, s1
-; CHECK-FP-NEXT:    vmaxnm.f16 s4, s0, s4
-; CHECK-FP-NEXT:    vmovx.f16 s0, s3
-; CHECK-FP-NEXT:    vmaxnm.f16 s4, s4, s1
-; CHECK-FP-NEXT:    vmaxnm.f16 s4, s4, s6
-; CHECK-FP-NEXT:    vmovx.f16 s6, s2
-; CHECK-FP-NEXT:    vmaxnm.f16 s4, s4, s2
+; CHECK-FP-NEXT:    vrev32.16 q1, q0
+; CHECK-FP-NEXT:    vmaxnm.f16 q0, q0, q1
+; CHECK-FP-NEXT:    vmaxnm.f16 s4, s2, s3
+; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s1
 ; CHECK-FP-NEXT:    vldr.16 s2, [r0]
-; CHECK-FP-NEXT:    vmaxnm.f16 s4, s4, s6
-; CHECK-FP-NEXT:    vmaxnm.f16 s4, s4, s3
-; CHECK-FP-NEXT:    vmaxnm.f16 s0, s4, s0
+; CHECK-FP-NEXT:    vmaxnm.f16 s0, s0, s4
 ; CHECK-FP-NEXT:    vmaxnm.f16 s0, s2, s0
 ; CHECK-FP-NEXT:    vstr.16 s0, [r0]
 ; CHECK-FP-NEXT:    bx lr
@@ -2235,8 +2447,10 @@ declare float @llvm.experimental.vector.reduce.fmin.v2f32(<2 x float>)
 declare float @llvm.experimental.vector.reduce.fmin.v4f32(<4 x float>)
 declare float @llvm.experimental.vector.reduce.fmin.v8f32(<8 x float>)
 declare half @llvm.experimental.vector.reduce.fmax.v16f16(<16 x half>)
+declare half @llvm.experimental.vector.reduce.fmax.v2f16(<2 x half>)
 declare half @llvm.experimental.vector.reduce.fmax.v4f16(<4 x half>)
 declare half @llvm.experimental.vector.reduce.fmax.v8f16(<8 x half>)
 declare half @llvm.experimental.vector.reduce.fmin.v16f16(<16 x half>)
+declare half @llvm.experimental.vector.reduce.fmin.v2f16(<2 x half>)
 declare half @llvm.experimental.vector.reduce.fmin.v4f16(<4 x half>)
 declare half @llvm.experimental.vector.reduce.fmin.v8f16(<8 x half>)

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll
index e4ac03e9b3c8..89d1546932a4 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-fmul.ll
@@ -14,13 +14,21 @@ entry:
 }
 
 define arm_aapcs_vfpcc float @fmul_v4f32(<4 x float> %x, float %y) {
-; CHECK-LABEL: fmul_v4f32:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmul.f32 s6, s0, s1
-; CHECK-NEXT:    vmul.f32 s6, s6, s2
-; CHECK-NEXT:    vmul.f32 s0, s6, s3
-; CHECK-NEXT:    vmul.f32 s0, s4, s0
-; CHECK-NEXT:    bx lr
+; CHECK-FP-LABEL: fmul_v4f32:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    vmul.f32 s6, s2, s3
+; CHECK-FP-NEXT:    vmul.f32 s0, s0, s1
+; CHECK-FP-NEXT:    vmul.f32 s0, s0, s6
+; CHECK-FP-NEXT:    vmul.f32 s0, s4, s0
+; CHECK-FP-NEXT:    bx lr
+;
+; CHECK-NOFP-LABEL: fmul_v4f32:
+; CHECK-NOFP:       @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT:    vmul.f32 s6, s0, s1
+; CHECK-NOFP-NEXT:    vmul.f32 s6, s6, s2
+; CHECK-NOFP-NEXT:    vmul.f32 s0, s6, s3
+; CHECK-NOFP-NEXT:    vmul.f32 s0, s4, s0
+; CHECK-NOFP-NEXT:    bx lr
 entry:
   %z = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float %y, <4 x float> %x)
   ret float %z
@@ -30,9 +38,9 @@ define arm_aapcs_vfpcc float @fmul_v8f32(<8 x float> %x, float %y) {
 ; CHECK-FP-LABEL: fmul_v8f32:
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vmul.f32 q0, q0, q1
-; CHECK-FP-NEXT:    vmul.f32 s4, s0, s1
-; CHECK-FP-NEXT:    vmul.f32 s4, s4, s2
-; CHECK-FP-NEXT:    vmul.f32 s0, s4, s3
+; CHECK-FP-NEXT:    vmul.f32 s4, s2, s3
+; CHECK-FP-NEXT:    vmul.f32 s0, s0, s1
+; CHECK-FP-NEXT:    vmul.f32 s0, s0, s4
 ; CHECK-FP-NEXT:    vmul.f32 s0, s8, s0
 ; CHECK-FP-NEXT:    bx lr
 ;
@@ -52,18 +60,46 @@ entry:
   ret float %z
 }
 
-define arm_aapcs_vfpcc void @fmul_v4f16(<4 x half> %x, half* %yy) {
-; CHECK-LABEL: fmul_v4f16:
+define arm_aapcs_vfpcc void @fmul_v2f16(<2 x half> %x, half* %yy) {
+; CHECK-LABEL: fmul_v2f16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vmul.f16 s4, s0, s4
-; CHECK-NEXT:    vmovx.f16 s0, s1
-; CHECK-NEXT:    vmul.f16 s4, s4, s1
+; CHECK-NEXT:    vmul.f16 s0, s0, s4
 ; CHECK-NEXT:    vldr.16 s2, [r0]
-; CHECK-NEXT:    vmul.f16 s0, s4, s0
 ; CHECK-NEXT:    vmul.f16 s0, s2, s0
 ; CHECK-NEXT:    vstr.16 s0, [r0]
 ; CHECK-NEXT:    bx lr
+entry:
+  %y = load half, half* %yy
+  %z = call fast half @llvm.experimental.vector.reduce.v2.fmul.f16.v2f16(half %y, <2 x half> %x)
+  store half %z, half* %yy
+  ret void
+}
+
+define arm_aapcs_vfpcc void @fmul_v4f16(<4 x half> %x, half* %yy) {
+; CHECK-FP-LABEL: fmul_v4f16:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    vmovx.f16 s4, s1
+; CHECK-FP-NEXT:    vmovx.f16 s6, s0
+; CHECK-FP-NEXT:    vmul.f16 s0, s0, s6
+; CHECK-FP-NEXT:    vmul.f16 s4, s1, s4
+; CHECK-FP-NEXT:    vldr.16 s2, [r0]
+; CHECK-FP-NEXT:    vmul.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vmul.f16 s0, s2, s0
+; CHECK-FP-NEXT:    vstr.16 s0, [r0]
+; CHECK-FP-NEXT:    bx lr
+;
+; CHECK-NOFP-LABEL: fmul_v4f16:
+; CHECK-NOFP:       @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT:    vmul.f16 s4, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s0, s1
+; CHECK-NOFP-NEXT:    vmul.f16 s4, s4, s1
+; CHECK-NOFP-NEXT:    vldr.16 s2, [r0]
+; CHECK-NOFP-NEXT:    vmul.f16 s0, s4, s0
+; CHECK-NOFP-NEXT:    vmul.f16 s0, s2, s0
+; CHECK-NOFP-NEXT:    vstr.16 s0, [r0]
+; CHECK-NOFP-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
   %z = call fast half @llvm.experimental.vector.reduce.v2.fmul.f16.v4f16(half %y, <4 x half> %x)
@@ -72,23 +108,35 @@ entry:
 }
 
 define arm_aapcs_vfpcc void @fmul_v8f16(<8 x half> %x, half* %yy) {
-; CHECK-LABEL: fmul_v8f16:
-; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovx.f16 s4, s0
-; CHECK-NEXT:    vmovx.f16 s6, s1
-; CHECK-NEXT:    vmul.f16 s4, s0, s4
-; CHECK-NEXT:    vmovx.f16 s0, s3
-; CHECK-NEXT:    vmul.f16 s4, s4, s1
-; CHECK-NEXT:    vmul.f16 s4, s4, s6
-; CHECK-NEXT:    vmovx.f16 s6, s2
-; CHECK-NEXT:    vmul.f16 s4, s4, s2
-; CHECK-NEXT:    vldr.16 s2, [r0]
-; CHECK-NEXT:    vmul.f16 s4, s4, s6
-; CHECK-NEXT:    vmul.f16 s4, s4, s3
-; CHECK-NEXT:    vmul.f16 s0, s4, s0
-; CHECK-NEXT:    vmul.f16 s0, s2, s0
-; CHECK-NEXT:    vstr.16 s0, [r0]
-; CHECK-NEXT:    bx lr
+; CHECK-FP-LABEL: fmul_v8f16:
+; CHECK-FP:       @ %bb.0: @ %entry
+; CHECK-FP-NEXT:    vrev32.16 q1, q0
+; CHECK-FP-NEXT:    vmul.f16 q0, q0, q1
+; CHECK-FP-NEXT:    vmul.f16 s4, s2, s3
+; CHECK-FP-NEXT:    vmul.f16 s0, s0, s1
+; CHECK-FP-NEXT:    vldr.16 s2, [r0]
+; CHECK-FP-NEXT:    vmul.f16 s0, s0, s4
+; CHECK-FP-NEXT:    vmul.f16 s0, s2, s0
+; CHECK-FP-NEXT:    vstr.16 s0, [r0]
+; CHECK-FP-NEXT:    bx lr
+;
+; CHECK-NOFP-LABEL: fmul_v8f16:
+; CHECK-NOFP:       @ %bb.0: @ %entry
+; CHECK-NOFP-NEXT:    vmovx.f16 s4, s0
+; CHECK-NOFP-NEXT:    vmovx.f16 s6, s1
+; CHECK-NOFP-NEXT:    vmul.f16 s4, s0, s4
+; CHECK-NOFP-NEXT:    vmovx.f16 s0, s3
+; CHECK-NOFP-NEXT:    vmul.f16 s4, s4, s1
+; CHECK-NOFP-NEXT:    vmul.f16 s4, s4, s6
+; CHECK-NOFP-NEXT:    vmovx.f16 s6, s2
+; CHECK-NOFP-NEXT:    vmul.f16 s4, s4, s2
+; CHECK-NOFP-NEXT:    vldr.16 s2, [r0]
+; CHECK-NOFP-NEXT:    vmul.f16 s4, s4, s6
+; CHECK-NOFP-NEXT:    vmul.f16 s4, s4, s3
+; CHECK-NOFP-NEXT:    vmul.f16 s0, s4, s0
+; CHECK-NOFP-NEXT:    vmul.f16 s0, s2, s0
+; CHECK-NOFP-NEXT:    vstr.16 s0, [r0]
+; CHECK-NOFP-NEXT:    bx lr
 entry:
   %y = load half, half* %yy
   %z = call fast half @llvm.experimental.vector.reduce.v2.fmul.f16.v8f16(half %y, <8 x half> %x)
@@ -100,18 +148,12 @@ define arm_aapcs_vfpcc void @fmul_v16f16(<16 x half> %x, half* %yy) {
 ; CHECK-FP-LABEL: fmul_v16f16:
 ; CHECK-FP:       @ %bb.0: @ %entry
 ; CHECK-FP-NEXT:    vmul.f16 q0, q0, q1
-; CHECK-FP-NEXT:    vmovx.f16 s4, s0
-; CHECK-FP-NEXT:    vmovx.f16 s6, s1
-; CHECK-FP-NEXT:    vmul.f16 s4, s0, s4
-; CHECK-FP-NEXT:    vmovx.f16 s0, s3
-; CHECK-FP-NEXT:    vmul.f16 s4, s4, s1
-; CHECK-FP-NEXT:    vmul.f16 s4, s4, s6
-; CHECK-FP-NEXT:    vmovx.f16 s6, s2
-; CHECK-FP-NEXT:    vmul.f16 s4, s4, s2
+; CHECK-FP-NEXT:    vrev32.16 q1, q0
+; CHECK-FP-NEXT:    vmul.f16 q0, q0, q1
+; CHECK-FP-NEXT:    vmul.f16 s4, s2, s3
+; CHECK-FP-NEXT:    vmul.f16 s0, s0, s1
 ; CHECK-FP-NEXT:    vldr.16 s2, [r0]
-; CHECK-FP-NEXT:    vmul.f16 s4, s4, s6
-; CHECK-FP-NEXT:    vmul.f16 s4, s4, s3
-; CHECK-FP-NEXT:    vmul.f16 s0, s4, s0
+; CHECK-FP-NEXT:    vmul.f16 s0, s0, s4
 ; CHECK-FP-NEXT:    vmul.f16 s0, s2, s0
 ; CHECK-FP-NEXT:    vstr.16 s0, [r0]
 ; CHECK-FP-NEXT:    bx lr
@@ -227,6 +269,22 @@ entry:
   ret float %z
 }
 
+define arm_aapcs_vfpcc void @fmul_v2f16_nofast(<2 x half> %x, half* %yy) {
+; CHECK-LABEL: fmul_v2f16_nofast:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldr.16 s4, [r0]
+; CHECK-NEXT:    vmul.f16 s4, s4, s0
+; CHECK-NEXT:    vmovx.f16 s0, s0
+; CHECK-NEXT:    vmul.f16 s0, s4, s0
+; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %y = load half, half* %yy
+  %z = call half @llvm.experimental.vector.reduce.v2.fmul.f16.v2f16(half %y, <2 x half> %x)
+  store half %z, half* %yy
+  ret void
+}
+
 define arm_aapcs_vfpcc void @fmul_v4f16_nofast(<4 x half> %x, half* %yy) {
 ; CHECK-LABEL: fmul_v4f16_nofast:
 ; CHECK:       @ %bb.0: @ %entry
@@ -349,5 +407,6 @@ declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v2f32(float, <2 x flo
 declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float, <4 x float>)
 declare float @llvm.experimental.vector.reduce.v2.fmul.f32.v8f32(float, <8 x float>)
 declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v16f16(half, <16 x half>)
+declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v2f16(half, <2 x half>)
 declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v4f16(half, <4 x half>)
 declare half @llvm.experimental.vector.reduce.v2.fmul.f16.v8f16(half, <8 x half>)

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
index c817f73ff817..29e441e3e90c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
@@ -122,14 +122,14 @@ define i32 @mul_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    vmul.i32 q0, q1, q0
 ; CHECK-NEXT:    le lr, .LBB1_4
 ; CHECK-NEXT:  @ %bb.5: @ %middle.block
-; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    vmov r2, s3
 ; CHECK-NEXT:    cmp r12, r1
-; CHECK-NEXT:    vmov r3, s0
-; CHECK-NEXT:    mul r2, r3, r2
 ; CHECK-NEXT:    vmov r3, s2
+; CHECK-NEXT:    mul lr, r3, r2
+; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    mul r2, r3, r2
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    mul r2, r3, r2
+; CHECK-NEXT:    mul r2, r2, lr
 ; CHECK-NEXT:    beq .LBB1_8
 ; CHECK-NEXT:  .LBB1_6: @ %for.body.preheader1
 ; CHECK-NEXT:    sub.w lr, r1, r12
@@ -222,13 +222,13 @@ define i32 @and_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    vand q0, q1, q0
 ; CHECK-NEXT:    le lr, .LBB2_5
 ; CHECK-NEXT:  @ %bb.6: @ %middle.block
-; CHECK-NEXT:    vmov r12, s1
+; CHECK-NEXT:    vmov r12, s3
 ; CHECK-NEXT:    cmp r3, r1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    and.w r12, r12, r2
 ; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov lr, s1
 ; CHECK-NEXT:    and.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    and.w r2, r2, lr
 ; CHECK-NEXT:    and.w r2, r2, r12
 ; CHECK-NEXT:    beq .LBB2_9
 ; CHECK-NEXT:  .LBB2_7: @ %for.body.preheader1
@@ -322,13 +322,13 @@ define i32 @or_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    vorr q0, q1, q0
 ; CHECK-NEXT:    le lr, .LBB3_5
 ; CHECK-NEXT:  @ %bb.6: @ %middle.block
-; CHECK-NEXT:    vmov r12, s1
+; CHECK-NEXT:    vmov r12, s3
 ; CHECK-NEXT:    cmp r3, r1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    orr.w r12, r12, r2
 ; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov lr, s1
 ; CHECK-NEXT:    orr.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    orr.w r2, r2, lr
 ; CHECK-NEXT:    orr.w r2, r2, r12
 ; CHECK-NEXT:    beq .LBB3_9
 ; CHECK-NEXT:  .LBB3_7: @ %for.body.preheader1
@@ -422,13 +422,13 @@ define i32 @xor_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    veor q0, q1, q0
 ; CHECK-NEXT:    le lr, .LBB4_5
 ; CHECK-NEXT:  @ %bb.6: @ %middle.block
-; CHECK-NEXT:    vmov r12, s1
+; CHECK-NEXT:    vmov r12, s3
 ; CHECK-NEXT:    cmp r3, r1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    eor.w r12, r12, r2
 ; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov lr, s1
 ; CHECK-NEXT:    eor.w r12, r12, r2
-; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    eor.w r2, r2, lr
 ; CHECK-NEXT:    eor.w r2, r2, r12
 ; CHECK-NEXT:    beq .LBB4_9
 ; CHECK-NEXT:  .LBB4_7: @ %for.body.preheader1
@@ -522,10 +522,10 @@ define float @fadd_f32(float* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    vadd.f32 q0, q1, q0
 ; CHECK-NEXT:    le lr, .LBB5_5
 ; CHECK-NEXT:  @ %bb.6: @ %middle.block
-; CHECK-NEXT:    vadd.f32 s4, s0, s1
+; CHECK-NEXT:    vadd.f32 s4, s2, s3
 ; CHECK-NEXT:    cmp r2, r1
-; CHECK-NEXT:    vadd.f32 s4, s4, s2
-; CHECK-NEXT:    vadd.f32 s0, s4, s3
+; CHECK-NEXT:    vadd.f32 s0, s0, s1
+; CHECK-NEXT:    vadd.f32 s0, s0, s4
 ; CHECK-NEXT:    beq .LBB5_9
 ; CHECK-NEXT:  .LBB5_7: @ %for.body.preheader1
 ; CHECK-NEXT:    sub.w lr, r1, r2
@@ -623,10 +623,10 @@ define float @fmul_f32(float* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    vmul.f32 q0, q1, q0
 ; CHECK-NEXT:    le lr, .LBB6_5
 ; CHECK-NEXT:  @ %bb.6: @ %middle.block
-; CHECK-NEXT:    vmul.f32 s4, s0, s1
+; CHECK-NEXT:    vmul.f32 s4, s2, s3
 ; CHECK-NEXT:    cmp r2, r1
-; CHECK-NEXT:    vmul.f32 s4, s4, s2
-; CHECK-NEXT:    vmul.f32 s0, s4, s3
+; CHECK-NEXT:    vmul.f32 s0, s0, s1
+; CHECK-NEXT:    vmul.f32 s0, s0, s4
 ; CHECK-NEXT:    beq .LBB6_9
 ; CHECK-NEXT:  .LBB6_7: @ %for.body.preheader1
 ; CHECK-NEXT:    sub.w lr, r1, r2

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll
index 15ae58de9357..7510169fbbd5 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mul.ll
@@ -16,12 +16,12 @@ entry:
 define arm_aapcs_vfpcc i32 @mul_v4i32(<4 x i32> %x) {
 ; CHECK-LABEL: mul_v4i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    muls r0, r1, r0
+; CHECK-NEXT:    vmov r0, s3
 ; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    muls r1, r2, r1
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -33,12 +33,12 @@ define arm_aapcs_vfpcc i32 @mul_v8i32(<8 x i32> %x) {
 ; CHECK-LABEL: mul_v8i32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmul.i32 q0, q0, q1
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    muls r0, r1, r0
+; CHECK-NEXT:    vmov r0, s3
 ; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    muls r1, r2, r1
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -49,12 +49,12 @@ entry:
 define arm_aapcs_vfpcc i16 @mul_v4i16(<4 x i16> %x) {
 ; CHECK-LABEL: mul_v4i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    muls r0, r1, r0
+; CHECK-NEXT:    vmov r0, s3
 ; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov r2, s0
 ; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov r1, s3
+; CHECK-NEXT:    vmov r1, s1
+; CHECK-NEXT:    muls r1, r2, r1
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -65,20 +65,14 @@ entry:
 define arm_aapcs_vfpcc i16 @mul_v8i16(<8 x i16> %x) {
 ; CHECK-LABEL: mul_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    muls r0, r1, r0
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    vmul.i16 q0, q0, q1
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
 ; CHECK-NEXT:    vmov.u16 r1, q0[4]
 ; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    vmov.u16 r1, q0[2]
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    muls r1, r2, r1
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -90,20 +84,14 @@ define arm_aapcs_vfpcc i16 @mul_v16i16(<16 x i16> %x) {
 ; CHECK-LABEL: mul_v16i16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmul.i16 q0, q0, q1
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    muls r0, r1, r0
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    vmul.i16 q0, q0, q1
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
 ; CHECK-NEXT:    vmov.u16 r1, q0[4]
 ; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    vmov.u16 r1, q0[2]
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    muls r1, r2, r1
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -114,20 +102,14 @@ entry:
 define arm_aapcs_vfpcc i8 @mul_v8i8(<8 x i8> %x) {
 ; CHECK-LABEL: mul_v8i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    muls r0, r1, r0
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    vmul.i16 q0, q0, q1
+; CHECK-NEXT:    vmov.u16 r0, q0[6]
 ; CHECK-NEXT:    vmov.u16 r1, q0[4]
 ; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    vmov.u16 r1, q0[2]
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    muls r1, r2, r1
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -138,36 +120,16 @@ entry:
 define arm_aapcs_vfpcc i8 @mul_v16i8(<16 x i8> %x) {
 ; CHECK-LABEL: mul_v16i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u8 r0, q0[1]
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    muls r0, r1, r0
+; CHECK-NEXT:    vrev16.8 q1, q0
+; CHECK-NEXT:    vmul.i8 q0, q0, q1
+; CHECK-NEXT:    vrev32.8 q1, q0
+; CHECK-NEXT:    vmul.i8 q0, q0, q1
+; CHECK-NEXT:    vmov.u8 r0, q0[12]
 ; CHECK-NEXT:    vmov.u8 r1, q0[8]
 ; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q0[12]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q0[13]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q0[14]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
+; CHECK-NEXT:    vmov.u8 r1, q0[4]
+; CHECK-NEXT:    vmov.u8 r2, q0[0]
+; CHECK-NEXT:    muls r1, r2, r1
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -179,36 +141,16 @@ define arm_aapcs_vfpcc i8 @mul_v32i8(<32 x i8> %x) {
 ; CHECK-LABEL: mul_v32i8:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmul.i8 q0, q0, q1
-; CHECK-NEXT:    vmov.u8 r0, q0[1]
-; CHECK-NEXT:    vmov.u8 r1, q0[0]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q0[2]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q0[3]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q0[4]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q0[5]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q0[6]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q0[7]
-; CHECK-NEXT:    muls r0, r1, r0
+; CHECK-NEXT:    vrev16.8 q1, q0
+; CHECK-NEXT:    vmul.i8 q0, q0, q1
+; CHECK-NEXT:    vrev32.8 q1, q0
+; CHECK-NEXT:    vmul.i8 q0, q0, q1
+; CHECK-NEXT:    vmov.u8 r0, q0[12]
 ; CHECK-NEXT:    vmov.u8 r1, q0[8]
 ; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q0[9]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q0[10]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q0[11]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q0[12]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q0[13]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q0[14]
-; CHECK-NEXT:    muls r0, r1, r0
-; CHECK-NEXT:    vmov.u8 r1, q0[15]
+; CHECK-NEXT:    vmov.u8 r1, q0[4]
+; CHECK-NEXT:    vmov.u8 r2, q0[0]
+; CHECK-NEXT:    muls r1, r2, r1
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
 entry:
@@ -286,12 +228,12 @@ entry:
 define arm_aapcs_vfpcc i32 @mul_v4i32_acc(<4 x i32> %x, i32 %y) {
 ; CHECK-LABEL: mul_v4i32_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    muls r1, r2, r1
+; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    muls r2, r3, r2
 ; CHECK-NEXT:    muls r1, r2, r1
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
@@ -305,12 +247,12 @@ define arm_aapcs_vfpcc i32 @mul_v8i32_acc(<8 x i32> %x, i32 %y) {
 ; CHECK-LABEL: mul_v8i32_acc:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmul.i32 q0, q0, q1
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    muls r1, r2, r1
+; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    muls r2, r3, r2
 ; CHECK-NEXT:    muls r1, r2, r1
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
@@ -323,12 +265,12 @@ entry:
 define arm_aapcs_vfpcc i16 @mul_v4i16_acc(<4 x i16> %x, i16 %y) {
 ; CHECK-LABEL: mul_v4i16_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    muls r1, r2, r1
+; CHECK-NEXT:    vmov r1, s3
 ; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r2, s1
+; CHECK-NEXT:    muls r2, r3, r2
 ; CHECK-NEXT:    muls r1, r2, r1
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
@@ -341,20 +283,14 @@ entry:
 define arm_aapcs_vfpcc i16 @mul_v8i16_acc(<8 x i16> %x, i16 %y) {
 ; CHECK-LABEL: mul_v8i16_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov.u16 r2, q0[0]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u16 r2, q0[3]
-; CHECK-NEXT:    muls r1, r2, r1
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    vmul.i16 q0, q0, q1
+; CHECK-NEXT:    vmov.u16 r1, q0[6]
 ; CHECK-NEXT:    vmov.u16 r2, q0[4]
 ; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u16 r2, q0[5]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u16 r2, q0[6]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    vmov.u16 r3, q0[0]
+; CHECK-NEXT:    muls r2, r3, r2
 ; CHECK-NEXT:    muls r1, r2, r1
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
@@ -368,20 +304,14 @@ define arm_aapcs_vfpcc i16 @mul_v16i16_acc(<16 x i16> %x, i16 %y) {
 ; CHECK-LABEL: mul_v16i16_acc:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmul.i16 q0, q0, q1
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov.u16 r2, q0[0]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u16 r2, q0[3]
-; CHECK-NEXT:    muls r1, r2, r1
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    vmul.i16 q0, q0, q1
+; CHECK-NEXT:    vmov.u16 r1, q0[6]
 ; CHECK-NEXT:    vmov.u16 r2, q0[4]
 ; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u16 r2, q0[5]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u16 r2, q0[6]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    vmov.u16 r3, q0[0]
+; CHECK-NEXT:    muls r2, r3, r2
 ; CHECK-NEXT:    muls r1, r2, r1
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
@@ -394,20 +324,14 @@ entry:
 define arm_aapcs_vfpcc i8 @mul_v8i8_acc(<8 x i8> %x, i8 %y) {
 ; CHECK-LABEL: mul_v8i8_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    vmov.u16 r2, q0[0]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u16 r2, q0[2]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u16 r2, q0[3]
-; CHECK-NEXT:    muls r1, r2, r1
+; CHECK-NEXT:    vrev32.16 q1, q0
+; CHECK-NEXT:    vmul.i16 q0, q0, q1
+; CHECK-NEXT:    vmov.u16 r1, q0[6]
 ; CHECK-NEXT:    vmov.u16 r2, q0[4]
 ; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u16 r2, q0[5]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u16 r2, q0[6]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    vmov.u16 r2, q0[2]
+; CHECK-NEXT:    vmov.u16 r3, q0[0]
+; CHECK-NEXT:    muls r2, r3, r2
 ; CHECK-NEXT:    muls r1, r2, r1
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
@@ -420,36 +344,16 @@ entry:
 define arm_aapcs_vfpcc i8 @mul_v16i8_acc(<16 x i8> %x, i8 %y) {
 ; CHECK-LABEL: mul_v16i8_acc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
-; CHECK-NEXT:    vmov.u8 r2, q0[0]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q0[2]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q0[3]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q0[4]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q0[5]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q0[6]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q0[7]
-; CHECK-NEXT:    muls r1, r2, r1
+; CHECK-NEXT:    vrev16.8 q1, q0
+; CHECK-NEXT:    vmul.i8 q0, q0, q1
+; CHECK-NEXT:    vrev32.8 q1, q0
+; CHECK-NEXT:    vmul.i8 q0, q0, q1
+; CHECK-NEXT:    vmov.u8 r1, q0[12]
 ; CHECK-NEXT:    vmov.u8 r2, q0[8]
 ; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q0[9]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q0[10]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q0[11]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q0[12]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q0[13]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q0[14]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q0[15]
+; CHECK-NEXT:    vmov.u8 r2, q0[4]
+; CHECK-NEXT:    vmov.u8 r3, q0[0]
+; CHECK-NEXT:    muls r2, r3, r2
 ; CHECK-NEXT:    muls r1, r2, r1
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr
@@ -463,36 +367,16 @@ define arm_aapcs_vfpcc i8 @mul_v32i8_acc(<32 x i8> %x, i8 %y) {
 ; CHECK-LABEL: mul_v32i8_acc:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmul.i8 q0, q0, q1
-; CHECK-NEXT:    vmov.u8 r1, q0[1]
-; CHECK-NEXT:    vmov.u8 r2, q0[0]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q0[2]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q0[3]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q0[4]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q0[5]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q0[6]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q0[7]
-; CHECK-NEXT:    muls r1, r2, r1
+; CHECK-NEXT:    vrev16.8 q1, q0
+; CHECK-NEXT:    vmul.i8 q0, q0, q1
+; CHECK-NEXT:    vrev32.8 q1, q0
+; CHECK-NEXT:    vmul.i8 q0, q0, q1
+; CHECK-NEXT:    vmov.u8 r1, q0[12]
 ; CHECK-NEXT:    vmov.u8 r2, q0[8]
 ; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q0[9]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q0[10]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q0[11]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q0[12]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q0[13]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q0[14]
-; CHECK-NEXT:    muls r1, r2, r1
-; CHECK-NEXT:    vmov.u8 r2, q0[15]
+; CHECK-NEXT:    vmov.u8 r2, q0[4]
+; CHECK-NEXT:    vmov.u8 r3, q0[0]
+; CHECK-NEXT:    muls r2, r3, r2
 ; CHECK-NEXT:    muls r1, r2, r1
 ; CHECK-NEXT:    muls r0, r1, r0
 ; CHECK-NEXT:    bx lr


        


More information about the llvm-commits mailing list