[llvm] f2412d3 - [SVE][CodeGen] Lower scalable integer vector reductions

Kerry McLaughlin via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 4 03:40:01 PST 2020


Author: Kerry McLaughlin
Date: 2020-11-04T11:38:49Z
New Revision: f2412d372d93b8c9f6e08fd2166e1e161ba4e6f8

URL: https://github.com/llvm/llvm-project/commit/f2412d372d93b8c9f6e08fd2166e1e161ba4e6f8
DIFF: https://github.com/llvm/llvm-project/commit/f2412d372d93b8c9f6e08fd2166e1e161ba4e6f8.diff

LOG: [SVE][CodeGen] Lower scalable integer vector reductions

This patch reuses the existing LowerFixedLengthReductionToSVE function (renamed here to
LowerReductionToSVE) to also lower scalable vector reductions. A separate function,
LowerPredReductionToSVE, has been added to lower VECREDUCE_AND and VECREDUCE_OR operations
on predicate types using ptest.

Lowering scalable floating-point reductions will be addressed in a follow-up patch; for
now these hit the fatal error added to expandVecReduce() in TargetLowering.

Reviewed By: paulwalker-arm

Differential Revision: https://reviews.llvm.org/D89382
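
For context, the reductions lowered here operate on scalable vectors, whose lane count is
only known at run time as a multiple of vscale. The following is a minimal standalone
sketch of what such a reduction computes (plain C++, with the vscale value and names
chosen purely for illustration, not taken from the patch):

  #include <cassert>
  #include <cstdint>
  #include <vector>

  // Model of llvm.vector.reduce.add over <vscale x 4 x i32>: the number of
  // lanes is 4 * vscale, with vscale fixed by the hardware at run time.
  static int64_t reduce_add_scalable(const std::vector<int32_t> &Lanes) {
    int64_t Acc = 0;
    for (int32_t L : Lanes) // a single pass over however many lanes exist
      Acc += L;
    return Acc;
  }

  int main() {
    unsigned VScale = 2;                   // e.g. 256-bit SVE registers (assumed)
    std::vector<int32_t> Z(4 * VScale, 3); // <vscale x 4 x i32> splat of 3
    assert(reduce_add_scalable(Z) == 3 * 4 * (int64_t)VScale);
    return 0;
  }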

Added: 
    llvm/test/CodeGen/AArch64/sve-int-pred-reduce.ll
    llvm/test/CodeGen/AArch64/sve-int-reduce.ll
    llvm/test/CodeGen/AArch64/sve-split-int-pred-reduce.ll
    llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
    llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.h
    llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a78e80b5c6eb..fbb5262ae905 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -20857,7 +20857,7 @@ SDValue DAGCombiner::visitVECREDUCE(SDNode *N) {
   unsigned Opcode = N->getOpcode();
 
   // VECREDUCE over 1-element vector is just an extract.
-  if (VT.getVectorNumElements() == 1) {
+  if (VT.getVectorElementCount().isScalar()) {
     SDLoc dl(N);
     SDValue Res =
         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT.getVectorElementType(), N0,
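
The switch from getVectorNumElements() to getVectorElementCount().isScalar() matters
because the former interface is only meaningful for fixed-length vectors. A small model
of the new predicate (plain C++, mirroring llvm::ElementCount only in spirit; the field
names are mine) shows why scalable types now simply fall through the combine:

  #include <cassert>

  struct ElementCountModel {
    unsigned MinVal;  // known minimum number of lanes
    bool Scalable;    // true for <vscale x MinVal x Ty>
    // isScalar() is true only for a count known to be exactly one lane.
    bool isScalar() const { return !Scalable && MinVal == 1; }
  };

  int main() {
    ElementCountModel Fixed1{1, false};    // <1 x i32>
    ElementCountModel ScalableN{4, true};  // <vscale x 4 x i32>
    assert(Fixed1.isScalar());             // the combine still fires here
    assert(!ScalableN.isScalar());         // scalable types are skipped safely
    return 0;
  }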

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 8a2f1eaf9ce2..4180fbf45221 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3323,6 +3323,9 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
     SDValue InVec = Op.getOperand(0);
     SDValue EltNo = Op.getOperand(1);
     EVT VecVT = InVec.getValueType();
+    // computeKnownBits not yet implemented for scalable vectors.
+    if (VecVT.isScalableVector())
+      break;
     const unsigned EltBitWidth = VecVT.getScalarSizeInBits();
     const unsigned NumSrcElts = VecVT.getVectorNumElements();
 
@@ -4809,6 +4812,16 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
   case ISD::VSCALE:
     assert(VT == Operand.getValueType() && "Unexpected VT!");
     break;
+  case ISD::VECREDUCE_SMIN:
+  case ISD::VECREDUCE_UMAX:
+    if (Operand.getValueType().getScalarType() == MVT::i1)
+      return getNode(ISD::VECREDUCE_OR, DL, VT, Operand);
+    break;
+  case ISD::VECREDUCE_SMAX:
+  case ISD::VECREDUCE_UMIN:
+    if (Operand.getValueType().getScalarType() == MVT::i1)
+      return getNode(ISD::VECREDUCE_AND, DL, VT, Operand);
+    break;
   }
 
   SDNode *N;
@@ -5318,10 +5331,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
   case ISD::MULHS:
   case ISD::SDIV:
   case ISD::SREM:
-  case ISD::SMIN:
-  case ISD::SMAX:
-  case ISD::UMIN:
-  case ISD::UMAX:
   case ISD::SADDSAT:
   case ISD::SSUBSAT:
   case ISD::UADDSAT:
@@ -5330,6 +5339,22 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     assert(N1.getValueType() == N2.getValueType() &&
            N1.getValueType() == VT && "Binary operator types must match!");
     break;
+  case ISD::SMIN:
+  case ISD::UMAX:
+    assert(VT.isInteger() && "This operator does not apply to FP types!");
+    assert(N1.getValueType() == N2.getValueType() &&
+           N1.getValueType() == VT && "Binary operator types must match!");
+    if (VT.isVector() && VT.getVectorElementType() == MVT::i1)
+      return getNode(ISD::OR, DL, VT, N1, N2);
+    break;
+  case ISD::SMAX:
+  case ISD::UMIN:
+    assert(VT.isInteger() && "This operator does not apply to FP types!");
+    assert(N1.getValueType() == N2.getValueType() &&
+           N1.getValueType() == VT && "Binary operator types must match!");
+    if (VT.isVector() && VT.getVectorElementType() == MVT::i1)
+      return getNode(ISD::AND, DL, VT, N1, N2);
+    break;
   case ISD::FADD:
   case ISD::FSUB:
   case ISD::FMUL:
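
The getNode() changes above rest on the fact that, for i1 elements, signed and unsigned
min/max collapse to logical and/or. A standalone check of those identities (plain C++,
with 0/1 assumed as the unsigned i1 encoding and 0/-1 as the signed i1 encoding):

  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  int main() {
    for (bool A : {false, true})
      for (bool B : {false, true}) {
        uint8_t UA = A, UB = B;                   // unsigned i1: false=0, true=1
        int8_t SA = A ? -1 : 0, SB = B ? -1 : 0;  // signed i1: false=0, true=-1
        assert((std::max(UA, UB) != 0) == (A || B)); // UMAX folds to OR
        assert((std::min(UA, UB) != 0) == (A && B)); // UMIN folds to AND
        assert((std::min(SA, SB) != 0) == (A || B)); // SMIN folds to OR
        assert((std::max(SA, SB) != 0) == (A && B)); // SMAX folds to AND
      }
    return 0;
  }

The same identities justify turning VECREDUCE_SMIN/VECREDUCE_UMAX into VECREDUCE_OR and
VECREDUCE_SMAX/VECREDUCE_UMIN into VECREDUCE_AND for i1 element types.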

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 58703c4f3d99..7ab1fedaf094 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8000,6 +8000,10 @@ SDValue TargetLowering::expandVecReduce(SDNode *Node, SelectionDAG &DAG) const {
   SDValue Op = Node->getOperand(0);
   EVT VT = Op.getValueType();
 
+  if (VT.isScalableVector())
+    report_fatal_error(
+        "Expanding reductions for scalable vectors is undefined.");
+
   // Try to use a shuffle reduction for power of two vectors.
   if (VT.isPow2VectorType()) {
     while (VT.getVectorNumElements() > 1) {
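
The generic expansion guarded here builds a log2-depth shuffle tree, which requires a
compile-time lane count; hence the hard error for scalable vectors until a target (as
AArch64 does below) supplies its own lowering. A rough sketch of that fixed-length
expansion (plain C++, illustrative only, using an 8-lane add reduction as the example):

  #include <array>
  #include <cassert>
  #include <cstddef>

  int main() {
    std::array<int, 8> V = {1, 2, 3, 4, 5, 6, 7, 8};
    // Halve the vector each step, combining lane I with lane I + N/2; the
    // number of steps is only known because N is a compile-time constant.
    for (std::size_t N = V.size(); N > 1; N /= 2)
      for (std::size_t I = 0; I < N / 2; ++I)
        V[I] += V[I + N / 2];
    assert(V[0] == 36); // 1 + 2 + ... + 8; the result ends up in lane 0
    return 0;
  }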

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 89713be01c55..35d03f8eb10d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1013,6 +1013,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::SHL, VT, Custom);
       setOperationAction(ISD::SRL, VT, Custom);
       setOperationAction(ISD::SRA, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
     }
 
     // Illegal unpacked integer vector types.
@@ -1027,6 +1035,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::SETCC, VT, Custom);
       setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
       setOperationAction(ISD::TRUNCATE, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
 
       // There are no legal MVT::nxv16f## based types.
       if (VT != MVT::nxv16i1) {
@@ -9815,30 +9826,35 @@ SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
                       Op.getOpcode() == ISD::VECREDUCE_FADD ||
                       (Op.getOpcode() != ISD::VECREDUCE_ADD &&
                        SrcVT.getVectorElementType() == MVT::i64);
-  if (useSVEForFixedLengthVectorVT(SrcVT, OverrideNEON)) {
+  if (SrcVT.isScalableVector() ||
+      useSVEForFixedLengthVectorVT(SrcVT, OverrideNEON)) {
+
+    if (SrcVT.getVectorElementType() == MVT::i1)
+      return LowerPredReductionToSVE(Op, DAG);
+
     switch (Op.getOpcode()) {
     case ISD::VECREDUCE_ADD:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
     case ISD::VECREDUCE_AND:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
     case ISD::VECREDUCE_OR:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
     case ISD::VECREDUCE_SMAX:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
     case ISD::VECREDUCE_SMIN:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
     case ISD::VECREDUCE_UMAX:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
     case ISD::VECREDUCE_UMIN:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
     case ISD::VECREDUCE_XOR:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
     case ISD::VECREDUCE_FADD:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
     case ISD::VECREDUCE_FMAX:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
     case ISD::VECREDUCE_FMIN:
-      return LowerFixedLengthReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
+      return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
     default:
       llvm_unreachable("Unhandled fixed length reduction");
     }
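
All of the non-predicate cases above funnel into LowerReductionToSVE, which emits a single
predicated SVE reduction node. A rough model of what such a predicated reduction computes
(plain C++, assumptions mine, not AArch64 semantics verbatim; UADDV in particular always
produces a 64-bit result):

  #include <cassert>
  #include <cstdint>
  #include <vector>

  static uint64_t uaddv(const std::vector<uint32_t> &Z, const std::vector<bool> &Pg) {
    uint64_t Acc = 0;  // UADDV widens to a 64-bit accumulator
    for (size_t I = 0; I < Z.size(); ++I)
      if (Pg[I])       // only lanes active in the governing predicate contribute
        Acc += Z[I];
    return Acc;
  }

  int main() {
    std::vector<uint32_t> Z = {10, 20, 30, 40};
    assert(uaddv(Z, {true, true, true, true}) == 100);   // all-true predicate
    assert(uaddv(Z, {true, false, true, false}) == 40);  // partial predicate
    return 0;
  }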
@@ -16333,20 +16349,56 @@ SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
 }
 
-SDValue AArch64TargetLowering::LowerFixedLengthReductionToSVE(unsigned Opcode,
-    SDValue ScalarOp, SelectionDAG &DAG) const {
+SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
+                                                       SelectionDAG &DAG) const {
+  SDLoc DL(ReduceOp);
+  SDValue Op = ReduceOp.getOperand(0);
+  EVT OpVT = Op.getValueType();
+  EVT VT = ReduceOp.getValueType();
+
+  if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
+    return SDValue();
+
+  SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
+
+  switch (ReduceOp.getOpcode()) {
+  default:
+    return SDValue();
+  case ISD::VECREDUCE_OR:
+    return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
+  case ISD::VECREDUCE_AND: {
+    Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
+    return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
+  }
+  case ISD::VECREDUCE_XOR: {
+    SDValue ID =
+        DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
+    SDValue Cntp =
+        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
+    return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
+  }
+  }
+
+  return SDValue();
+}
+
+SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
+                                                   SDValue ScalarOp,
+                                                   SelectionDAG &DAG) const {
   SDLoc DL(ScalarOp);
   SDValue VecOp = ScalarOp.getOperand(0);
   EVT SrcVT = VecOp.getValueType();
 
-  SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
-  EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
-  VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
+  if (useSVEForFixedLengthVectorVT(SrcVT, true)) {
+    EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
+    VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
+  }
 
   // UADDV always returns an i64 result.
   EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
                                                    SrcVT.getVectorElementType();
 
+  SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
   SDValue Rdx = DAG.getNode(Opcode, DL, getPackedSVEVectorVT(ResVT), Pg, VecOp);
   SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
                             Rdx, DAG.getConstant(0, DL, MVT::i64));
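
LowerPredReductionToSVE maps the i1 reductions onto predicate instructions: ORV becomes a
PTEST for any active lane, ANDV inverts the input and PTESTs for no active lane, and EORV
is the parity of CNTP, since XOR-reducing i1 lanes is just the parity of the number of
true lanes. A plain C++ model of those three cases (assumptions mine, illustrative only):

  #include <cassert>
  #include <vector>

  static bool orv(const std::vector<bool> &P) {
    for (bool L : P) if (L) return true;   // PTEST ..., ANY_ACTIVE
    return false;
  }
  static bool andv(const std::vector<bool> &P) {
    for (bool L : P) if (!L) return false; // invert, then PTEST ..., NONE_ACTIVE
    return true;
  }
  static bool eorv(const std::vector<bool> &P) {
    unsigned Cnt = 0;                      // CNTP counts the active lanes
    for (bool L : P) Cnt += L;
    return Cnt & 1;                        // keep only bit 0 (the i1 result)
  }

  int main() {
    std::vector<bool> P = {true, false, true, true};
    assert(orv(P) && !andv(P) && eorv(P)); // 3 set lanes -> parity 1
    return 0;
  }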

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 047dffb85285..0b3365b7f25d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -933,8 +933,9 @@ class AArch64TargetLowering : public TargetLowering {
                                                SelectionDAG &DAG) const;
   SDValue LowerFixedLengthVectorLoadToSVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp, SelectionDAG &DAG) const;
-  SDValue LowerFixedLengthReductionToSVE(unsigned Opcode, SDValue ScalarOp,
-                                         SelectionDAG &DAG) const;
+  SDValue LowerPredReductionToSVE(SDValue ScalarOp, SelectionDAG &DAG) const;
+  SDValue LowerReductionToSVE(unsigned Opcode, SDValue ScalarOp,
+                              SelectionDAG &DAG) const;
   SDValue LowerFixedLengthVectorSelectToSVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFixedLengthVectorSetccToSVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFixedLengthVectorStoreToSVE(SDValue Op, SelectionDAG &DAG) const;

diff --git a/llvm/test/CodeGen/AArch64/sve-int-pred-reduce.ll b/llvm/test/CodeGen/AArch64/sve-int-pred-reduce.ll
new file mode 100644
index 000000000000..a0d956b3115f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-int-pred-reduce.ll
@@ -0,0 +1,375 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+; ANDV
+
+define i1 @reduce_and_nxv16i1(<vscale x 16 x i1> %vec) {
+; CHECK-LABEL: reduce_and_nxv16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    not p0.b, p1/z, p0.b
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.and.i1.nxv16i1(<vscale x 16 x i1> %vec)
+  ret i1 %res
+}
+
+define i1 @reduce_and_nxv8i1(<vscale x 8 x i1> %vec) {
+; CHECK-LABEL: reduce_and_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    not p0.b, p1/z, p0.b
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.and.i1.nxv8i1(<vscale x 8 x i1> %vec)
+  ret i1 %res
+}
+
+define i1 @reduce_and_nxv4i1(<vscale x 4 x i1> %vec) {
+; CHECK-LABEL: reduce_and_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    not p0.b, p1/z, p0.b
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.and.i1.nxv4i1(<vscale x 4 x i1> %vec)
+  ret i1 %res
+}
+
+define i1 @reduce_and_nxv2i1(<vscale x 2 x i1> %vec) {
+; CHECK-LABEL: reduce_and_nxv2i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    not p0.b, p1/z, p0.b
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.and.i1.nxv2i1(<vscale x 2 x i1> %vec)
+  ret i1 %res
+}
+
+; ORV
+
+define i1 @reduce_or_nxv16i1(<vscale x 16 x i1> %vec) {
+; CHECK-LABEL: reduce_or_nxv16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.or.i1.nxv16i1(<vscale x 16 x i1> %vec)
+  ret i1 %res
+}
+
+define i1 @reduce_or_nxv8i1(<vscale x 8 x i1> %vec) {
+; CHECK-LABEL: reduce_or_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.or.i1.nxv8i1(<vscale x 8 x i1> %vec)
+  ret i1 %res
+}
+
+define i1 @reduce_or_nxv4i1(<vscale x 4 x i1> %vec) {
+; CHECK-LABEL: reduce_or_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.or.i1.nxv4i1(<vscale x 4 x i1> %vec)
+  ret i1 %res
+}
+
+define i1 @reduce_or_nxv2i1(<vscale x 2 x i1> %vec) {
+; CHECK-LABEL: reduce_or_nxv2i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.or.i1.nxv2i1(<vscale x 2 x i1> %vec)
+  ret i1 %res
+}
+
+; XORV
+
+define i1 @reduce_xor_nxv16i1(<vscale x 16 x i1> %vec) {
+; CHECK-LABEL: reduce_xor_nxv16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    cntp x8, p1, p0.b
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.xor.i1.nxv16i1(<vscale x 16 x i1> %vec)
+  ret i1 %res
+}
+
+define i1 @reduce_xor_nxv8i1(<vscale x 8 x i1> %vec) {
+; CHECK-LABEL: reduce_xor_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    cntp x8, p1, p0.h
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.xor.i1.nxv8i1(<vscale x 8 x i1> %vec)
+  ret i1 %res
+}
+
+define i1 @reduce_xor_nxv4i1(<vscale x 4 x i1> %vec) {
+; CHECK-LABEL: reduce_xor_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    cntp x8, p1, p0.s
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.xor.i1.nxv4i1(<vscale x 4 x i1> %vec)
+  ret i1 %res
+}
+
+define i1 @reduce_xor_nxv2i1(<vscale x 2 x i1> %vec) {
+; CHECK-LABEL: reduce_xor_nxv2i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    cntp x8, p1, p0.d
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.xor.i1.nxv2i1(<vscale x 2 x i1> %vec)
+  ret i1 %res
+}
+
+; SMAXV
+
+define i1 @reduce_smax_nxv16i1(<vscale x 16 x i1> %vec) {
+; CHECK-LABEL: reduce_smax_nxv16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    not p0.b, p1/z, p0.b
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.smax.i1.nxv16i1(<vscale x 16 x i1> %vec)
+  ret i1 %res
+}
+
+define i1 @reduce_smax_nxv8i1(<vscale x 8 x i1> %vec) {
+; CHECK-LABEL: reduce_smax_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    not p0.b, p1/z, p0.b
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.smax.i1.nxv8i1(<vscale x 8 x i1> %vec)
+  ret i1 %res
+}
+
+define i1 @reduce_smax_nxv4i1(<vscale x 4 x i1> %vec) {
+; CHECK-LABEL: reduce_smax_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    not p0.b, p1/z, p0.b
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.smax.i1.nxv4i1(<vscale x 4 x i1> %vec)
+  ret i1 %res
+}
+
+define i1 @reduce_smax_nxv2i1(<vscale x 2 x i1> %vec) {
+; CHECK-LABEL: reduce_smax_nxv2i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    not p0.b, p1/z, p0.b
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.smax.i1.nxv2i1(<vscale x 2 x i1> %vec)
+  ret i1 %res
+}
+
+; SMINV
+
+define i1 @reduce_smin_nxv16i1(<vscale x 16 x i1> %vec) {
+; CHECK-LABEL: reduce_smin_nxv16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.smin.i1.nxv16i1(<vscale x 16 x i1> %vec)
+  ret i1 %res
+}
+
+define i1 @reduce_smin_nxv8i1(<vscale x 8 x i1> %vec) {
+; CHECK-LABEL: reduce_smin_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.smin.i1.nxv8i1(<vscale x 8 x i1> %vec)
+  ret i1 %res
+}
+
+define i1 @reduce_smin_nxv4i1(<vscale x 4 x i1> %vec) {
+; CHECK-LABEL: reduce_smin_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.smin.i1.nxv4i1(<vscale x 4 x i1> %vec)
+  ret i1 %res
+}
+
+define i1 @reduce_smin_nxv2i1(<vscale x 2 x i1> %vec) {
+; CHECK-LABEL: reduce_smin_nxv2i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.smin.i1.nxv2i1(<vscale x 2 x i1> %vec)
+  ret i1 %res
+}
+
+; UMAXV
+
+define i1 @reduce_umax_nxv16i1(<vscale x 16 x i1> %vec) {
+; CHECK-LABEL: reduce_umax_nxv16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.umax.i1.nxv16i1(<vscale x 16 x i1> %vec)
+  ret i1 %res
+}
+
+define i1 @reduce_umax_nxv8i1(<vscale x 8 x i1> %vec) {
+; CHECK-LABEL: reduce_umax_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.umax.i1.nxv8i1(<vscale x 8 x i1> %vec)
+  ret i1 %res
+}
+
+define i1 @reduce_umax_nxv4i1(<vscale x 4 x i1> %vec) {
+; CHECK-LABEL: reduce_umax_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.umax.i1.nxv4i1(<vscale x 4 x i1> %vec)
+  ret i1 %res
+}
+
+define i1 @reduce_umax_nxv2i1(<vscale x 2 x i1> %vec) {
+; CHECK-LABEL: reduce_umax_nxv2i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.umax.i1.nxv2i1(<vscale x 2 x i1> %vec)
+  ret i1 %res
+}
+
+; UMINV
+
+define i1 @reduce_umin_nxv16i1(<vscale x 16 x i1> %vec) {
+; CHECK-LABEL: reduce_umin_nxv16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.b
+; CHECK-NEXT:    not p0.b, p1/z, p0.b
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.umin.i1.nxv16i1(<vscale x 16 x i1> %vec)
+  ret i1 %res
+}
+
+define i1 @reduce_umin_nxv8i1(<vscale x 8 x i1> %vec) {
+; CHECK-LABEL: reduce_umin_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.h
+; CHECK-NEXT:    not p0.b, p1/z, p0.b
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.umin.i1.nxv8i1(<vscale x 8 x i1> %vec)
+  ret i1 %res
+}
+
+define i1 @reduce_umin_nxv4i1(<vscale x 4 x i1> %vec) {
+; CHECK-LABEL: reduce_umin_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    not p0.b, p1/z, p0.b
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.umin.i1.nxv4i1(<vscale x 4 x i1> %vec)
+  ret i1 %res
+}
+
+define i1 @reduce_umin_nxv2i1(<vscale x 2 x i1> %vec) {
+; CHECK-LABEL: reduce_umin_nxv2i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    not p0.b, p1/z, p0.b
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.umin.i1.nxv2i1(<vscale x 2 x i1> %vec)
+  ret i1 %res
+}
+
+declare i1 @llvm.vector.reduce.and.i1.nxv16i1(<vscale x 16 x i1> %vec)
+declare i1 @llvm.vector.reduce.and.i1.nxv8i1(<vscale x 8 x i1> %vec)
+declare i1 @llvm.vector.reduce.and.i1.nxv4i1(<vscale x 4 x i1> %vec)
+declare i1 @llvm.vector.reduce.and.i1.nxv2i1(<vscale x 2 x i1> %vec)
+
+declare i1 @llvm.vector.reduce.or.i1.nxv16i1(<vscale x 16 x i1> %vec)
+declare i1 @llvm.vector.reduce.or.i1.nxv8i1(<vscale x 8 x i1> %vec)
+declare i1 @llvm.vector.reduce.or.i1.nxv4i1(<vscale x 4 x i1> %vec)
+declare i1 @llvm.vector.reduce.or.i1.nxv2i1(<vscale x 2 x i1> %vec)
+
+declare i1 @llvm.vector.reduce.xor.i1.nxv16i1(<vscale x 16 x i1> %vec)
+declare i1 @llvm.vector.reduce.xor.i1.nxv8i1(<vscale x 8 x i1> %vec)
+declare i1 @llvm.vector.reduce.xor.i1.nxv4i1(<vscale x 4 x i1> %vec)
+declare i1 @llvm.vector.reduce.xor.i1.nxv2i1(<vscale x 2 x i1> %vec)
+
+declare i1 @llvm.vector.reduce.smin.i1.nxv16i1(<vscale x 16 x i1> %vec)
+declare i1 @llvm.vector.reduce.smin.i1.nxv8i1(<vscale x 8 x i1> %vec)
+declare i1 @llvm.vector.reduce.smin.i1.nxv4i1(<vscale x 4 x i1> %vec)
+declare i1 @llvm.vector.reduce.smin.i1.nxv2i1(<vscale x 2 x i1> %vec)
+
+declare i1 @llvm.vector.reduce.smax.i1.nxv16i1(<vscale x 16 x i1> %vec)
+declare i1 @llvm.vector.reduce.smax.i1.nxv8i1(<vscale x 8 x i1> %vec)
+declare i1 @llvm.vector.reduce.smax.i1.nxv4i1(<vscale x 4 x i1> %vec)
+declare i1 @llvm.vector.reduce.smax.i1.nxv2i1(<vscale x 2 x i1> %vec)
+
+declare i1 @llvm.vector.reduce.umin.i1.nxv16i1(<vscale x 16 x i1> %vec)
+declare i1 @llvm.vector.reduce.umin.i1.nxv8i1(<vscale x 8 x i1> %vec)
+declare i1 @llvm.vector.reduce.umin.i1.nxv4i1(<vscale x 4 x i1> %vec)
+declare i1 @llvm.vector.reduce.umin.i1.nxv2i1(<vscale x 2 x i1> %vec)
+
+declare i1 @llvm.vector.reduce.umax.i1.nxv16i1(<vscale x 16 x i1> %vec)
+declare i1 @llvm.vector.reduce.umax.i1.nxv8i1(<vscale x 8 x i1> %vec)
+declare i1 @llvm.vector.reduce.umax.i1.nxv4i1(<vscale x 4 x i1> %vec)
+declare i1 @llvm.vector.reduce.umax.i1.nxv2i1(<vscale x 2 x i1> %vec)

diff --git a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
new file mode 100644
index 000000000000..1dc2dcc8d112
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll
@@ -0,0 +1,417 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+; ANDV
+
+define i8 @andv_nxv16i8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: andv_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    andv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i8 @llvm.vector.reduce.and.nxv16i8(<vscale x 16 x i8> %a)
+  ret i8 %res
+}
+
+define i16 @andv_nxv8i16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: andv_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    andv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i16 @llvm.vector.reduce.and.nxv8i16(<vscale x 8 x i16> %a)
+  ret i16 %res
+}
+
+define i32 @andv_nxv4i32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: andv_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    andv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32> %a)
+  ret i32 %res
+}
+
+define i64 @andv_nxv2i64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: andv_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    andv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.vector.reduce.and.nxv2i64(<vscale x 2 x i64> %a)
+  ret i64 %res
+}
+
+; ORV
+
+define i8 @orv_nxv16i8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: orv_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    orv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i8 @llvm.vector.reduce.or.nxv16i8(<vscale x 16 x i8> %a)
+  ret i8 %res
+}
+
+define i16 @orv_nxv8i16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: orv_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    orv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i16 @llvm.vector.reduce.or.nxv8i16(<vscale x 8 x i16> %a)
+  ret i16 %res
+}
+
+define i32 @orv_nxv4i32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: orv_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    orv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> %a)
+  ret i32 %res
+}
+
+define i64 @orv_nxv2i64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: orv_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    orv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> %a)
+  ret i64 %res
+}
+
+; XORV
+
+define i8 @xorv_nxv16i8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: xorv_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    eorv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i8 @llvm.vector.reduce.xor.nxv16i8(<vscale x 16 x i8> %a)
+  ret i8 %res
+}
+
+define i16 @xorv_nxv8i16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: xorv_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    eorv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i16 @llvm.vector.reduce.xor.nxv8i16(<vscale x 8 x i16> %a)
+  ret i16 %res
+}
+
+define i32 @xorv_nxv4i32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: xorv_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    eorv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> %a)
+  ret i32 %res
+}
+
+define i64 @xorv_nxv2i64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: xorv_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    eorv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.vector.reduce.xor.nxv2i64(<vscale x 2 x i64> %a)
+  ret i64 %res
+}
+
+; UADDV
+
+define i8 @uaddv_nxv16i8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: uaddv_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    uaddv d0, p0, z0.b
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %res = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> %a)
+  ret i8 %res
+}
+
+define i16 @uaddv_nxv8i16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: uaddv_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    uaddv d0, p0, z0.h
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %res = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> %a)
+  ret i16 %res
+}
+
+define i32 @uaddv_nxv4i32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: uaddv_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    uaddv d0, p0, z0.s
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %a)
+  ret i32 %res
+}
+
+define i64 @uaddv_nxv2i64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: uaddv_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uaddv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %a)
+  ret i64 %res
+}
+
+; UMINV
+
+define i8 @umin_nxv16i8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: umin_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    uminv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i8 @llvm.vector.reduce.umin.nxv16i8(<vscale x 16 x i8> %a)
+  ret i8 %res
+}
+
+define i16 @umin_nxv8i16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: umin_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    uminv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i16 @llvm.vector.reduce.umin.nxv8i16(<vscale x 8 x i16> %a)
+  ret i16 %res
+}
+
+define i32 @umin_nxv4i32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: umin_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    uminv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32> %a)
+  ret i32 %res
+}
+
+define i64 @umin_nxv2i64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: umin_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uminv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.vector.reduce.umin.nxv2i64(<vscale x 2 x i64> %a)
+  ret i64 %res
+}
+
+; SMINV
+
+define i8 @smin_nxv16i8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: smin_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    sminv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i8 @llvm.vector.reduce.smin.nxv16i8(<vscale x 16 x i8> %a)
+  ret i8 %res
+}
+
+define i16 @smin_nxv8i16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: smin_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    sminv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i16 @llvm.vector.reduce.smin.nxv8i16(<vscale x 8 x i16> %a)
+  ret i16 %res
+}
+
+define i32 @smin_nxv4i32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: smin_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    sminv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> %a)
+  ret i32 %res
+}
+
+define i64 @smin_nxv2i64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: smin_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sminv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.vector.reduce.smin.nxv2i64(<vscale x 2 x i64> %a)
+  ret i64 %res
+}
+
+; UMAXV
+
+define i8 @umax_nxv16i8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: umax_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    umaxv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i8 @llvm.vector.reduce.umax.nxv16i8(<vscale x 16 x i8> %a)
+  ret i8 %res
+}
+
+define i16 @umax_nxv8i16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: umax_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    umaxv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i16 @llvm.vector.reduce.umax.nxv8i16(<vscale x 8 x i16> %a)
+  ret i16 %res
+}
+
+define i32 @umax_nxv4i32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: umax_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    umaxv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> %a)
+  ret i32 %res
+}
+
+define i64 @umax_nxv2i64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: umax_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    umaxv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.vector.reduce.umax.nxv2i64(<vscale x 2 x i64> %a)
+  ret i64 %res
+}
+
+; SMAXV
+
+define i8 @smax_nxv16i8(<vscale x 16 x i8> %a) {
+; CHECK-LABEL: smax_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    smaxv b0, p0, z0.b
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i8 @llvm.vector.reduce.smax.nxv16i8(<vscale x 16 x i8> %a)
+  ret i8 %res
+}
+
+define i16 @smax_nxv8i16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: smax_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    smaxv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i16 @llvm.vector.reduce.smax.nxv8i16(<vscale x 8 x i16> %a)
+  ret i16 %res
+}
+
+define i32 @smax_nxv4i32(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: smax_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    smaxv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> %a)
+  ret i32 %res
+}
+
+define i64 @smax_nxv2i64(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: smax_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    smaxv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.vector.reduce.smax.nxv2i64(<vscale x 2 x i64> %a)
+  ret i64 %res
+}
+
+declare i8 @llvm.vector.reduce.and.nxv16i8(<vscale x 16 x i8>)
+declare i16 @llvm.vector.reduce.and.nxv8i16(<vscale x 8 x i16>)
+declare i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.and.nxv2i64(<vscale x 2 x i64>)
+
+declare i8 @llvm.vector.reduce.or.nxv16i8(<vscale x 16 x i8>)
+declare i16 @llvm.vector.reduce.or.nxv8i16(<vscale x 8 x i16>)
+declare i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64>)
+
+declare i8 @llvm.vector.reduce.xor.nxv16i8(<vscale x 16 x i8>)
+declare i16 @llvm.vector.reduce.xor.nxv8i16(<vscale x 8 x i16>)
+declare i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.xor.nxv2i64(<vscale x 2 x i64>)
+
+declare i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8>)
+declare i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16>)
+declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64>)
+
+declare i8 @llvm.vector.reduce.umin.nxv16i8(<vscale x 16 x i8>)
+declare i16 @llvm.vector.reduce.umin.nxv8i16(<vscale x 8 x i16>)
+declare i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.umin.nxv2i64(<vscale x 2 x i64>)
+
+declare i8 @llvm.vector.reduce.smin.nxv16i8(<vscale x 16 x i8>)
+declare i16 @llvm.vector.reduce.smin.nxv8i16(<vscale x 8 x i16>)
+declare i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.smin.nxv2i64(<vscale x 2 x i64>)
+
+declare i8 @llvm.vector.reduce.umax.nxv16i8(<vscale x 16 x i8>)
+declare i16 @llvm.vector.reduce.umax.nxv8i16(<vscale x 8 x i16>)
+declare i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.umax.nxv2i64(<vscale x 2 x i64>)
+
+declare i8 @llvm.vector.reduce.smax.nxv16i8(<vscale x 16 x i8>)
+declare i16 @llvm.vector.reduce.smax.nxv8i16(<vscale x 8 x i16>)
+declare i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.smax.nxv2i64(<vscale x 2 x i64>)

diff --git a/llvm/test/CodeGen/AArch64/sve-split-int-pred-reduce.ll b/llvm/test/CodeGen/AArch64/sve-split-int-pred-reduce.ll
new file mode 100644
index 000000000000..3d33e8946a9b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-split-int-pred-reduce.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+; ANDV
+
+define i1 @andv_nxv32i1(<vscale x 32 x i1> %a) {
+; CHECK-LABEL: andv_nxv32i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p2.b
+; CHECK-NEXT:    and p0.b, p2/z, p0.b, p1.b
+; CHECK-NEXT:    not p0.b, p2/z, p0.b
+; CHECK-NEXT:    ptest p2, p0.b
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.and.nxv32i1(<vscale x 32 x i1> %a)
+  ret i1 %res
+}
+
+define i1 @andv_nxv64i1(<vscale x 64 x i1> %a) {
+; CHECK-LABEL: andv_nxv64i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    ptrue p4.b
+; CHECK-NEXT:    and p1.b, p4/z, p1.b, p3.b
+; CHECK-NEXT:    and p0.b, p4/z, p0.b, p2.b
+; CHECK-NEXT:    and p0.b, p4/z, p0.b, p1.b
+; CHECK-NEXT:    not p0.b, p4/z, p0.b
+; CHECK-NEXT:    ptest p4, p0.b
+; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.and.nxv64i1(<vscale x 64 x i1> %a)
+  ret i1 %res
+}
+
+; ORV
+
+define i1 @orv_nxv32i1(<vscale x 32 x i1> %a) {
+; CHECK-LABEL: orv_nxv32i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p2.b
+; CHECK-NEXT:    orr p0.b, p2/z, p0.b, p1.b
+; CHECK-NEXT:    ptest p2, p0.b
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.or.nxv32i1(<vscale x 32 x i1> %a)
+  ret i1 %res
+}
+
+; XORV
+
+define i1 @xorv_nxv32i1(<vscale x 32 x i1> %a) {
+; CHECK-LABEL: xorv_nxv32i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p2.b
+; CHECK-NEXT:    eor p0.b, p2/z, p0.b, p1.b
+; CHECK-NEXT:    cntp x8, p2, p0.b
+; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.xor.nxv32i1(<vscale x 32 x i1> %a)
+  ret i1 %res
+}
+
+; SMAXV
+
+define i1 @smaxv_nxv32i1(<vscale x 32 x i1> %a) {
+; CHECK-LABEL: smaxv_nxv32i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p2.b
+; CHECK-NEXT:    and p0.b, p2/z, p0.b, p1.b
+; CHECK-NEXT:    not p0.b, p2/z, p0.b
+; CHECK-NEXT:    ptest p2, p0.b
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.smax.nxv32i1(<vscale x 32 x i1> %a)
+  ret i1 %res
+}
+
+; SMINV
+
+define i1 @sminv_nxv32i1(<vscale x 32 x i1> %a) {
+; CHECK-LABEL: sminv_nxv32i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p2.b
+; CHECK-NEXT:    orr p0.b, p2/z, p0.b, p1.b
+; CHECK-NEXT:    ptest p2, p0.b
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.smin.nxv32i1(<vscale x 32 x i1> %a)
+  ret i1 %res
+}
+
+; UMAXV
+
+define i1 @umaxv_nxv32i1(<vscale x 32 x i1> %a) {
+; CHECK-LABEL: umaxv_nxv32i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p2.b
+; CHECK-NEXT:    orr p0.b, p2/z, p0.b, p1.b
+; CHECK-NEXT:    ptest p2, p0.b
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.umax.nxv32i1(<vscale x 32 x i1> %a)
+  ret i1 %res
+}
+
+; UMINV
+
+define i1 @uminv_nxv32i1(<vscale x 32 x i1> %a) {
+; CHECK-LABEL: uminv_nxv32i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p2.b
+; CHECK-NEXT:    and p0.b, p2/z, p0.b, p1.b
+; CHECK-NEXT:    not p0.b, p2/z, p0.b
+; CHECK-NEXT:    ptest p2, p0.b
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %res = call i1 @llvm.vector.reduce.umin.nxv32i1(<vscale x 32 x i1> %a)
+  ret i1 %res
+}
+
+declare i1 @llvm.vector.reduce.and.nxv32i1(<vscale x 32 x i1>)
+declare i1 @llvm.vector.reduce.and.nxv64i1(<vscale x 64 x i1>)
+
+declare i1 @llvm.vector.reduce.or.nxv32i1(<vscale x 32 x i1>)
+
+declare i1 @llvm.vector.reduce.xor.nxv32i1(<vscale x 32 x i1>)
+
+declare i1 @llvm.vector.reduce.smax.nxv32i1(<vscale x 32 x i1>)
+
+declare i1 @llvm.vector.reduce.smin.nxv32i1(<vscale x 32 x i1>)
+
+declare i1 @llvm.vector.reduce.umax.nxv32i1(<vscale x 32 x i1>)
+
+declare i1 @llvm.vector.reduce.umin.nxv32i1(<vscale x 32 x i1>)

diff --git a/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll
new file mode 100644
index 000000000000..06a7dd3e329e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll
@@ -0,0 +1,233 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+; ANDV
+
+define i8 @andv_nxv8i8(<vscale x 8 x i8> %a) {
+; CHECK-LABEL: andv_nxv8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    andv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i8 @llvm.vector.reduce.and.nxv8i8(<vscale x 8 x i8> %a)
+  ret i8 %res
+}
+
+define i32 @andv_nxv8i32(<vscale x 8 x i32> %a) {
+; CHECK-LABEL: andv_nxv8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and z0.d, z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    andv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> %a)
+  ret i32 %res
+}
+
+; ORV
+
+define i32 @orv_nxv2i32(<vscale x 2 x i32> %a) {
+; CHECK-LABEL: orv_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    orv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.or.nxv2i32(<vscale x 2 x i32> %a)
+  ret i32 %res
+}
+
+define i64 @orv_nxv8i64(<vscale x 8 x i64> %a) {
+; CHECK-LABEL: orv_nxv8i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    orr z1.d, z1.d, z3.d
+; CHECK-NEXT:    orr z0.d, z0.d, z2.d
+; CHECK-NEXT:    orr z0.d, z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    orv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.vector.reduce.or.nxv8i64(<vscale x 8 x i64> %a)
+  ret i64 %res
+}
+
+; XORV
+
+define i16 @xorv_nxv2i16(<vscale x 2 x i16> %a) {
+; CHECK-LABEL: xorv_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    eorv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %res = call i16 @llvm.vector.reduce.xor.nxv2i16(<vscale x 2 x i16> %a)
+  ret i16 %res
+}
+
+define i32 @xorv_nxv8i32(<vscale x 8 x i32> %a) {
+; CHECK-LABEL: xorv_nxv8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    eor z0.d, z0.d, z1.d
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    eorv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> %a)
+  ret i32 %res
+}
+
+; UADDV
+
+define i16 @uaddv_nxv4i16(<vscale x 4 x i16> %a) {
+; CHECK-LABEL: uaddv_nxv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    uaddv d0, p0, z0.s
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %res = call i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16> %a)
+  ret i16 %res
+}
+
+define i16 @uaddv_nxv16i16(<vscale x 16 x i16> %a) {
+; CHECK-LABEL: uaddv_nxv16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add z0.h, z0.h, z1.h
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    uaddv d0, p0, z0.h
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %res = call i16 @llvm.vector.reduce.add.nxv16i16(<vscale x 16 x i16> %a)
+  ret i16 %res
+}
+
+define i32 @uaddv_nxv16i32(<vscale x 16 x i32> %a) {
+; CHECK-LABEL: uaddv_nxv16i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add z1.s, z1.s, z3.s
+; CHECK-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    uaddv d0, p0, z0.s
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %a)
+  ret i32 %res
+}
+
+; UMINV
+
+define i32 @umin_nxv2i32(<vscale x 2 x i32> %a) {
+; CHECK-LABEL: umin_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
+; CHECK-NEXT:    uminv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.umin.nxv2i32(<vscale x 2 x i32> %a)
+  ret i32 %res
+}
+
+define i64 @umin_nxv4i64(<vscale x 4 x i64> %a) {
+; CHECK-LABEL: umin_nxv4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    umin z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    uminv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.vector.reduce.umin.nxv4i64(<vscale x 4 x i64> %a)
+  ret i64 %res
+}
+
+; SMINV
+
+define i8 @smin_nxv4i8(<vscale x 4 x i8> %a) {
+; CHECK-LABEL: smin_nxv4i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    sxtb z0.s, p0/m, z0.s
+; CHECK-NEXT:    sminv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i8 @llvm.vector.reduce.smin.nxv4i8(<vscale x 4 x i8> %a)
+  ret i8 %res
+}
+
+define i32 @smin_nxv8i32(<vscale x 8 x i32> %a) {
+; CHECK-LABEL: smin_nxv8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    smin z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    sminv s0, p0, z0.s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> %a)
+  ret i32 %res
+}
+
+; UMAXV
+
+define i16 @smin_nxv16i16(<vscale x 16 x i16> %a) {
+; CHECK-LABEL: smin_nxv16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    umax z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT:    umaxv h0, p0, z0.h
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+  %res = call i16 @llvm.vector.reduce.umax.nxv16i16(<vscale x 16 x i16> %a)
+  ret i16 %res
+}
+
+; SMAXV
+
+define i64 @smin_nxv8i64(<vscale x 8 x i64> %a) {
+; CHECK-LABEL: smin_nxv8i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    smax z1.d, p0/m, z1.d, z3.d
+; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    smax z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT:    smaxv d0, p0, z0.d
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+  %res = call i64 @llvm.vector.reduce.smax.nxv8i64(<vscale x 8 x i64> %a)
+  ret i64 %res
+}
+
+declare i8 @llvm.vector.reduce.and.nxv8i8(<vscale x 8 x i8>)
+declare i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32>)
+
+declare i32 @llvm.vector.reduce.or.nxv2i32(<vscale x 2 x i32>)
+declare i64 @llvm.vector.reduce.or.nxv8i64(<vscale x 8 x i64>)
+
+declare i16 @llvm.vector.reduce.xor.nxv2i16(<vscale x 2 x i16>)
+declare i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32>)
+
+declare i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16>)
+declare i16 @llvm.vector.reduce.add.nxv16i16(<vscale x 16 x i16>)
+declare i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32>)
+
+declare i32 @llvm.vector.reduce.umin.nxv2i32(<vscale x 2 x i32>)
+declare i64 @llvm.vector.reduce.umin.nxv4i64(<vscale x 4 x i64>)
+
+declare i8 @llvm.vector.reduce.smin.nxv4i8(<vscale x 4 x i8>)
+declare i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32>)
+
+declare i16 @llvm.vector.reduce.umax.nxv16i16(<vscale x 16 x i16>)
+
+declare i64 @llvm.vector.reduce.smax.nxv8i64(<vscale x 8 x i64>)

diff --git a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
index 811745a1ee41..cd60dd3ab96d 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
@@ -142,10 +142,14 @@ define i32 @test_v3i32(<3 x i32> %a) nounwind {
 define i1 @test_v4i1(<4 x i1> %a) nounwind {
 ; CHECK-LABEL: test_v4i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.4h, #1
-; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    umaxv h0, v0.4h
-; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    umov w10, v0.h[1]
+; CHECK-NEXT:    umov w11, v0.h[0]
+; CHECK-NEXT:    umov w9, v0.h[2]
+; CHECK-NEXT:    orr w10, w11, w10
+; CHECK-NEXT:    umov w8, v0.h[3]
+; CHECK-NEXT:    orr w9, w10, w9
+; CHECK-NEXT:    orr w8, w9, w8
 ; CHECK-NEXT:    and w0, w8, #0x1
 ; CHECK-NEXT:    ret
   %b = call i1 @llvm.vector.reduce.umax.v4i1(<4 x i1> %a)


        

