[llvm] 96bb281 - [AArch64] Prevent unnecessary truncation in bool vector reduce code generation (#120096)

via llvm-commits llvm-commits at lists.llvm.org
Wed Dec 18 01:14:16 PST 2024


Author: Csanád Hajdú
Date: 2024-12-18T09:14:12Z
New Revision: 96bb281b636a30f5896c48035cca55807f105a56

URL: https://github.com/llvm/llvm-project/commit/96bb281b636a30f5896c48035cca55807f105a56
DIFF: https://github.com/llvm/llvm-project/commit/96bb281b636a30f5896c48035cca55807f105a56.diff

LOG: [AArch64] Prevent unnecessary truncation in bool vector reduce code generation (#120096)

Prevent unnecessarily truncating results of 128 bit wide vector
comparisons to 64 bit wide vector values in boolean vector reduce
operations.

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
    llvm/test/CodeGen/AArch64/vecreduce-bool.ll
    llvm/test/CodeGen/AArch64/vector-extract-last-active.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 28f304100326c6..cb6ba06bd4425c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15928,17 +15928,32 @@ static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
       return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
     }
 
-    // Vectors that are less than 64 bits get widened to neatly fit a 64 bit
-    // register, so e.g. <4 x i1> gets lowered to <4 x i16>. Sign extending to
-    // this element size leads to the best codegen, since e.g. setcc results
-    // might need to be truncated otherwise.
-    EVT ExtendedVT = MVT::getIntegerVT(std::max(64u / NumElems, 8u));
+    // Results of setcc operations get widened to 128 bits if their input
+    // operands are 128 bits wide, otherwise vectors that are less than 64 bits
+    // get widened to neatly fit a 64 bit register, so e.g. <4 x i1> gets
+    // lowered to either <4 x i16> or <4 x i32>. Sign extending to this element
+    // size leads to the best codegen, since e.g. setcc results might need to be
+    // truncated otherwise.
+    unsigned ExtendedWidth = 64;
+    if (Vec.getOpcode() == ISD::SETCC &&
+        Vec.getOperand(0).getValueSizeInBits() >= 128) {
+      ExtendedWidth = 128;
+    }
+    EVT ExtendedVT = MVT::getIntegerVT(std::max(ExtendedWidth / NumElems, 8u));
 
     // any_ext doesn't work with umin/umax, so only use it for uadd.
     unsigned ExtendOp =
         ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
     SDValue Extended = DAG.getNode(
         ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
+    // The uminp/uminv and umaxp/umaxv instructions don't have .2d variants, so
+    // in that case we bitcast the sign extended values from v2i64 to v4i32
+    // before reduction for optimal code generation.
+    if ((ScalarOpcode == ISD::AND || ScalarOpcode == ISD::OR) &&
+        NumElems == 2 && ExtendedWidth == 128) {
+      Extended = DAG.getBitcast(MVT::v4i32, Extended);
+      ExtendedVT = MVT::i32;
+    }
     switch (ScalarOpcode) {
     case ISD::AND:
       Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);

diff --git a/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll b/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
index 767ca91a58bb10..f317a7b8083421 100644
--- a/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
+++ b/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
@@ -12,8 +12,7 @@ define i1 @unordered_floating_point_compare_on_v8f32(<8 x float> %a_vec) {
 ; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    mvn v0.16b, v0.16b
-; CHECK-NEXT:    xtn v0.8b, v0.8h
-; CHECK-NEXT:    umaxv b0, v0.8b
+; CHECK-NEXT:    umaxv h0, v0.8h
 ; CHECK-NEXT:    fmov w9, s0
 ; CHECK-NEXT:    bic w0, w8, w9
 ; CHECK-NEXT:    ret

diff --git a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
index 58020d28702b2f..625e8ae6a98dc2 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
@@ -15,8 +15,15 @@ declare i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a)
 declare i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a)
 declare i1 @llvm.vector.reduce.or.v32i1(<32 x i1> %a)
 
-define i32 @reduce_and_v1(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v1:
+declare i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %a)
+declare i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %a)
+declare i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %a)
+declare i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %a)
+declare i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %a)
+declare i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> %a)
+
+define i32 @reduce_and_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v1i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    smov w8, v0.b[0]
@@ -29,8 +36,8 @@ define i32 @reduce_and_v1(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_and_v2(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v2:
+define i32 @reduce_and_v2i8(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v2i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    shl v0.2s, v0.2s, #24
 ; CHECK-NEXT:    sshr v0.2s, v0.2s, #24
@@ -46,8 +53,8 @@ define i32 @reduce_and_v2(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_and_v4(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v4:
+define i32 @reduce_and_v4i8(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v4i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-NEXT:    sshr v0.4h, v0.4h, #8
@@ -63,8 +70,8 @@ define i32 @reduce_and_v4(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_and_v8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v8:
+define i32 @reduce_and_v8i8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v8i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmlt v0.8b, v0.8b, #0
 ; CHECK-NEXT:    uminv b0, v0.8b
@@ -78,8 +85,8 @@ define i32 @reduce_and_v8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_and_v16(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v16:
+define i32 @reduce_and_v16i8(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
 ; CHECK-NEXT:    uminv b0, v0.16b
@@ -93,8 +100,8 @@ define i32 @reduce_and_v16(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_and_v32(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v32:
+define i32 @reduce_and_v32i8(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v32i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
@@ -109,8 +116,193 @@ define i32 @reduce_and_v32(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_or_v1(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_or_v1:
+define i32 @reduce_and_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v1i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    smov w8, v0.h[0]
+; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+  %x = icmp slt <1 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v2i16(<2 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-NEXT:    sshr v0.2s, v0.2s, #16
+; CHECK-NEXT:    cmlt v0.2s, v0.2s, #0
+; CHECK-NEXT:    uminp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <2 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v4i16(<4 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.4h, v0.4h, #0
+; CHECK-NEXT:    uminv h0, v0.4h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <4 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v8i16(<8 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    uminv h0, v0.8h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <8 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v16i16(<16 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v1.8h, v1.8h, #0
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    uminv b0, v0.16b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <16 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v1i32(<1 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v1i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+  %x = icmp slt <1 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v2i32(<2 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.2s, v0.2s, #0
+; CHECK-NEXT:    uminp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <2 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v4i32(<4 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT:    uminv s0, v0.4s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <4 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v8i32(<8 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v1.4s, v1.4s, #0
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    uminv h0, v0.8h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <8 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v1i64(<1 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+  %x = icmp slt <1 x i64> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v2i64(<2 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT:    uminv s0, v0.4s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <2 x i64> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_and_v4i64(<4 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v1.2d, v1.2d, #0
+; CHECK-NEXT:    cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    uminv s0, v0.4s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <4 x i64> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v1i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    smov w8, v0.b[0]
@@ -123,8 +315,8 @@ define i32 @reduce_or_v1(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_or_v2(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_or_v2:
+define i32 @reduce_or_v2i8(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v2i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    shl v0.2s, v0.2s, #24
 ; CHECK-NEXT:    sshr v0.2s, v0.2s, #24
@@ -140,8 +332,8 @@ define i32 @reduce_or_v2(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_or_v4(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_or_v4:
+define i32 @reduce_or_v4i8(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v4i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-NEXT:    sshr v0.4h, v0.4h, #8
@@ -157,8 +349,8 @@ define i32 @reduce_or_v4(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_or_v8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_or_v8:
+define i32 @reduce_or_v8i8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v8i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmlt v0.8b, v0.8b, #0
 ; CHECK-NEXT:    umaxv b0, v0.8b
@@ -172,8 +364,8 @@ define i32 @reduce_or_v8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_or_v16(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_or_v16:
+define i32 @reduce_or_v16i8(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
 ; CHECK-NEXT:    umaxv b0, v0.16b
@@ -187,8 +379,8 @@ define i32 @reduce_or_v16(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   ret i32 %z
 }
 
-define i32 @reduce_or_v32(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_or_v32:
+define i32 @reduce_or_v32i8(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v32i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
@@ -202,3 +394,468 @@ define i32 @reduce_or_v32(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
   %z = select i1 %y, i32 %a1, i32 %a2
   ret i32 %z
 }
+
+define i32 @reduce_or_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v1i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    smov w8, v0.h[0]
+; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+  %x = icmp slt <1 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v2i16(<2 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-NEXT:    sshr v0.2s, v0.2s, #16
+; CHECK-NEXT:    cmlt v0.2s, v0.2s, #0
+; CHECK-NEXT:    umaxp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <2 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v4i16(<4 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.4h, v0.4h, #0
+; CHECK-NEXT:    umaxv h0, v0.4h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <4 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v8i16(<8 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    umaxv h0, v0.8h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <8 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v16i16(<16 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v1.8h, v1.8h, #0
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    umaxv b0, v0.16b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <16 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v1i32(<1 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v1i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+  %x = icmp slt <1 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v2i32(<2 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.2s, v0.2s, #0
+; CHECK-NEXT:    umaxp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <2 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v4i32(<4 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT:    umaxv s0, v0.4s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <4 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v8i32(<8 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v1.4s, v1.4s, #0
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    umaxv h0, v0.8h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <8 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v1i64(<1 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+  %x = icmp slt <1 x i64> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v2i64(<2 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT:    umaxv s0, v0.4s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <2 x i64> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_or_v4i64(<4 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v1.2d, v1.2d, #0
+; CHECK-NEXT:    cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    umaxv s0, v0.4s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <4 x i64> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v1i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    smov w8, v0.b[0]
+; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+  %x = icmp slt <1 x i8> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v2i8(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v0.2s, v0.2s, #24
+; CHECK-NEXT:    sshr v0.2s, v0.2s, #24
+; CHECK-NEXT:    cmlt v0.2s, v0.2s, #0
+; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <2 x i8> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v4i8(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v4i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v0.4h, v0.4h, #8
+; CHECK-NEXT:    sshr v0.4h, v0.4h, #8
+; CHECK-NEXT:    cmlt v0.4h, v0.4h, #0
+; CHECK-NEXT:    addv h0, v0.4h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <4 x i8> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v8i8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.8b, v0.8b, #0
+; CHECK-NEXT:    addv b0, v0.8b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <8 x i8> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v16i8(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
+; CHECK-NEXT:    addv b0, v0.16b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <16 x i8> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v32i8(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v32i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v1.16b, v1.16b, #0
+; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
+; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    addv b0, v0.16b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <32 x i8> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v1i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    smov w8, v0.h[0]
+; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+  %x = icmp slt <1 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v2i16(<2 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    shl v0.2s, v0.2s, #16
+; CHECK-NEXT:    sshr v0.2s, v0.2s, #16
+; CHECK-NEXT:    cmlt v0.2s, v0.2s, #0
+; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <2 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v4i16(<4 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.4h, v0.4h, #0
+; CHECK-NEXT:    addv h0, v0.4h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <4 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v8i16(<8 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    addv h0, v0.8h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <8 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v16i16(<16 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v1.8h, v1.8h, #0
+; CHECK-NEXT:    cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    addv b0, v0.16b
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <16 x i16> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v1i32(<1 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v1i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    cmp w8, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+  %x = icmp slt <1 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v2i32(<2 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.2s, v0.2s, #0
+; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <2 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v4i32(<4 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <4 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v8i32(<8 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v1.4s, v1.4s, #0
+; CHECK-NEXT:    cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    addv h0, v0.8h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <8 x i32> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v1i64(<1 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v1i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    cmp x8, #0
+; CHECK-NEXT:    csel w0, w0, w1, lt
+; CHECK-NEXT:    ret
+  %x = icmp slt <1 x i64> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v2i64(<2 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT:    addp d0, v0.2d
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <2 x i64> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}
+
+define i32 @reduce_xor_v4i64(<4 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmlt v1.2d, v1.2d, #0
+; CHECK-NEXT:    cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    tst w8, #0x1
+; CHECK-NEXT:    csel w0, w0, w1, ne
+; CHECK-NEXT:    ret
+  %x = icmp slt <4 x i64> %a0, zeroinitializer
+  %y = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %x)
+  %z = select i1 %y, i32 %a1, i32 %a2
+  ret i32 %z
+}

diff --git a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
index c0f1720e1cf8b3..5212acc6fca0f4 100644
--- a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
+++ b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
@@ -58,11 +58,11 @@ define i16 @extract_last_i16(<8 x i16> %data, <8 x i16> %mask, i16 %passthru) {
 ; NEON-FIXED-NEXT:    cmtst v1.8h, v1.8h, v1.8h
 ; NEON-FIXED-NEXT:    adrp x8, .LCPI1_0
 ; NEON-FIXED-NEXT:    mov x9, sp
-; NEON-FIXED-NEXT:    ldr d2, [x8, :lo12:.LCPI1_0]
+; NEON-FIXED-NEXT:    ldr d3, [x8, :lo12:.LCPI1_0]
 ; NEON-FIXED-NEXT:    str q0, [sp]
-; NEON-FIXED-NEXT:    xtn v1.8b, v1.8h
-; NEON-FIXED-NEXT:    and v2.8b, v1.8b, v2.8b
-; NEON-FIXED-NEXT:    umaxv b1, v1.8b
+; NEON-FIXED-NEXT:    xtn v2.8b, v1.8h
+; NEON-FIXED-NEXT:    umaxv h1, v1.8h
+; NEON-FIXED-NEXT:    and v2.8b, v2.8b, v3.8b
 ; NEON-FIXED-NEXT:    umaxv b2, v2.8b
 ; NEON-FIXED-NEXT:    fmov w8, s2
 ; NEON-FIXED-NEXT:    bfi x9, x8, #1, #3
@@ -78,12 +78,12 @@ define i16 @extract_last_i16(<8 x i16> %data, <8 x i16> %mask, i16 %passthru) {
 ; SVE-FIXED-NEXT:    sub sp, sp, #16
 ; SVE-FIXED-NEXT:    .cfi_def_cfa_offset 16
 ; SVE-FIXED-NEXT:    cmtst v1.8h, v1.8h, v1.8h
-; SVE-FIXED-NEXT:    index z2.b, #0, #1
+; SVE-FIXED-NEXT:    index z3.b, #0, #1
 ; SVE-FIXED-NEXT:    mov x9, sp
 ; SVE-FIXED-NEXT:    str q0, [sp]
-; SVE-FIXED-NEXT:    xtn v1.8b, v1.8h
-; SVE-FIXED-NEXT:    and v2.8b, v1.8b, v2.8b
-; SVE-FIXED-NEXT:    umaxv b1, v1.8b
+; SVE-FIXED-NEXT:    xtn v2.8b, v1.8h
+; SVE-FIXED-NEXT:    umaxv h1, v1.8h
+; SVE-FIXED-NEXT:    and v2.8b, v2.8b, v3.8b
 ; SVE-FIXED-NEXT:    umaxv b2, v2.8b
 ; SVE-FIXED-NEXT:    fmov w8, s2
 ; SVE-FIXED-NEXT:    bfi x9, x8, #1, #3
@@ -106,11 +106,11 @@ define i32 @extract_last_i32(<4 x i32> %data, <4 x i32> %mask, i32 %passthru) {
 ; NEON-FIXED-NEXT:    cmtst v1.4s, v1.4s, v1.4s
 ; NEON-FIXED-NEXT:    adrp x8, .LCPI2_0
 ; NEON-FIXED-NEXT:    mov x9, sp
-; NEON-FIXED-NEXT:    ldr d2, [x8, :lo12:.LCPI2_0]
+; NEON-FIXED-NEXT:    ldr d3, [x8, :lo12:.LCPI2_0]
 ; NEON-FIXED-NEXT:    str q0, [sp]
-; NEON-FIXED-NEXT:    xtn v1.4h, v1.4s
-; NEON-FIXED-NEXT:    and v2.8b, v1.8b, v2.8b
-; NEON-FIXED-NEXT:    umaxv h1, v1.4h
+; NEON-FIXED-NEXT:    xtn v2.4h, v1.4s
+; NEON-FIXED-NEXT:    umaxv s1, v1.4s
+; NEON-FIXED-NEXT:    and v2.8b, v2.8b, v3.8b
 ; NEON-FIXED-NEXT:    umaxv h2, v2.4h
 ; NEON-FIXED-NEXT:    fmov w8, s2
 ; NEON-FIXED-NEXT:    bfi x9, x8, #2, #2
@@ -126,12 +126,12 @@ define i32 @extract_last_i32(<4 x i32> %data, <4 x i32> %mask, i32 %passthru) {
 ; SVE-FIXED-NEXT:    sub sp, sp, #16
 ; SVE-FIXED-NEXT:    .cfi_def_cfa_offset 16
 ; SVE-FIXED-NEXT:    cmtst v1.4s, v1.4s, v1.4s
-; SVE-FIXED-NEXT:    index z2.h, #0, #1
+; SVE-FIXED-NEXT:    index z3.h, #0, #1
 ; SVE-FIXED-NEXT:    mov x9, sp
 ; SVE-FIXED-NEXT:    str q0, [sp]
-; SVE-FIXED-NEXT:    xtn v1.4h, v1.4s
-; SVE-FIXED-NEXT:    and v2.8b, v1.8b, v2.8b
-; SVE-FIXED-NEXT:    umaxv h1, v1.4h
+; SVE-FIXED-NEXT:    xtn v2.4h, v1.4s
+; SVE-FIXED-NEXT:    umaxv s1, v1.4s
+; SVE-FIXED-NEXT:    and v2.8b, v2.8b, v3.8b
 ; SVE-FIXED-NEXT:    umaxv h2, v2.4h
 ; SVE-FIXED-NEXT:    fmov w8, s2
 ; SVE-FIXED-NEXT:    bfi x9, x8, #2, #2
@@ -154,11 +154,11 @@ define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) {
 ; NEON-FIXED-NEXT:    cmtst v1.2d, v1.2d, v1.2d
 ; NEON-FIXED-NEXT:    adrp x8, .LCPI3_0
 ; NEON-FIXED-NEXT:    mov x9, sp
-; NEON-FIXED-NEXT:    ldr d2, [x8, :lo12:.LCPI3_0]
+; NEON-FIXED-NEXT:    ldr d3, [x8, :lo12:.LCPI3_0]
 ; NEON-FIXED-NEXT:    str q0, [sp]
-; NEON-FIXED-NEXT:    xtn v1.2s, v1.2d
-; NEON-FIXED-NEXT:    and v2.8b, v1.8b, v2.8b
-; NEON-FIXED-NEXT:    umaxp v1.2s, v1.2s, v1.2s
+; NEON-FIXED-NEXT:    xtn v2.2s, v1.2d
+; NEON-FIXED-NEXT:    umaxv s1, v1.4s
+; NEON-FIXED-NEXT:    and v2.8b, v2.8b, v3.8b
 ; NEON-FIXED-NEXT:    umaxp v2.2s, v2.2s, v2.2s
 ; NEON-FIXED-NEXT:    fmov w8, s2
 ; NEON-FIXED-NEXT:    bfi x9, x8, #3, #1
@@ -174,12 +174,12 @@ define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) {
 ; SVE-FIXED-NEXT:    sub sp, sp, #16
 ; SVE-FIXED-NEXT:    .cfi_def_cfa_offset 16
 ; SVE-FIXED-NEXT:    cmtst v1.2d, v1.2d, v1.2d
-; SVE-FIXED-NEXT:    index z2.s, #0, #1
+; SVE-FIXED-NEXT:    index z3.s, #0, #1
 ; SVE-FIXED-NEXT:    mov x9, sp
 ; SVE-FIXED-NEXT:    str q0, [sp]
-; SVE-FIXED-NEXT:    xtn v1.2s, v1.2d
-; SVE-FIXED-NEXT:    and v2.8b, v1.8b, v2.8b
-; SVE-FIXED-NEXT:    umaxp v1.2s, v1.2s, v1.2s
+; SVE-FIXED-NEXT:    xtn v2.2s, v1.2d
+; SVE-FIXED-NEXT:    umaxv s1, v1.4s
+; SVE-FIXED-NEXT:    and v2.8b, v2.8b, v3.8b
 ; SVE-FIXED-NEXT:    umaxp v2.2s, v2.2s, v2.2s
 ; SVE-FIXED-NEXT:    fmov w8, s2
 ; SVE-FIXED-NEXT:    bfi x9, x8, #3, #1
@@ -202,11 +202,11 @@ define float @extract_last_float(<4 x float> %data, <4 x i32> %mask, float %pass
 ; NEON-FIXED-NEXT:    cmtst v1.4s, v1.4s, v1.4s
 ; NEON-FIXED-NEXT:    adrp x8, .LCPI4_0
 ; NEON-FIXED-NEXT:    mov x9, sp
-; NEON-FIXED-NEXT:    ldr d3, [x8, :lo12:.LCPI4_0]
+; NEON-FIXED-NEXT:    ldr d4, [x8, :lo12:.LCPI4_0]
 ; NEON-FIXED-NEXT:    str q0, [sp]
-; NEON-FIXED-NEXT:    xtn v1.4h, v1.4s
-; NEON-FIXED-NEXT:    and v3.8b, v1.8b, v3.8b
-; NEON-FIXED-NEXT:    umaxv h1, v1.4h
+; NEON-FIXED-NEXT:    xtn v3.4h, v1.4s
+; NEON-FIXED-NEXT:    umaxv s1, v1.4s
+; NEON-FIXED-NEXT:    and v3.8b, v3.8b, v4.8b
 ; NEON-FIXED-NEXT:    umaxv h3, v3.4h
 ; NEON-FIXED-NEXT:    fmov w8, s3
 ; NEON-FIXED-NEXT:    bfi x9, x8, #2, #2
@@ -222,12 +222,12 @@ define float @extract_last_float(<4 x float> %data, <4 x i32> %mask, float %pass
 ; SVE-FIXED-NEXT:    sub sp, sp, #16
 ; SVE-FIXED-NEXT:    .cfi_def_cfa_offset 16
 ; SVE-FIXED-NEXT:    cmtst v1.4s, v1.4s, v1.4s
-; SVE-FIXED-NEXT:    index z3.h, #0, #1
+; SVE-FIXED-NEXT:    index z4.h, #0, #1
 ; SVE-FIXED-NEXT:    mov x9, sp
 ; SVE-FIXED-NEXT:    str q0, [sp]
-; SVE-FIXED-NEXT:    xtn v1.4h, v1.4s
-; SVE-FIXED-NEXT:    and v3.8b, v1.8b, v3.8b
-; SVE-FIXED-NEXT:    umaxv h1, v1.4h
+; SVE-FIXED-NEXT:    xtn v3.4h, v1.4s
+; SVE-FIXED-NEXT:    umaxv s1, v1.4s
+; SVE-FIXED-NEXT:    and v3.8b, v3.8b, v4.8b
 ; SVE-FIXED-NEXT:    umaxv h3, v3.4h
 ; SVE-FIXED-NEXT:    fmov w8, s3
 ; SVE-FIXED-NEXT:    bfi x9, x8, #2, #2
@@ -250,11 +250,11 @@ define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double %
 ; NEON-FIXED-NEXT:    cmtst v1.2d, v1.2d, v1.2d
 ; NEON-FIXED-NEXT:    adrp x8, .LCPI5_0
 ; NEON-FIXED-NEXT:    mov x9, sp
-; NEON-FIXED-NEXT:    ldr d3, [x8, :lo12:.LCPI5_0]
+; NEON-FIXED-NEXT:    ldr d4, [x8, :lo12:.LCPI5_0]
 ; NEON-FIXED-NEXT:    str q0, [sp]
-; NEON-FIXED-NEXT:    xtn v1.2s, v1.2d
-; NEON-FIXED-NEXT:    and v3.8b, v1.8b, v3.8b
-; NEON-FIXED-NEXT:    umaxp v1.2s, v1.2s, v1.2s
+; NEON-FIXED-NEXT:    xtn v3.2s, v1.2d
+; NEON-FIXED-NEXT:    umaxv s1, v1.4s
+; NEON-FIXED-NEXT:    and v3.8b, v3.8b, v4.8b
 ; NEON-FIXED-NEXT:    umaxp v3.2s, v3.2s, v3.2s
 ; NEON-FIXED-NEXT:    fmov w8, s3
 ; NEON-FIXED-NEXT:    bfi x9, x8, #3, #1
@@ -270,12 +270,12 @@ define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double %
 ; SVE-FIXED-NEXT:    sub sp, sp, #16
 ; SVE-FIXED-NEXT:    .cfi_def_cfa_offset 16
 ; SVE-FIXED-NEXT:    cmtst v1.2d, v1.2d, v1.2d
-; SVE-FIXED-NEXT:    index z3.s, #0, #1
+; SVE-FIXED-NEXT:    index z4.s, #0, #1
 ; SVE-FIXED-NEXT:    mov x9, sp
 ; SVE-FIXED-NEXT:    str q0, [sp]
-; SVE-FIXED-NEXT:    xtn v1.2s, v1.2d
-; SVE-FIXED-NEXT:    and v3.8b, v1.8b, v3.8b
-; SVE-FIXED-NEXT:    umaxp v1.2s, v1.2s, v1.2s
+; SVE-FIXED-NEXT:    xtn v3.2s, v1.2d
+; SVE-FIXED-NEXT:    umaxv s1, v1.4s
+; SVE-FIXED-NEXT:    and v3.8b, v3.8b, v4.8b
 ; SVE-FIXED-NEXT:    umaxp v3.2s, v3.2s, v3.2s
 ; SVE-FIXED-NEXT:    fmov w8, s3
 ; SVE-FIXED-NEXT:    bfi x9, x8, #3, #1


        


More information about the llvm-commits mailing list