[llvm] [AArch64] Improve code generation of bool vector reduce operations (PR #115713)
Csanád Hajdú via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 10 02:54:26 PST 2024
https://github.com/Il-Capitano updated https://github.com/llvm/llvm-project/pull/115713
From 25172b911e10a1a434a91b80cf27aee2f50c825b Mon Sep 17 00:00:00 2001
From: Csanád Hajdú <csanad.hajdu at arm.com>
Date: Thu, 7 Nov 2024 10:37:49 +0100
Subject: [PATCH 1/3] [AArch64] Improve code generation of bool vector reduce
operations
* Avoid unnecessary truncation of comparison results in vecreduce_xor
* Optimize generated code for vecreduce_and and vecreduce_or by
comparing against 0.0 to check if all/any of the values are set
Alive2 proof of vecreduce_and and vecreduce_or transformation: https://alive2.llvm.org/ce/z/SRfPtw
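For reference, a minimal sketch of the kind of reduction this affects, modeled on the reduce_or_v8i8 test in vecreduce-bool.ll (the function name here is illustrative):

define i1 @any_lane_negative(<8 x i8> %v) {
  %cmp = icmp slt <8 x i8> %v, zeroinitializer
  %any = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %cmp)
  ret i1 %any
}

declare i1 @llvm.vector.reduce.or.v8i1(<8 x i1>)

With this change the previous umaxv/uminv + fmov + tst sequence used to extract the reduced bit becomes a single fcmp d0, #0.0 followed by cset/csel, as the updated CHECK lines below show.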
---
.../Target/AArch64/AArch64ISelLowering.cpp | 51 +-
.../test/CodeGen/AArch64/dag-combine-setcc.ll | 26 +-
.../illegal-floating-point-vector-compares.ll | 6 +-
llvm/test/CodeGen/AArch64/reduce-and.ll | 18 +-
llvm/test/CodeGen/AArch64/reduce-or.ll | 15 +-
.../AArch64/vecreduce-and-legalization.ll | 6 +-
llvm/test/CodeGen/AArch64/vecreduce-bool.ll | 719 ++++++++++++++++--
.../AArch64/vecreduce-umax-legalization.ll | 5 +-
8 files changed, 749 insertions(+), 97 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index cee609ed1e2f6f..c31f4fda3e2149 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15841,11 +15841,26 @@ static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
}
- // Vectors that are less than 64 bits get widened to neatly fit a 64 bit
- // register, so e.g. <4 x i1> gets lowered to <4 x i16>. Sign extending to
+ // Results of setcc operations get widened to 128 bits for xor reduce if
+ // their input operands are 128 bits wide, otherwise vectors that are less
+ // than 64 bits get widened to neatly fit a 64 bit register, so e.g.
+ // <4 x i1> gets lowered to either <4 x i16> or <4 x i32>. Sign extending to
// this element size leads to the best codegen, since e.g. setcc results
// might need to be truncated otherwise.
- EVT ExtendedVT = MVT::getIntegerVT(std::max(64u / NumElems, 8u));
+ unsigned ExtendedWidth = 64;
+ if (ScalarOpcode == ISD::XOR && Vec.getOpcode() == ISD::SETCC &&
+ Vec.getOperand(0).getValueSizeInBits() >= 128) {
+ ExtendedWidth = 128;
+ }
+ EVT ExtendedVT = MVT::getIntegerVT(std::max(ExtendedWidth / NumElems, 8u));
+
+ // Negate the reduced vector value for reduce and operations that use
+ // fcmp.
+ if (ScalarOpcode == ISD::AND && NumElems < 16) {
+ Vec = DAG.getNode(
+ ISD::XOR, DL, VecVT, Vec,
+ DAG.getSplatVector(VecVT, DL, DAG.getConstant(-1, DL, MVT::i32)));
+ }
// any_ext doesn't work with umin/umax, so only use it for uadd.
unsigned ExtendOp =
@@ -15854,10 +15869,36 @@ static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
switch (ScalarOpcode) {
case ISD::AND:
- Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
+ if (NumElems < 16) {
+ // Check if all lanes of the negated bool vector value are zero by
+ // comparing against 0.0 with ordered and equal predicate. The only
+ // non-zero bit pattern that compares ordered and equal to 0.0 is -0.0,
+ // where only the sign bit is set. However the bool vector is
+ // sign-extended so that each bit in a lane is either zero or one,
+ // meaning that it is impossible to get the bit pattern of -0.0.
+ assert(Extended.getValueSizeInBits() == 64);
+ Extended = DAG.getBitcast(MVT::f64, Extended);
+ Result = DAG.getNode(ISD::SETCC, DL, MVT::i32, Extended,
+ DAG.getConstantFP(0.0, DL, MVT::f64),
+ DAG.getCondCode(ISD::CondCode::SETOEQ));
+ } else {
+ Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
+ }
break;
case ISD::OR:
- Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
+ if (NumElems < 16) {
+ // Check if any lane of the bool vector is set by comparing against 0.0.
+ // NaN bit patterns are handled by using the 'unordered or not equal'
+ // predicate. Similarly to the reduce and case, -0.0 doesn't have to be
+ // handled here (see explanation above).
+ assert(Extended.getValueSizeInBits() == 64);
+ Extended = DAG.getBitcast(MVT::f64, Extended);
+ Result = DAG.getNode(ISD::SETCC, DL, MVT::i32, Extended,
+ DAG.getConstantFP(0.0, DL, MVT::f64),
+ DAG.getCondCode(ISD::CondCode::SETUNE));
+ } else {
+ Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
+ }
break;
case ISD::XOR:
Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
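The comments above rely on a small bit-pattern argument: after sign extension every byte of the reduced mask is either 0x00 or 0xff, so the only non-zero double bit pattern that compares ordered and equal to 0.0, namely -0.0 (sign bit only), can never occur. A standalone C++ sketch of that argument for the 8-lane case (purely illustrative, exhaustively checking all masks):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  for (unsigned Mask = 0; Mask < 256; ++Mask) { // every possible <8 x i1> value
    uint64_t Bits = 0;
    for (int Lane = 0; Lane < 8; ++Lane)        // sign-extend each i1 to i8
      if (Mask & (1u << Lane))
        Bits |= uint64_t(0xff) << (8 * Lane);

    uint64_t NegBits = ~Bits;                   // the XOR with all-ones above
    double AsFP, NegAsFP;
    std::memcpy(&AsFP, &Bits, sizeof AsFP);
    std::memcpy(&NegAsFP, &NegBits, sizeof NegAsFP);

    // reduce_or: some lane is set  <=>  the mask is unordered or not equal
    // to 0.0 (NaN patterns land on the "not equal" side, as intended).
    assert((Mask != 0) == !(AsFP == 0.0));
    // reduce_and: all lanes are set  <=>  the negated mask is ordered and
    // equal to 0.0, i.e. exactly the +0.0 bit pattern.
    assert((Mask == 0xff) == (NegAsFP == 0.0));
  }
  return 0;
}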
diff --git a/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll b/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
index a48a4e0e723ebc..fb366564723db6 100644
--- a/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
+++ b/llvm/test/CodeGen/AArch64/dag-combine-setcc.ll
@@ -5,10 +5,8 @@ define i1 @combine_setcc_eq_vecreduce_or_v8i1(<8 x i8> %a) {
; CHECK-LABEL: combine_setcc_eq_vecreduce_or_v8i1:
; CHECK: // %bb.0:
; CHECK-NEXT: cmeq v0.8b, v0.8b, #0
-; CHECK-NEXT: mov w8, #1 // =0x1
-; CHECK-NEXT: umaxv b0, v0.8b
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: bic w0, w8, w9
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%cmp1 = icmp eq <8 x i8> %a, zeroinitializer
%cast = bitcast <8 x i1> %cmp1 to i8
@@ -73,9 +71,8 @@ define i1 @combine_setcc_ne_vecreduce_or_v8i1(<8 x i8> %a) {
; CHECK-LABEL: combine_setcc_ne_vecreduce_or_v8i1:
; CHECK: // %bb.0:
; CHECK-NEXT: cmtst v0.8b, v0.8b, v0.8b
-; CHECK-NEXT: umaxv b0, v0.8b
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and w0, w8, #0x1
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: cset w0, ne
; CHECK-NEXT: ret
%cmp1 = icmp ne <8 x i8> %a, zeroinitializer
%cast = bitcast <8 x i1> %cmp1 to i8
@@ -132,10 +129,9 @@ define i1 @combine_setcc_ne_vecreduce_or_v64i1(<64 x i8> %a) {
define i1 @combine_setcc_eq_vecreduce_and_v8i1(<8 x i8> %a) {
; CHECK-LABEL: combine_setcc_eq_vecreduce_and_v8i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: cmeq v0.8b, v0.8b, #0
-; CHECK-NEXT: uminv b0, v0.8b
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and w0, w8, #0x1
+; CHECK-NEXT: cmtst v0.8b, v0.8b, v0.8b
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%cmp1 = icmp eq <8 x i8> %a, zeroinitializer
%cast = bitcast <8 x i1> %cmp1 to i8
@@ -192,11 +188,9 @@ define i1 @combine_setcc_eq_vecreduce_and_v64i1(<64 x i8> %a) {
define i1 @combine_setcc_ne_vecreduce_and_v8i1(<8 x i8> %a) {
; CHECK-LABEL: combine_setcc_ne_vecreduce_and_v8i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: cmtst v0.8b, v0.8b, v0.8b
-; CHECK-NEXT: mov w8, #1 // =0x1
-; CHECK-NEXT: uminv b0, v0.8b
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: bic w0, w8, w9
+; CHECK-NEXT: cmeq v0.8b, v0.8b, #0
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: cset w0, ne
; CHECK-NEXT: ret
%cmp1 = icmp ne <8 x i8> %a, zeroinitializer
%cast = bitcast <8 x i1> %cmp1 to i8
diff --git a/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll b/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
index 767ca91a58bb10..5374d4823034ff 100644
--- a/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
+++ b/llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll
@@ -9,13 +9,11 @@ define i1 @unordered_floating_point_compare_on_v8f32(<8 x float> %a_vec) {
; CHECK: // %bb.0:
; CHECK-NEXT: fcmgt v1.4s, v1.4s, #0.0
; CHECK-NEXT: fcmgt v0.4s, v0.4s, #0.0
-; CHECK-NEXT: mov w8, #1 // =0x1
; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
; CHECK-NEXT: mvn v0.16b, v0.16b
; CHECK-NEXT: xtn v0.8b, v0.8h
-; CHECK-NEXT: umaxv b0, v0.8b
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: bic w0, w8, w9
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%a_cmp = fcmp ule <8 x float> %a_vec, zeroinitializer
%cmp_result = bitcast <8 x i1> %a_cmp to i8
diff --git a/llvm/test/CodeGen/AArch64/reduce-and.ll b/llvm/test/CodeGen/AArch64/reduce-and.ll
index 8ca521327c2e31..62f3e8d184d24d 100644
--- a/llvm/test/CodeGen/AArch64/reduce-and.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-and.ll
@@ -20,11 +20,11 @@ define i1 @test_redand_v1i1(<1 x i1> %a) {
define i1 @test_redand_v2i1(<2 x i1> %a) {
; CHECK-LABEL: test_redand_v2i1:
; CHECK: // %bb.0:
+; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: shl v0.2s, v0.2s, #31
; CHECK-NEXT: cmlt v0.2s, v0.2s, #0
-; CHECK-NEXT: uminp v0.2s, v0.2s, v0.2s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and w0, w8, #0x1
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_redand_v2i1:
@@ -42,11 +42,11 @@ define i1 @test_redand_v2i1(<2 x i1> %a) {
define i1 @test_redand_v4i1(<4 x i1> %a) {
; CHECK-LABEL: test_redand_v4i1:
; CHECK: // %bb.0:
+; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: shl v0.4h, v0.4h, #15
; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
-; CHECK-NEXT: uminv h0, v0.4h
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and w0, w8, #0x1
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_redand_v4i1:
@@ -68,11 +68,11 @@ define i1 @test_redand_v4i1(<4 x i1> %a) {
define i1 @test_redand_v8i1(<8 x i1> %a) {
; CHECK-LABEL: test_redand_v8i1:
; CHECK: // %bb.0:
+; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: shl v0.8b, v0.8b, #7
; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
-; CHECK-NEXT: uminv b0, v0.8b
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and w0, w8, #0x1
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_redand_v8i1:
diff --git a/llvm/test/CodeGen/AArch64/reduce-or.ll b/llvm/test/CodeGen/AArch64/reduce-or.ll
index aac31ce8b71b75..485cb7c916140c 100644
--- a/llvm/test/CodeGen/AArch64/reduce-or.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-or.ll
@@ -22,9 +22,8 @@ define i1 @test_redor_v2i1(<2 x i1> %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: shl v0.2s, v0.2s, #31
; CHECK-NEXT: cmlt v0.2s, v0.2s, #0
-; CHECK-NEXT: umaxp v0.2s, v0.2s, v0.2s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and w0, w8, #0x1
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: cset w0, ne
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_redor_v2i1:
@@ -44,9 +43,8 @@ define i1 @test_redor_v4i1(<4 x i1> %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: shl v0.4h, v0.4h, #15
; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
-; CHECK-NEXT: umaxv h0, v0.4h
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and w0, w8, #0x1
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: cset w0, ne
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_redor_v4i1:
@@ -70,9 +68,8 @@ define i1 @test_redor_v8i1(<8 x i1> %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: shl v0.8b, v0.8b, #7
; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
-; CHECK-NEXT: umaxv b0, v0.8b
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and w0, w8, #0x1
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: cset w0, ne
; CHECK-NEXT: ret
;
; GISEL-LABEL: test_redor_v8i1:
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
index 7fa416e0dbcd5c..fd81deeb7d913b 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll
@@ -139,11 +139,11 @@ define i32 @test_v3i32(<3 x i32> %a) nounwind {
define i1 @test_v4i1(<4 x i1> %a) nounwind {
; CHECK-LABEL: test_v4i1:
; CHECK: // %bb.0:
+; CHECK-NEXT: mvn v0.8b, v0.8b
; CHECK-NEXT: shl v0.4h, v0.4h, #15
; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
-; CHECK-NEXT: uminv h0, v0.4h
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and w0, w8, #0x1
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: cset w0, eq
; CHECK-NEXT: ret
%b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a)
ret i1 %b
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
index 58020d28702b2f..10a3ef1658a965 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
@@ -15,8 +15,15 @@ declare i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %a)
declare i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %a)
declare i1 @llvm.vector.reduce.or.v32i1(<32 x i1> %a)
-define i32 @reduce_and_v1(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v1:
+declare i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %a)
+declare i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %a)
+declare i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %a)
+declare i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %a)
+declare i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %a)
+declare i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> %a)
+
+define i32 @reduce_and_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v1i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: smov w8, v0.b[0]
@@ -29,16 +36,14 @@ define i32 @reduce_and_v1(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
ret i32 %z
}
-define i32 @reduce_and_v2(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v2:
+define i32 @reduce_and_v2i8(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: shl v0.2s, v0.2s, #24
; CHECK-NEXT: sshr v0.2s, v0.2s, #24
-; CHECK-NEXT: cmlt v0.2s, v0.2s, #0
-; CHECK-NEXT: uminp v0.2s, v0.2s, v0.2s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: tst w8, #0x1
-; CHECK-NEXT: csel w0, w0, w1, ne
+; CHECK-NEXT: cmge v0.2s, v0.2s, #0
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: csel w0, w0, w1, eq
; CHECK-NEXT: ret
%x = icmp slt <2 x i8> %a0, zeroinitializer
%y = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %x)
@@ -46,16 +51,14 @@ define i32 @reduce_and_v2(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
ret i32 %z
}
-define i32 @reduce_and_v4(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v4:
+define i32 @reduce_and_v4i8(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: shl v0.4h, v0.4h, #8
; CHECK-NEXT: sshr v0.4h, v0.4h, #8
-; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
-; CHECK-NEXT: uminv h0, v0.4h
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: tst w8, #0x1
-; CHECK-NEXT: csel w0, w0, w1, ne
+; CHECK-NEXT: cmge v0.4h, v0.4h, #0
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: csel w0, w0, w1, eq
; CHECK-NEXT: ret
%x = icmp slt <4 x i8> %a0, zeroinitializer
%y = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %x)
@@ -63,14 +66,12 @@ define i32 @reduce_and_v4(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
ret i32 %z
}
-define i32 @reduce_and_v8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v8:
+define i32 @reduce_and_v8i8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v8i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
-; CHECK-NEXT: uminv b0, v0.8b
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: tst w8, #0x1
-; CHECK-NEXT: csel w0, w0, w1, ne
+; CHECK-NEXT: cmge v0.8b, v0.8b, #0
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: csel w0, w0, w1, eq
; CHECK-NEXT: ret
%x = icmp slt <8 x i8> %a0, zeroinitializer
%y = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %x)
@@ -78,8 +79,8 @@ define i32 @reduce_and_v8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
ret i32 %z
}
-define i32 @reduce_and_v16(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v16:
+define i32 @reduce_and_v16i8(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
; CHECK-NEXT: uminv b0, v0.16b
@@ -93,8 +94,8 @@ define i32 @reduce_and_v16(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
ret i32 %z
}
-define i32 @reduce_and_v32(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_and_v32:
+define i32 @reduce_and_v32i8(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
@@ -109,8 +110,182 @@ define i32 @reduce_and_v32(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
ret i32 %z
}
-define i32 @reduce_or_v1(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_or_v1:
+define i32 @reduce_and_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v1i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: smov w8, v0.h[0]
+; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: csel w0, w0, w1, lt
+; CHECK-NEXT: ret
+ %x = icmp slt <1 x i16> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_and_v2i16(<2 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.2s, v0.2s, #16
+; CHECK-NEXT: sshr v0.2s, v0.2s, #16
+; CHECK-NEXT: cmge v0.2s, v0.2s, #0
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: csel w0, w0, w1, eq
+; CHECK-NEXT: ret
+ %x = icmp slt <2 x i16> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_and_v4i16(<4 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmge v0.4h, v0.4h, #0
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: csel w0, w0, w1, eq
+; CHECK-NEXT: ret
+ %x = icmp slt <4 x i16> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_and_v8i16(<8 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmge v0.8h, v0.8h, #0
+; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: csel w0, w0, w1, eq
+; CHECK-NEXT: ret
+ %x = icmp slt <8 x i16> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_and_v16i16(<16 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v16i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmlt v1.8h, v1.8h, #0
+; CHECK-NEXT: cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: uminv b0, v0.16b
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: tst w8, #0x1
+; CHECK-NEXT: csel w0, w0, w1, ne
+; CHECK-NEXT: ret
+ %x = icmp slt <16 x i16> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_and_v1i32(<1 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v1i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: csel w0, w0, w1, lt
+; CHECK-NEXT: ret
+ %x = icmp slt <1 x i32> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_and_v2i32(<2 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmge v0.2s, v0.2s, #0
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: csel w0, w0, w1, eq
+; CHECK-NEXT: ret
+ %x = icmp slt <2 x i32> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_and_v4i32(<4 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmge v0.4s, v0.4s, #0
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: csel w0, w0, w1, eq
+; CHECK-NEXT: ret
+ %x = icmp slt <4 x i32> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_and_v8i32(<8 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmge v1.4s, v1.4s, #0
+; CHECK-NEXT: cmge v0.4s, v0.4s, #0
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: csel w0, w0, w1, eq
+; CHECK-NEXT: ret
+ %x = icmp slt <8 x i32> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_and_v1i64(<1 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v1i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: csel w0, w0, w1, lt
+; CHECK-NEXT: ret
+ %x = icmp slt <1 x i64> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_and_v2i64(<2 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmge v0.2d, v0.2d, #0
+; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: csel w0, w0, w1, eq
+; CHECK-NEXT: ret
+ %x = icmp slt <2 x i64> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_and_v4i64(<4 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_and_v4i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmge v1.2d, v1.2d, #0
+; CHECK-NEXT: cmge v0.2d, v0.2d, #0
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: csel w0, w0, w1, eq
+; CHECK-NEXT: ret
+ %x = icmp slt <4 x i64> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_or_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v1i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: smov w8, v0.b[0]
@@ -123,15 +298,13 @@ define i32 @reduce_or_v1(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
ret i32 %z
}
-define i32 @reduce_or_v2(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_or_v2:
+define i32 @reduce_or_v2i8(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: shl v0.2s, v0.2s, #24
; CHECK-NEXT: sshr v0.2s, v0.2s, #24
; CHECK-NEXT: cmlt v0.2s, v0.2s, #0
-; CHECK-NEXT: umaxp v0.2s, v0.2s, v0.2s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: tst w8, #0x1
+; CHECK-NEXT: fcmp d0, #0.0
; CHECK-NEXT: csel w0, w0, w1, ne
; CHECK-NEXT: ret
%x = icmp slt <2 x i8> %a0, zeroinitializer
@@ -140,15 +313,13 @@ define i32 @reduce_or_v2(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
ret i32 %z
}
-define i32 @reduce_or_v4(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_or_v4:
+define i32 @reduce_or_v4i8(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: shl v0.4h, v0.4h, #8
; CHECK-NEXT: sshr v0.4h, v0.4h, #8
; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
-; CHECK-NEXT: umaxv h0, v0.4h
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: tst w8, #0x1
+; CHECK-NEXT: fcmp d0, #0.0
; CHECK-NEXT: csel w0, w0, w1, ne
; CHECK-NEXT: ret
%x = icmp slt <4 x i8> %a0, zeroinitializer
@@ -157,13 +328,11 @@ define i32 @reduce_or_v4(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
ret i32 %z
}
-define i32 @reduce_or_v8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_or_v8:
+define i32 @reduce_or_v8i8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
-; CHECK-NEXT: umaxv b0, v0.8b
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: tst w8, #0x1
+; CHECK-NEXT: fcmp d0, #0.0
; CHECK-NEXT: csel w0, w0, w1, ne
; CHECK-NEXT: ret
%x = icmp slt <8 x i8> %a0, zeroinitializer
@@ -172,8 +341,8 @@ define i32 @reduce_or_v8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
ret i32 %z
}
-define i32 @reduce_or_v16(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_or_v16:
+define i32 @reduce_or_v16i8(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
; CHECK-NEXT: umaxv b0, v0.16b
@@ -187,8 +356,8 @@ define i32 @reduce_or_v16(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
ret i32 %z
}
-define i32 @reduce_or_v32(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
-; CHECK-LABEL: reduce_or_v32:
+define i32 @reduce_or_v32i8(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
@@ -202,3 +371,457 @@ define i32 @reduce_or_v32(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
%z = select i1 %y, i32 %a1, i32 %a2
ret i32 %z
}
+
+define i32 @reduce_or_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v1i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: smov w8, v0.h[0]
+; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: csel w0, w0, w1, lt
+; CHECK-NEXT: ret
+ %x = icmp slt <1 x i16> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_or_v2i16(<2 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.2s, v0.2s, #16
+; CHECK-NEXT: sshr v0.2s, v0.2s, #16
+; CHECK-NEXT: cmlt v0.2s, v0.2s, #0
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: csel w0, w0, w1, ne
+; CHECK-NEXT: ret
+ %x = icmp slt <2 x i16> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_or_v4i16(<4 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: csel w0, w0, w1, ne
+; CHECK-NEXT: ret
+ %x = icmp slt <4 x i16> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_or_v8i16(<8 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: csel w0, w0, w1, ne
+; CHECK-NEXT: ret
+ %x = icmp slt <8 x i16> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_or_v16i16(<16 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v16i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmlt v1.8h, v1.8h, #0
+; CHECK-NEXT: cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: umaxv b0, v0.16b
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: tst w8, #0x1
+; CHECK-NEXT: csel w0, w0, w1, ne
+; CHECK-NEXT: ret
+ %x = icmp slt <16 x i16> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_or_v1i32(<1 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v1i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: csel w0, w0, w1, lt
+; CHECK-NEXT: ret
+ %x = icmp slt <1 x i32> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_or_v2i32(<2 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmlt v0.2s, v0.2s, #0
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: csel w0, w0, w1, ne
+; CHECK-NEXT: ret
+ %x = icmp slt <2 x i32> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_or_v4i32(<4 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: csel w0, w0, w1, ne
+; CHECK-NEXT: ret
+ %x = icmp slt <4 x i32> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_or_v8i32(<8 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmlt v1.4s, v1.4s, #0
+; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: csel w0, w0, w1, ne
+; CHECK-NEXT: ret
+ %x = icmp slt <8 x i32> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_or_v1i64(<1 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v1i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: csel w0, w0, w1, lt
+; CHECK-NEXT: ret
+ %x = icmp slt <1 x i64> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_or_v2i64(<2 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT: xtn v0.2s, v0.2d
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: csel w0, w0, w1, ne
+; CHECK-NEXT: ret
+ %x = icmp slt <2 x i64> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_or_v4i64(<4 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_or_v4i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmlt v1.2d, v1.2d, #0
+; CHECK-NEXT: cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: fcmp d0, #0.0
+; CHECK-NEXT: csel w0, w0, w1, ne
+; CHECK-NEXT: ret
+ %x = icmp slt <4 x i64> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_xor_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v1i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: smov w8, v0.b[0]
+; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: csel w0, w0, w1, lt
+; CHECK-NEXT: ret
+ %x = icmp slt <1 x i8> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_xor_v2i8(<2 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.2s, v0.2s, #24
+; CHECK-NEXT: sshr v0.2s, v0.2s, #24
+; CHECK-NEXT: cmlt v0.2s, v0.2s, #0
+; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: tst w8, #0x1
+; CHECK-NEXT: csel w0, w0, w1, ne
+; CHECK-NEXT: ret
+ %x = icmp slt <2 x i8> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_xor_v4i8(<4 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.4h, v0.4h, #8
+; CHECK-NEXT: sshr v0.4h, v0.4h, #8
+; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
+; CHECK-NEXT: addv h0, v0.4h
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: tst w8, #0x1
+; CHECK-NEXT: csel w0, w0, w1, ne
+; CHECK-NEXT: ret
+ %x = icmp slt <4 x i8> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_xor_v8i8(<8 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
+; CHECK-NEXT: addv b0, v0.8b
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: tst w8, #0x1
+; CHECK-NEXT: csel w0, w0, w1, ne
+; CHECK-NEXT: ret
+ %x = icmp slt <8 x i8> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_xor_v16i8(<16 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
+; CHECK-NEXT: addv b0, v0.16b
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: tst w8, #0x1
+; CHECK-NEXT: csel w0, w0, w1, ne
+; CHECK-NEXT: ret
+ %x = icmp slt <16 x i8> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_xor_v32i8(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v32i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
+; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
+; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: addv b0, v0.16b
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: tst w8, #0x1
+; CHECK-NEXT: csel w0, w0, w1, ne
+; CHECK-NEXT: ret
+ %x = icmp slt <32 x i8> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_xor_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v1i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: smov w8, v0.h[0]
+; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: csel w0, w0, w1, lt
+; CHECK-NEXT: ret
+ %x = icmp slt <1 x i16> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_xor_v2i16(<2 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shl v0.2s, v0.2s, #16
+; CHECK-NEXT: sshr v0.2s, v0.2s, #16
+; CHECK-NEXT: cmlt v0.2s, v0.2s, #0
+; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: tst w8, #0x1
+; CHECK-NEXT: csel w0, w0, w1, ne
+; CHECK-NEXT: ret
+ %x = icmp slt <2 x i16> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_xor_v4i16(<4 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
+; CHECK-NEXT: addv h0, v0.4h
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: tst w8, #0x1
+; CHECK-NEXT: csel w0, w0, w1, ne
+; CHECK-NEXT: ret
+ %x = icmp slt <4 x i16> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_xor_v8i16(<8 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: tst w8, #0x1
+; CHECK-NEXT: csel w0, w0, w1, ne
+; CHECK-NEXT: ret
+ %x = icmp slt <8 x i16> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_xor_v16i16(<16 x i16> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v16i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmlt v1.8h, v1.8h, #0
+; CHECK-NEXT: cmlt v0.8h, v0.8h, #0
+; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: addv b0, v0.16b
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: tst w8, #0x1
+; CHECK-NEXT: csel w0, w0, w1, ne
+; CHECK-NEXT: ret
+ %x = icmp slt <16 x i16> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_xor_v1i32(<1 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v1i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: csel w0, w0, w1, lt
+; CHECK-NEXT: ret
+ %x = icmp slt <1 x i32> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_xor_v2i32(<2 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmlt v0.2s, v0.2s, #0
+; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: tst w8, #0x1
+; CHECK-NEXT: csel w0, w0, w1, ne
+; CHECK-NEXT: ret
+ %x = icmp slt <2 x i32> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_xor_v4i32(<4 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: tst w8, #0x1
+; CHECK-NEXT: csel w0, w0, w1, ne
+; CHECK-NEXT: ret
+ %x = icmp slt <4 x i32> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_xor_v8i32(<8 x i32> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmlt v1.4s, v1.4s, #0
+; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: tst w8, #0x1
+; CHECK-NEXT: csel w0, w0, w1, ne
+; CHECK-NEXT: ret
+ %x = icmp slt <8 x i32> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.xor.v8i1(<8 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_xor_v1i64(<1 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v1i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: csel w0, w0, w1, lt
+; CHECK-NEXT: ret
+ %x = icmp slt <1 x i64> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_xor_v2i64(<2 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT: addp d0, v0.2d
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: tst w8, #0x1
+; CHECK-NEXT: csel w0, w0, w1, ne
+; CHECK-NEXT: ret
+ %x = icmp slt <2 x i64> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.xor.v2i1(<2 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
+
+define i32 @reduce_xor_v4i64(<4 x i64> %a0, i32 %a1, i32 %a2) nounwind {
+; CHECK-LABEL: reduce_xor_v4i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cmlt v1.2d, v1.2d, #0
+; CHECK-NEXT: cmlt v0.2d, v0.2d, #0
+; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: tst w8, #0x1
+; CHECK-NEXT: csel w0, w0, w1, ne
+; CHECK-NEXT: ret
+ %x = icmp slt <4 x i64> %a0, zeroinitializer
+ %y = call i1 @llvm.vector.reduce.xor.v4i1(<4 x i1> %x)
+ %z = select i1 %y, i32 %a1, i32 %a2
+ ret i32 %z
+}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
index 809a6d6556a7be..2a21cc8d7c611d 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
@@ -202,9 +202,8 @@ define i1 @test_v4i1(<4 x i1> %a) nounwind {
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: shl v0.4h, v0.4h, #15
; CHECK-SD-NEXT: cmlt v0.4h, v0.4h, #0
-; CHECK-SD-NEXT: umaxv h0, v0.4h
-; CHECK-SD-NEXT: fmov w8, s0
-; CHECK-SD-NEXT: and w0, w8, #0x1
+; CHECK-SD-NEXT: fcmp d0, #0.0
+; CHECK-SD-NEXT: cset w0, ne
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: test_v4i1:
From 6d87db21adb6a62b5c5a5ac6d35d632536e883f0 Mon Sep 17 00:00:00 2001
From: Csanád Hajdú <csanad.hajdu at arm.com>
Date: Tue, 12 Nov 2024 11:56:19 +0100
Subject: [PATCH 2/3] Use DAG.getSetCC() instead of DAG.getNode()
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c31f4fda3e2149..d69e15895fa6d3 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15878,9 +15878,9 @@ static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
// meaning that it is impossible to get the bit pattern of -0.0.
assert(Extended.getValueSizeInBits() == 64);
Extended = DAG.getBitcast(MVT::f64, Extended);
- Result = DAG.getNode(ISD::SETCC, DL, MVT::i32, Extended,
- DAG.getConstantFP(0.0, DL, MVT::f64),
- DAG.getCondCode(ISD::CondCode::SETOEQ));
+ Result =
+ DAG.getSetCC(DL, MVT::i32, Extended,
+ DAG.getConstantFP(0.0, DL, MVT::f64), ISD::SETOEQ);
} else {
Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
}
@@ -15893,9 +15893,9 @@ static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
// handled here (see explanation above).
assert(Extended.getValueSizeInBits() == 64);
Extended = DAG.getBitcast(MVT::f64, Extended);
- Result = DAG.getNode(ISD::SETCC, DL, MVT::i32, Extended,
- DAG.getConstantFP(0.0, DL, MVT::f64),
- DAG.getCondCode(ISD::CondCode::SETUNE));
+ Result =
+ DAG.getSetCC(DL, MVT::i32, Extended,
+ DAG.getConstantFP(0.0, DL, MVT::f64), ISD::SETUNE);
} else {
Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
}
From 30970212f15668a3fc327249d10596844b1c1027 Mon Sep 17 00:00:00 2001
From: Csanád Hajdú <csanad.hajdu at arm.com>
Date: Tue, 10 Dec 2024 11:47:59 +0100
Subject: [PATCH 3/3] Rebase on main
Required changes:
* Use APInt::getAllOnes() instead of using -1 in DAG.getConstant()
* A new test case was added that is affected by this change
---
.../Target/AArch64/AArch64ISelLowering.cpp | 3 +-
.../AArch64/vector-extract-last-active.ll | 40 +++++--------------
2 files changed, 12 insertions(+), 31 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d69e15895fa6d3..424f18ba4d822f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15859,7 +15859,8 @@ static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
if (ScalarOpcode == ISD::AND && NumElems < 16) {
Vec = DAG.getNode(
ISD::XOR, DL, VecVT, Vec,
- DAG.getSplatVector(VecVT, DL, DAG.getConstant(-1, DL, MVT::i32)));
+ DAG.getSplatVector(
+ VecVT, DL, DAG.getConstant(APInt::getAllOnes(32), DL, MVT::i32)));
}
// any_ext doesn't work with umin/umax, so only use it for uadd.
diff --git a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
index c0f1720e1cf8b3..593c9db090a26d 100644
--- a/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
+++ b/llvm/test/CodeGen/AArch64/vector-extract-last-active.ll
@@ -62,13 +62,11 @@ define i16 @extract_last_i16(<8 x i16> %data, <8 x i16> %mask, i16 %passthru) {
; NEON-FIXED-NEXT: str q0, [sp]
; NEON-FIXED-NEXT: xtn v1.8b, v1.8h
; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
-; NEON-FIXED-NEXT: umaxv b1, v1.8b
+; NEON-FIXED-NEXT: fcmp d1, #0.0
; NEON-FIXED-NEXT: umaxv b2, v2.8b
; NEON-FIXED-NEXT: fmov w8, s2
; NEON-FIXED-NEXT: bfi x9, x8, #1, #3
; NEON-FIXED-NEXT: ldrh w8, [x9]
-; NEON-FIXED-NEXT: fmov w9, s1
-; NEON-FIXED-NEXT: tst w9, #0x1
; NEON-FIXED-NEXT: csel w0, w8, w0, ne
; NEON-FIXED-NEXT: add sp, sp, #16
; NEON-FIXED-NEXT: ret
@@ -83,13 +81,11 @@ define i16 @extract_last_i16(<8 x i16> %data, <8 x i16> %mask, i16 %passthru) {
; SVE-FIXED-NEXT: str q0, [sp]
; SVE-FIXED-NEXT: xtn v1.8b, v1.8h
; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
-; SVE-FIXED-NEXT: umaxv b1, v1.8b
+; SVE-FIXED-NEXT: fcmp d1, #0.0
; SVE-FIXED-NEXT: umaxv b2, v2.8b
; SVE-FIXED-NEXT: fmov w8, s2
; SVE-FIXED-NEXT: bfi x9, x8, #1, #3
; SVE-FIXED-NEXT: ldrh w8, [x9]
-; SVE-FIXED-NEXT: fmov w9, s1
-; SVE-FIXED-NEXT: tst w9, #0x1
; SVE-FIXED-NEXT: csel w0, w8, w0, ne
; SVE-FIXED-NEXT: add sp, sp, #16
; SVE-FIXED-NEXT: ret
@@ -110,13 +106,11 @@ define i32 @extract_last_i32(<4 x i32> %data, <4 x i32> %mask, i32 %passthru) {
; NEON-FIXED-NEXT: str q0, [sp]
; NEON-FIXED-NEXT: xtn v1.4h, v1.4s
; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
-; NEON-FIXED-NEXT: umaxv h1, v1.4h
+; NEON-FIXED-NEXT: fcmp d1, #0.0
; NEON-FIXED-NEXT: umaxv h2, v2.4h
; NEON-FIXED-NEXT: fmov w8, s2
; NEON-FIXED-NEXT: bfi x9, x8, #2, #2
; NEON-FIXED-NEXT: ldr w8, [x9]
-; NEON-FIXED-NEXT: fmov w9, s1
-; NEON-FIXED-NEXT: tst w9, #0x1
; NEON-FIXED-NEXT: csel w0, w8, w0, ne
; NEON-FIXED-NEXT: add sp, sp, #16
; NEON-FIXED-NEXT: ret
@@ -131,13 +125,11 @@ define i32 @extract_last_i32(<4 x i32> %data, <4 x i32> %mask, i32 %passthru) {
; SVE-FIXED-NEXT: str q0, [sp]
; SVE-FIXED-NEXT: xtn v1.4h, v1.4s
; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
-; SVE-FIXED-NEXT: umaxv h1, v1.4h
+; SVE-FIXED-NEXT: fcmp d1, #0.0
; SVE-FIXED-NEXT: umaxv h2, v2.4h
; SVE-FIXED-NEXT: fmov w8, s2
; SVE-FIXED-NEXT: bfi x9, x8, #2, #2
; SVE-FIXED-NEXT: ldr w8, [x9]
-; SVE-FIXED-NEXT: fmov w9, s1
-; SVE-FIXED-NEXT: tst w9, #0x1
; SVE-FIXED-NEXT: csel w0, w8, w0, ne
; SVE-FIXED-NEXT: add sp, sp, #16
; SVE-FIXED-NEXT: ret
@@ -158,13 +150,11 @@ define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) {
; NEON-FIXED-NEXT: str q0, [sp]
; NEON-FIXED-NEXT: xtn v1.2s, v1.2d
; NEON-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
-; NEON-FIXED-NEXT: umaxp v1.2s, v1.2s, v1.2s
+; NEON-FIXED-NEXT: fcmp d1, #0.0
; NEON-FIXED-NEXT: umaxp v2.2s, v2.2s, v2.2s
; NEON-FIXED-NEXT: fmov w8, s2
; NEON-FIXED-NEXT: bfi x9, x8, #3, #1
; NEON-FIXED-NEXT: ldr x8, [x9]
-; NEON-FIXED-NEXT: fmov w9, s1
-; NEON-FIXED-NEXT: tst w9, #0x1
; NEON-FIXED-NEXT: csel x0, x8, x0, ne
; NEON-FIXED-NEXT: add sp, sp, #16
; NEON-FIXED-NEXT: ret
@@ -179,13 +169,11 @@ define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) {
; SVE-FIXED-NEXT: str q0, [sp]
; SVE-FIXED-NEXT: xtn v1.2s, v1.2d
; SVE-FIXED-NEXT: and v2.8b, v1.8b, v2.8b
-; SVE-FIXED-NEXT: umaxp v1.2s, v1.2s, v1.2s
+; SVE-FIXED-NEXT: fcmp d1, #0.0
; SVE-FIXED-NEXT: umaxp v2.2s, v2.2s, v2.2s
; SVE-FIXED-NEXT: fmov w8, s2
; SVE-FIXED-NEXT: bfi x9, x8, #3, #1
; SVE-FIXED-NEXT: ldr x8, [x9]
-; SVE-FIXED-NEXT: fmov w9, s1
-; SVE-FIXED-NEXT: tst w9, #0x1
; SVE-FIXED-NEXT: csel x0, x8, x0, ne
; SVE-FIXED-NEXT: add sp, sp, #16
; SVE-FIXED-NEXT: ret
@@ -206,13 +194,11 @@ define float @extract_last_float(<4 x float> %data, <4 x i32> %mask, float %pass
; NEON-FIXED-NEXT: str q0, [sp]
; NEON-FIXED-NEXT: xtn v1.4h, v1.4s
; NEON-FIXED-NEXT: and v3.8b, v1.8b, v3.8b
-; NEON-FIXED-NEXT: umaxv h1, v1.4h
+; NEON-FIXED-NEXT: fcmp d1, #0.0
; NEON-FIXED-NEXT: umaxv h3, v3.4h
; NEON-FIXED-NEXT: fmov w8, s3
; NEON-FIXED-NEXT: bfi x9, x8, #2, #2
-; NEON-FIXED-NEXT: fmov w8, s1
; NEON-FIXED-NEXT: ldr s0, [x9]
-; NEON-FIXED-NEXT: tst w8, #0x1
; NEON-FIXED-NEXT: fcsel s0, s0, s2, ne
; NEON-FIXED-NEXT: add sp, sp, #16
; NEON-FIXED-NEXT: ret
@@ -227,13 +213,11 @@ define float @extract_last_float(<4 x float> %data, <4 x i32> %mask, float %pass
; SVE-FIXED-NEXT: str q0, [sp]
; SVE-FIXED-NEXT: xtn v1.4h, v1.4s
; SVE-FIXED-NEXT: and v3.8b, v1.8b, v3.8b
-; SVE-FIXED-NEXT: umaxv h1, v1.4h
+; SVE-FIXED-NEXT: fcmp d1, #0.0
; SVE-FIXED-NEXT: umaxv h3, v3.4h
; SVE-FIXED-NEXT: fmov w8, s3
; SVE-FIXED-NEXT: bfi x9, x8, #2, #2
-; SVE-FIXED-NEXT: fmov w8, s1
; SVE-FIXED-NEXT: ldr s0, [x9]
-; SVE-FIXED-NEXT: tst w8, #0x1
; SVE-FIXED-NEXT: fcsel s0, s0, s2, ne
; SVE-FIXED-NEXT: add sp, sp, #16
; SVE-FIXED-NEXT: ret
@@ -254,13 +238,11 @@ define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double %
; NEON-FIXED-NEXT: str q0, [sp]
; NEON-FIXED-NEXT: xtn v1.2s, v1.2d
; NEON-FIXED-NEXT: and v3.8b, v1.8b, v3.8b
-; NEON-FIXED-NEXT: umaxp v1.2s, v1.2s, v1.2s
+; NEON-FIXED-NEXT: fcmp d1, #0.0
; NEON-FIXED-NEXT: umaxp v3.2s, v3.2s, v3.2s
; NEON-FIXED-NEXT: fmov w8, s3
; NEON-FIXED-NEXT: bfi x9, x8, #3, #1
-; NEON-FIXED-NEXT: fmov w8, s1
; NEON-FIXED-NEXT: ldr d0, [x9]
-; NEON-FIXED-NEXT: tst w8, #0x1
; NEON-FIXED-NEXT: fcsel d0, d0, d2, ne
; NEON-FIXED-NEXT: add sp, sp, #16
; NEON-FIXED-NEXT: ret
@@ -275,13 +257,11 @@ define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double %
; SVE-FIXED-NEXT: str q0, [sp]
; SVE-FIXED-NEXT: xtn v1.2s, v1.2d
; SVE-FIXED-NEXT: and v3.8b, v1.8b, v3.8b
-; SVE-FIXED-NEXT: umaxp v1.2s, v1.2s, v1.2s
+; SVE-FIXED-NEXT: fcmp d1, #0.0
; SVE-FIXED-NEXT: umaxp v3.2s, v3.2s, v3.2s
; SVE-FIXED-NEXT: fmov w8, s3
; SVE-FIXED-NEXT: bfi x9, x8, #3, #1
-; SVE-FIXED-NEXT: fmov w8, s1
; SVE-FIXED-NEXT: ldr d0, [x9]
-; SVE-FIXED-NEXT: tst w8, #0x1
; SVE-FIXED-NEXT: fcsel d0, d0, d2, ne
; SVE-FIXED-NEXT: add sp, sp, #16
; SVE-FIXED-NEXT: ret