[llvm] 5c74c6b - [AArch64] Use CMTST for != 0 vector compares (vnot (CMEQz A)).
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 24 01:39:55 PST 2021
Author: Florian Hahn
Date: 2021-02-24T09:39:27Z
New Revision: 5c74c6be3c291c27b78918aefe4017ea59b4ede8
URL: https://github.com/llvm/llvm-project/commit/5c74c6be3c291c27b78918aefe4017ea59b4ede8
DIFF: https://github.com/llvm/llvm-project/commit/5c74c6be3c291c27b78918aefe4017ea59b4ede8.diff
LOG: [AArch64] Use CMTST for != 0 vector compares (vnot (CMEQz A)).
(CMTST A, A) sets an element to 0 only if that element is 0 in A; every
other element is set to all ones. Use it for != 0 compares, which currently
use (vnot (CMEQz A)). This saves an mvn instruction.
Reviewed By: dmgreen
Differential Revision: https://reviews.llvm.org/D97303
Added:
Modified:
llvm/lib/Target/AArch64/AArch64InstrInfo.td
llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
llvm/test/CodeGen/AArch64/vec_umulo.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index d4871c311275..c1e1c4a18e80 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4193,6 +4193,9 @@ defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>;
defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", AArch64cmhi>;
defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", AArch64cmhs>;
defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", AArch64cmtst>;
+foreach VT = [ v8i8, v16i8, v4i16, v8i16, v2i32, v4i32, v2i64 ] in {
+def : Pat<(vnot (AArch64cmeqz VT:$Rn)), (!cast<Instruction>("CMTST"#VT) VT:$Rn, VT:$Rn)>;
+}
defm FABD : SIMDThreeSameVectorFP<1,1,0b010,"fabd", int_aarch64_neon_fabd>;
let Predicates = [HasNEON] in {
foreach VT = [ v2f32, v4f32, v2f64 ] in
diff --git a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
index 40b9d9f384c5..b5a7ce1e78d9 100644
--- a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
@@ -1241,8 +1241,7 @@ define <2 x i64> @cmltz2xi64(<2 x i64> %A) {
define <8 x i8> @cmneqz8xi8(<8 x i8> %A) {
; CHECK-LABEL: cmneqz8xi8:
; CHECK: // %bb.0:
-; CHECK-NEXT: cmeq v0.8b, v0.8b, #0
-; CHECK-NEXT: mvn v0.8b, v0.8b
+; CHECK-NEXT: cmtst v0.8b, v0.8b, v0.8b
; CHECK-NEXT: ret
%tmp3 = icmp ne <8 x i8> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
@@ -1252,8 +1251,7 @@ define <8 x i8> @cmneqz8xi8(<8 x i8> %A) {
define <16 x i8> @cmneqz16xi8(<16 x i8> %A) {
; CHECK-LABEL: cmneqz16xi8:
; CHECK: // %bb.0:
-; CHECK-NEXT: cmeq v0.16b, v0.16b, #0
-; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: cmtst v0.16b, v0.16b, v0.16b
; CHECK-NEXT: ret
%tmp3 = icmp ne <16 x i8> %A, zeroinitializer;
%tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
@@ -1263,8 +1261,7 @@ define <16 x i8> @cmneqz16xi8(<16 x i8> %A) {
define <4 x i16> @cmneqz4xi16(<4 x i16> %A) {
; CHECK-LABEL: cmneqz4xi16:
; CHECK: // %bb.0:
-; CHECK-NEXT: cmeq v0.4h, v0.4h, #0
-; CHECK-NEXT: mvn v0.8b, v0.8b
+; CHECK-NEXT: cmtst v0.4h, v0.4h, v0.4h
; CHECK-NEXT: ret
%tmp3 = icmp ne <4 x i16> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
@@ -1274,8 +1271,7 @@ define <4 x i16> @cmneqz4xi16(<4 x i16> %A) {
define <8 x i16> @cmneqz8xi16(<8 x i16> %A) {
; CHECK-LABEL: cmneqz8xi16:
; CHECK: // %bb.0:
-; CHECK-NEXT: cmeq v0.8h, v0.8h, #0
-; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: cmtst v0.8h, v0.8h, v0.8h
; CHECK-NEXT: ret
%tmp3 = icmp ne <8 x i16> %A, zeroinitializer;
%tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
@@ -1285,8 +1281,7 @@ define <8 x i16> @cmneqz8xi16(<8 x i16> %A) {
define <2 x i32> @cmneqz2xi32(<2 x i32> %A) {
; CHECK-LABEL: cmneqz2xi32:
; CHECK: // %bb.0:
-; CHECK-NEXT: cmeq v0.2s, v0.2s, #0
-; CHECK-NEXT: mvn v0.8b, v0.8b
+; CHECK-NEXT: cmtst v0.2s, v0.2s, v0.2s
; CHECK-NEXT: ret
%tmp3 = icmp ne <2 x i32> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
@@ -1296,8 +1291,7 @@ define <2 x i32> @cmneqz2xi32(<2 x i32> %A) {
define <4 x i32> @cmneqz4xi32(<4 x i32> %A) {
; CHECK-LABEL: cmneqz4xi32:
; CHECK: // %bb.0:
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
-; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: cmtst v0.4s, v0.4s, v0.4s
; CHECK-NEXT: ret
%tmp3 = icmp ne <4 x i32> %A, zeroinitializer;
%tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
@@ -1307,8 +1301,7 @@ define <4 x i32> @cmneqz4xi32(<4 x i32> %A) {
define <2 x i64> @cmneqz2xi64(<2 x i64> %A) {
; CHECK-LABEL: cmneqz2xi64:
; CHECK: // %bb.0:
-; CHECK-NEXT: cmeq v0.2d, v0.2d, #0
-; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: cmtst v0.2d, v0.2d, v0.2d
; CHECK-NEXT: ret
%tmp3 = icmp ne <2 x i64> %A, zeroinitializer;
%tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
diff --git a/llvm/test/CodeGen/AArch64/vec_umulo.ll b/llvm/test/CodeGen/AArch64/vec_umulo.ll
index bcc9acc174ff..880a5926b443 100644
--- a/llvm/test/CodeGen/AArch64/vec_umulo.ll
+++ b/llvm/test/CodeGen/AArch64/vec_umulo.ll
@@ -21,8 +21,7 @@ define <1 x i32> @umulo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) noun
; CHECK: // %bb.0:
; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
; CHECK-NEXT: shrn v0.2s, v1.2d, #32
-; CHECK-NEXT: cmeq v0.2s, v0.2s, #0
-; CHECK-NEXT: mvn v0.8b, v0.8b
+; CHECK-NEXT: cmtst v0.2s, v0.2s, v0.2s
; CHECK-NEXT: xtn v1.2s, v1.2d
; CHECK-NEXT: str s1, [x0]
; CHECK-NEXT: ret
@@ -39,8 +38,7 @@ define <2 x i32> @umulo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) noun
; CHECK: // %bb.0:
; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
; CHECK-NEXT: shrn v0.2s, v1.2d, #32
-; CHECK-NEXT: cmeq v0.2s, v0.2s, #0
-; CHECK-NEXT: mvn v0.8b, v0.8b
+; CHECK-NEXT: cmtst v0.2s, v0.2s, v0.2s
; CHECK-NEXT: xtn v1.2s, v1.2d
; CHECK-NEXT: str d1, [x0]
; CHECK-NEXT: ret
@@ -59,9 +57,8 @@ define <3 x i32> @umulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) noun
; CHECK-NEXT: umull v3.2d, v0.2s, v1.2s
; CHECK-NEXT: mul v1.4s, v0.4s, v1.4s
; CHECK-NEXT: uzp2 v0.4s, v3.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
; CHECK-NEXT: add x8, x0, #8 // =8
-; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: cmtst v0.4s, v0.4s, v0.4s
; CHECK-NEXT: st1 { v1.s }[2], [x8]
; CHECK-NEXT: str d1, [x0]
; CHECK-NEXT: ret
@@ -79,8 +76,7 @@ define <4 x i32> @umulo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) noun
; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s
; CHECK-NEXT: umull v3.2d, v0.2s, v1.2s
; CHECK-NEXT: uzp2 v2.4s, v3.4s, v2.4s
-; CHECK-NEXT: cmeq v2.4s, v2.4s, #0
-; CHECK-NEXT: mvn v2.16b, v2.16b
+; CHECK-NEXT: cmtst v2.4s, v2.4s, v2.4s
; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: mov v0.16b, v2.16b
@@ -120,10 +116,8 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) noun
; CHECK-NEXT: umull2 v0.2d, v3.4s, v2.4s
; CHECK-NEXT: umull v4.2d, v3.2s, v2.2s
; CHECK-NEXT: uzp2 v0.4s, v4.4s, v0.4s
-; CHECK-NEXT: cmeq v1.4s, v1.4s, #0
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
-; CHECK-NEXT: mvn v1.16b, v1.16b
-; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: cmtst v1.4s, v1.4s, v1.4s
+; CHECK-NEXT: cmtst v0.4s, v0.4s, v0.4s
; CHECK-NEXT: mul v2.4s, v3.4s, v2.4s
; CHECK-NEXT: mov w5, v1.s[1]
; CHECK-NEXT: mov w1, v0.s[1]
@@ -151,11 +145,9 @@ define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) noun
; CHECK-NEXT: umull v0.2d, v1.2s, v3.2s
; CHECK-NEXT: mul v3.4s, v1.4s, v3.4s
; CHECK-NEXT: uzp2 v1.4s, v5.4s, v4.4s
-; CHECK-NEXT: uzp2 v0.4s, v0.4s, v6.4s
-; CHECK-NEXT: cmeq v1.4s, v1.4s, #0
-; CHECK-NEXT: cmeq v4.4s, v0.4s, #0
-; CHECK-NEXT: mvn v0.16b, v1.16b
-; CHECK-NEXT: mvn v1.16b, v4.16b
+; CHECK-NEXT: uzp2 v4.4s, v0.4s, v6.4s
+; CHECK-NEXT: cmtst v0.4s, v1.4s, v1.4s
+; CHECK-NEXT: cmtst v1.4s, v4.4s, v4.4s
; CHECK-NEXT: stp q2, q3, [x0]
; CHECK-NEXT: ret
%t = call {<8 x i32>, <8 x i1>} @llvm.umul.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
@@ -173,8 +165,7 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
; CHECK-NEXT: umull v3.8h, v0.8b, v1.8b
; CHECK-NEXT: mul v4.16b, v0.16b, v1.16b
; CHECK-NEXT: uzp2 v0.16b, v3.16b, v2.16b
-; CHECK-NEXT: cmeq v0.16b, v0.16b, #0
-; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: cmtst v0.16b, v0.16b, v0.16b
; CHECK-NEXT: zip1 v1.8b, v0.8b, v0.8b
; CHECK-NEXT: zip2 v2.8b, v0.8b, v0.8b
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
@@ -209,8 +200,7 @@ define <8 x i32> @umulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun
; CHECK-NEXT: umull v3.4s, v0.4h, v1.4h
; CHECK-NEXT: mul v4.8h, v0.8h, v1.8h
; CHECK-NEXT: uzp2 v0.8h, v3.8h, v2.8h
-; CHECK-NEXT: cmeq v0.8h, v0.8h, #0
-; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: cmtst v0.8h, v0.8h, v0.8h
; CHECK-NEXT: xtn v0.8b, v0.8h
; CHECK-NEXT: zip1 v1.8b, v0.8b, v0.8b
; CHECK-NEXT: zip2 v0.8b, v0.8b, v0.8b
@@ -275,16 +265,15 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) noun
; CHECK-NEXT: mov w10, v0.s[1]
; CHECK-NEXT: fmov w11, s0
; CHECK-NEXT: cmeq v0.4s, v1.4s, #0
-; CHECK-NEXT: cmeq v1.4s, v2.4s, #0
+; CHECK-NEXT: cmtst v1.4s, v2.4s, v2.4s
; CHECK-NEXT: sturh w8, [x0, #9]
; CHECK-NEXT: lsr w8, w8, #16
-; CHECK-NEXT: mvn v1.16b, v1.16b
; CHECK-NEXT: strh w9, [x0, #6]
; CHECK-NEXT: sturh w10, [x0, #3]
; CHECK-NEXT: lsr w9, w9, #16
; CHECK-NEXT: lsr w10, w10, #16
-; CHECK-NEXT: strb w8, [x0, #11]
; CHECK-NEXT: orn v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: strb w8, [x0, #11]
; CHECK-NEXT: lsr w8, w11, #16
; CHECK-NEXT: strh w11, [x0]
; CHECK-NEXT: strb w9, [x0, #8]
@@ -306,21 +295,20 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: shrn v1.4h, v0.4s, #16
-; CHECK-NEXT: xtn v0.4h, v0.4s
-; CHECK-NEXT: umov w9, v0.h[1]
-; CHECK-NEXT: umov w8, v0.h[0]
+; CHECK-NEXT: xtn v2.4h, v0.4s
+; CHECK-NEXT: umov w9, v2.h[1]
+; CHECK-NEXT: umov w8, v2.h[0]
; CHECK-NEXT: and w9, w9, #0x1
+; CHECK-NEXT: shrn v1.4h, v0.4s, #16
; CHECK-NEXT: bfi w8, w9, #1, #1
-; CHECK-NEXT: umov w9, v0.h[2]
-; CHECK-NEXT: ushr v2.4h, v0.4h, #1
+; CHECK-NEXT: umov w9, v2.h[2]
+; CHECK-NEXT: cmeq v0.4h, v1.4h, #0
+; CHECK-NEXT: ushr v1.4h, v2.4h, #1
; CHECK-NEXT: and w9, w9, #0x1
+; CHECK-NEXT: cmtst v1.4h, v1.4h, v1.4h
; CHECK-NEXT: bfi w8, w9, #2, #1
-; CHECK-NEXT: umov w9, v0.h[3]
-; CHECK-NEXT: cmeq v0.4h, v2.4h, #0
-; CHECK-NEXT: cmeq v1.4h, v1.4h, #0
-; CHECK-NEXT: mvn v0.8b, v0.8b
-; CHECK-NEXT: orn v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: umov w9, v2.h[3]
+; CHECK-NEXT: orn v0.8b, v1.8b, v0.8b
; CHECK-NEXT: bfi w8, w9, #3, #29
; CHECK-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-NEXT: and w8, w8, #0xf
More information about the llvm-commits mailing list