[llvm] [AArch64] Replace uaddlv with addv for popcount operation (PR #121934)
Usha Gupta via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 8 08:22:56 PST 2025
https://github.com/usha1830 updated https://github.com/llvm/llvm-project/pull/121934
>From 0b1fc1c9ae3b96a93c45c53afe6c1f798d5a3c13 Mon Sep 17 00:00:00 2001
From: Usha Gupta <usha.gupta at arm.com>
Date: Mon, 6 Jan 2025 18:22:36 +0000
Subject: [PATCH] Replace uaddlv with addv for CTPOP operation as it is a
simpler operation and also saves a few cycles on certain platforms like
Cortex-A510.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 29 ++++-----
llvm/test/CodeGen/AArch64/arm64-popcnt.ll | 8 +--
llvm/test/CodeGen/AArch64/dp1.ll | 61 +++++++++++++++----
llvm/test/CodeGen/AArch64/parity.ll | 16 ++---
llvm/test/CodeGen/AArch64/popcount.ll | 16 +++--
5 files changed, 78 insertions(+), 52 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ef00b092fe5e06..4b762d8a8517df 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10700,37 +10700,30 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
// FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
// CNT V0.8B, V0.8B // 8xbyte pop-counts
// ADDV B0, V0.8B // sum 8xbyte pop-counts
- // UMOV X0, V0.B[0] // copy byte result back to integer reg
+ // FMOV X0, D0 // copy result back to integer reg
if (VT == MVT::i32 || VT == MVT::i64) {
if (VT == MVT::i32)
Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
- SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
- UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
+ SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v8i8, CtPop);
+ if (VT == MVT::i32)
+ AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, AddV,
DAG.getConstant(0, DL, MVT::i64));
-
+ AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV);
if (IsParity)
- UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
- DAG.getConstant(1, DL, MVT::i32));
-
- if (VT == MVT::i64)
- UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
- return UaddLV;
+ AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
+ return AddV;
} else if (VT == MVT::i128) {
Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
- SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
- UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
- DAG.getConstant(0, DL, MVT::i64));
-
+ SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop);
+ AddV = DAG.getNode(ISD::BITCAST, DL, VT, AddV);
if (IsParity)
- UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
- DAG.getConstant(1, DL, MVT::i32));
-
- return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
+ AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
+ return AddV;
}
assert(!IsParity && "ISD::PARITY of vector types not supported");
diff --git a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
index 0030e9ce80abb4..ad0904ff980806 100644
--- a/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
@@ -8,7 +8,7 @@ define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s0, w0
; CHECK-NEXT: cnt.8b v0, v0
-; CHECK-NEXT: uaddlv.8b h0, v0
+; CHECK-NEXT: addv.8b b0, v0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
;
@@ -43,7 +43,7 @@ define i32 @cnt32_advsimd_2(<2 x i32> %x) {
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: cnt.8b v0, v0
-; CHECK-NEXT: uaddlv.8b h0, v0
+; CHECK-NEXT: addv.8b b0, v0
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
;
@@ -79,8 +79,8 @@ define i64 @cnt64_advsimd(i64 %x) nounwind readnone {
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: cnt.8b v0, v0
-; CHECK-NEXT: uaddlv.8b h0, v0
-; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: addv.8b b0, v0
+; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
;
; CHECK-NONEON-LABEL: cnt64_advsimd:
diff --git a/llvm/test/CodeGen/AArch64/dp1.ll b/llvm/test/CodeGen/AArch64/dp1.ll
index 949dad7798a6ca..38c19b107fd68c 100644
--- a/llvm/test/CodeGen/AArch64/dp1.ll
+++ b/llvm/test/CodeGen/AArch64/dp1.ll
@@ -197,16 +197,28 @@ define void @cttz_zeroundef_i64() {
}
define void @ctpop_i32() {
-; CHECK-LABEL: ctpop_i32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, :got:var32
-; CHECK-NEXT: ldr x8, [x8, :got_lo12:var32]
-; CHECK-NEXT: ldr w9, [x8]
-; CHECK-NEXT: fmov d0, x9
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: uaddlv h0, v0.8b
-; CHECK-NEXT: str s0, [x8]
-; CHECK-NEXT: ret
+; CHECK-SDAG-LABEL: ctpop_i32:
+; CHECK-SDAG: // %bb.0:
+; CHECK-SDAG-NEXT: adrp x8, :got:var32
+; CHECK-SDAG-NEXT: ldr x8, [x8, :got_lo12:var32]
+; CHECK-SDAG-NEXT: ldr w9, [x8]
+; CHECK-SDAG-NEXT: fmov d0, x9
+; CHECK-SDAG-NEXT: cnt v0.8b, v0.8b
+; CHECK-SDAG-NEXT: addv b0, v0.8b
+; CHECK-SDAG-NEXT: fmov w9, s0
+; CHECK-SDAG-NEXT: str w9, [x8]
+; CHECK-SDAG-NEXT: ret
+;
+; CHECK-GISEL-LABEL: ctpop_i32:
+; CHECK-GISEL: // %bb.0:
+; CHECK-GISEL-NEXT: adrp x8, :got:var32
+; CHECK-GISEL-NEXT: ldr x8, [x8, :got_lo12:var32]
+; CHECK-GISEL-NEXT: ldr w9, [x8]
+; CHECK-GISEL-NEXT: fmov d0, x9
+; CHECK-GISEL-NEXT: cnt v0.8b, v0.8b
+; CHECK-GISEL-NEXT: uaddlv h0, v0.8b
+; CHECK-GISEL-NEXT: str s0, [x8]
+; CHECK-GISEL-NEXT: ret
%val0_tmp = load i32, ptr @var32
%val4_tmp = call i32 @llvm.ctpop.i32(i32 %val0_tmp)
store volatile i32 %val4_tmp, ptr @var32
@@ -220,9 +232,8 @@ define void @ctpop_i64() {
; CHECK-SDAG-NEXT: ldr x8, [x8, :got_lo12:var64]
; CHECK-SDAG-NEXT: ldr d0, [x8]
; CHECK-SDAG-NEXT: cnt v0.8b, v0.8b
-; CHECK-SDAG-NEXT: uaddlv h0, v0.8b
-; CHECK-SDAG-NEXT: fmov w9, s0
-; CHECK-SDAG-NEXT: str x9, [x8]
+; CHECK-SDAG-NEXT: addv b0, v0.8b
+; CHECK-SDAG-NEXT: str d0, [x8]
; CHECK-SDAG-NEXT: ret
;
; CHECK-GISEL-LABEL: ctpop_i64:
@@ -243,6 +254,30 @@ define void @ctpop_i64() {
}
+define i64 @popcnt(i64 %a, ptr %p) {
+; CHECK-SDAG-LABEL: popcnt:
+; CHECK-SDAG: // %bb.0:
+; CHECK-SDAG-NEXT: fmov d0, x0
+; CHECK-SDAG-NEXT: mov x0, xzr
+; CHECK-SDAG-NEXT: cnt v0.8b, v0.8b
+; CHECK-SDAG-NEXT: addv b0, v0.8b
+; CHECK-SDAG-NEXT: str d0, [x1]
+; CHECK-SDAG-NEXT: ret
+;
+; CHECK-GISEL-LABEL: popcnt:
+; CHECK-GISEL: // %bb.0:
+; CHECK-GISEL-NEXT: fmov d0, x0
+; CHECK-GISEL-NEXT: mov x0, xzr
+; CHECK-GISEL-NEXT: cnt v0.8b, v0.8b
+; CHECK-GISEL-NEXT: uaddlv h0, v0.8b
+; CHECK-GISEL-NEXT: mov w8, v0.s[0]
+; CHECK-GISEL-NEXT: str x8, [x1]
+; CHECK-GISEL-NEXT: ret
+ %2 = call i64 @llvm.ctpop(i64 %a)
+ store i64 %2, ptr %p
+ ret i64 0
+}
+
declare i32 @llvm.bswap.i32(i32)
declare i64 @llvm.bswap.i64(i64)
declare i32 @llvm.ctlz.i32 (i32, i1)
diff --git a/llvm/test/CodeGen/AArch64/parity.ll b/llvm/test/CodeGen/AArch64/parity.ll
index 19dd185a6cb785..1e51793fb5f91b 100644
--- a/llvm/test/CodeGen/AArch64/parity.ll
+++ b/llvm/test/CodeGen/AArch64/parity.ll
@@ -114,9 +114,9 @@ define i64 @parity_64(i64 %x) {
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: uaddlv h0, v0.8b
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and w0, w8, #0x1
+; CHECK-NEXT: addv b0, v0.8b
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: and x0, x8, #0x1
; CHECK-NEXT: ret
;
; CHECK-CSSC-LABEL: parity_64:
@@ -136,9 +136,9 @@ define i128 @parity_128(i128 %x) {
; CHECK-NEXT: mov v0.d[1], x1
; CHECK-NEXT: mov x1, xzr
; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlv h0, v0.16b
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and w0, w8, #0x1
+; CHECK-NEXT: addv b0, v0.16b
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: and x0, x8, #0x1
; CHECK-NEXT: ret
;
; CHECK-CSSC-LABEL: parity_128:
@@ -158,8 +158,8 @@ define i32 @parity_64_trunc(i64 %x) {
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: uaddlv h0, v0.8b
-; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: addv b0, v0.8b
+; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: and w0, w8, #0x1
; CHECK-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
index f9f1cd4b1fcf76..89b1ac0a0edf18 100644
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -28,7 +28,7 @@ define i8 @popcount128(ptr nocapture nonnull readonly %0) {
; CHECK-NEXT: add x8, x0, #8
; CHECK-NEXT: ld1 { v0.d }[1], [x8]
; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlv h0, v0.16b
+; CHECK-NEXT: addv b0, v0.16b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
;
@@ -104,8 +104,8 @@ define i16 @popcount256(ptr nocapture nonnull readonly %0) {
; CHECK-NEXT: ld1 { v1.d }[1], [x8]
; CHECK-NEXT: cnt v0.16b, v0.16b
; CHECK-NEXT: cnt v1.16b, v1.16b
-; CHECK-NEXT: uaddlv h0, v0.16b
-; CHECK-NEXT: uaddlv h1, v1.16b
+; CHECK-NEXT: addv b0, v0.16b
+; CHECK-NEXT: addv b1, v1.16b
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: fmov w9, s1
; CHECK-NEXT: add w0, w9, w8
@@ -191,12 +191,10 @@ define <1 x i128> @popcount1x128(<1 x i128> %0) {
;
; CHECK-LABEL: popcount1x128:
; CHECK: // %bb.0: // %Entry
-; CHECK-NEXT: fmov d1, x0
-; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: mov v1.d[1], x1
-; CHECK-NEXT: cnt v1.16b, v1.16b
-; CHECK-NEXT: uaddlv h1, v1.16b
-; CHECK-NEXT: mov v0.s[0], v1.s[0]
+; CHECK-NEXT: fmov d0, x0
+; CHECK-NEXT: mov v0.d[1], x1
+; CHECK-NEXT: cnt v0.16b, v0.16b
+; CHECK-NEXT: addv b0, v0.16b
; CHECK-NEXT: mov x1, v0.d[1]
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
More information about the llvm-commits
mailing list