[llvm] [AArch64] Use i32 extract from UADDV in popcount lowering. (PR #140718)
David Green via llvm-commits
llvm-commits at lists.llvm.org
Tue May 20 04:59:08 PDT 2025
https://github.com/davemgreen created https://github.com/llvm/llvm-project/pull/140718
We need the top bits to be zeroes, but a v8i8->i32 EXTRACT_VECTOR_ELT will
anyext into the top bits. The result of the instruction we create (UADDV) is
known to have zeroes in the upper bits, so we can NVCAST it to a v2i32 vector
(same width, wider elements) and extract the i32 from there, similar to the
v1i64 extract currently performed for i64 types.
Fixes #140707
>From e233002607d80de90f76db7d38b8545bdc118571 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Tue, 20 May 2025 12:32:31 +0100
Subject: [PATCH 1/2] [AArch64] Add a test for extract from ctpop, #140707. NFC
---
llvm/test/CodeGen/AArch64/popcount.ll | 109 ++++++++++++++++++++++++++
1 file changed, 109 insertions(+)
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
index e664e73594923..560b5ce6e28fb 100644
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -648,4 +648,113 @@ Entry:
ret <4 x i16> %1
}
+define i32 @ctpop_into_extract(ptr %p) {
+; CHECKO0-LABEL: ctpop_into_extract:
+; CHECKO0: // %bb.0:
+; CHECKO0-NEXT: mov w8, #-1 // =0xffffffff
+; CHECKO0-NEXT: // implicit-def: $d1
+; CHECKO0-NEXT: // implicit-def: $q0
+; CHECKO0-NEXT: fmov d0, d1
+; CHECKO0-NEXT: mov v0.s[0], w8
+; CHECKO0-NEXT: fmov d2, d0
+; CHECKO0-NEXT: ldr d0, [x0]
+; CHECKO0-NEXT: fmov s1, s0
+; CHECKO0-NEXT: fmov w8, s1
+; CHECKO0-NEXT: fmov s1, w8
+; CHECKO0-NEXT: // kill: def $d1 killed $s1
+; CHECKO0-NEXT: cnt v1.8b, v1.8b
+; CHECKO0-NEXT: uaddlv h1, v1.8b
+; CHECKO0-NEXT: // kill: def $q1 killed $h1
+; CHECKO0-NEXT: // kill: def $s1 killed $s1 killed $q1
+; CHECKO0-NEXT: fmov w8, s1
+; CHECKO0-NEXT: // implicit-def: $q1
+; CHECKO0-NEXT: fmov d1, d2
+; CHECKO0-NEXT: mov v1.s[1], w8
+; CHECKO0-NEXT: // kill: def $d1 killed $d1 killed $q1
+; CHECKO0-NEXT: sub v0.2s, v0.2s, v1.2s
+; CHECKO0-NEXT: str d0, [x0]
+; CHECKO0-NEXT: mov w0, wzr
+; CHECKO0-NEXT: ret
+;
+; CHECK-LABEL: ctpop_into_extract:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: movi v2.2d, #0xffffffffffffffff
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: cnt v1.8b, v1.8b
+; CHECK-NEXT: addv b1, v1.8b
+; CHECK-NEXT: mov v2.b[4], v1.b[0]
+; CHECK-NEXT: sub v0.2s, v0.2s, v2.2s
+; CHECK-NEXT: str d0, [x8]
+; CHECK-NEXT: ret
+;
+; BE-LABEL: ctpop_into_extract:
+; BE: // %bb.0:
+; BE-NEXT: ld1 { v0.2s }, [x0]
+; BE-NEXT: movi v2.2d, #0xffffffffffffffff
+; BE-NEXT: mov x8, x0
+; BE-NEXT: mov w0, wzr
+; BE-NEXT: fmov w9, s0
+; BE-NEXT: fmov s1, w9
+; BE-NEXT: cnt v1.8b, v1.8b
+; BE-NEXT: addv b1, v1.8b
+; BE-NEXT: mov v2.b[4], v1.b[0]
+; BE-NEXT: sub v0.2s, v0.2s, v2.2s
+; BE-NEXT: st1 { v0.2s }, [x8]
+; BE-NEXT: ret
+;
+; GISEL-LABEL: ctpop_into_extract:
+; GISEL: // %bb.0:
+; GISEL-NEXT: ldr d0, [x0]
+; GISEL-NEXT: mov w9, #-1 // =0xffffffff
+; GISEL-NEXT: mov x8, x0
+; GISEL-NEXT: mov v2.s[0], w9
+; GISEL-NEXT: mov w0, wzr
+; GISEL-NEXT: fmov w10, s0
+; GISEL-NEXT: fmov s1, w10
+; GISEL-NEXT: cnt v1.8b, v1.8b
+; GISEL-NEXT: uaddlv h1, v1.8b
+; GISEL-NEXT: mov v2.s[1], v1.s[0]
+; GISEL-NEXT: sub v0.2s, v0.2s, v2.2s
+; GISEL-NEXT: str d0, [x8]
+; GISEL-NEXT: ret
+;
+; GISELO0-LABEL: ctpop_into_extract:
+; GISELO0: // %bb.0:
+; GISELO0-NEXT: mov w8, #-1 // =0xffffffff
+; GISELO0-NEXT: // implicit-def: $d1
+; GISELO0-NEXT: // implicit-def: $q0
+; GISELO0-NEXT: fmov d0, d1
+; GISELO0-NEXT: mov v0.s[0], w8
+; GISELO0-NEXT: fmov d2, d0
+; GISELO0-NEXT: ldr d0, [x0]
+; GISELO0-NEXT: fmov s1, s0
+; GISELO0-NEXT: fmov w8, s1
+; GISELO0-NEXT: fmov s1, w8
+; GISELO0-NEXT: // kill: def $d1 killed $s1
+; GISELO0-NEXT: cnt v1.8b, v1.8b
+; GISELO0-NEXT: uaddlv h1, v1.8b
+; GISELO0-NEXT: // kill: def $q1 killed $h1
+; GISELO0-NEXT: // kill: def $s1 killed $s1 killed $q1
+; GISELO0-NEXT: fmov w8, s1
+; GISELO0-NEXT: // implicit-def: $q1
+; GISELO0-NEXT: fmov d1, d2
+; GISELO0-NEXT: mov v1.s[1], w8
+; GISELO0-NEXT: // kill: def $d1 killed $d1 killed $q1
+; GISELO0-NEXT: sub v0.2s, v0.2s, v1.2s
+; GISELO0-NEXT: str d0, [x0]
+; GISELO0-NEXT: mov w0, wzr
+; GISELO0-NEXT: ret
+ %1 = load <2 x i32>, ptr %p, align 4
+ %2 = extractelement <2 x i32> %1, i64 0
+ %3 = call i32 @llvm.ctpop.i32(i32 %2)
+ %4 = insertelement <2 x i32> <i32 -1, i32 poison>, i32 %3, i64 1
+ %5 = sub <2 x i32> %1, %4
+ store <2 x i32> %5, ptr %p, align 4
+ ret i32 0
+}
+
declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>)
>From 8789d19ddc0ec297d7c03edcc1a990bb4221da27 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Tue, 20 May 2025 12:56:27 +0100
Subject: [PATCH 2/2] [AArch64] Use i32 extract from UADDV in popcount
lowering.
We need the top bits to be zeroes, but a v8i8->i32 EXTRACT_VECTOR_ELT will
anyext into the top bits. The result of the instruction we create (UADDV) is
known to have zeroes in the upper bits, so we can NVCAST it to a v2i32 vector
(same width, wider elements) and extract the i32 from there, similar to the
v1i64 extract currently performed for i64 types.
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 11 ++++-------
llvm/test/CodeGen/AArch64/popcount.ll | 4 ++--
2 files changed, 6 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 293292d47dd48..64a422a195437 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10852,13 +10852,10 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v8i8, CtPop);
- if (VT == MVT::i32)
- AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, AddV,
- DAG.getConstant(0, DL, MVT::i64));
- else
- AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
- DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, AddV),
- DAG.getConstant(0, DL, MVT::i64));
+ AddV = DAG.getNode(AArch64ISD::NVCAST, DL,
+ VT == MVT::i32 ? MVT::v2i32 : MVT::v1i64, AddV);
+ AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, AddV,
+ DAG.getConstant(0, DL, MVT::i64));
if (IsParity)
AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
return AddV;
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
index 560b5ce6e28fb..61f221988777f 100644
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -686,7 +686,7 @@ define i32 @ctpop_into_extract(ptr %p) {
; CHECK-NEXT: fmov s1, w9
; CHECK-NEXT: cnt v1.8b, v1.8b
; CHECK-NEXT: addv b1, v1.8b
-; CHECK-NEXT: mov v2.b[4], v1.b[0]
+; CHECK-NEXT: mov v2.s[1], v1.s[0]
; CHECK-NEXT: sub v0.2s, v0.2s, v2.2s
; CHECK-NEXT: str d0, [x8]
; CHECK-NEXT: ret
@@ -701,7 +701,7 @@ define i32 @ctpop_into_extract(ptr %p) {
; BE-NEXT: fmov s1, w9
; BE-NEXT: cnt v1.8b, v1.8b
; BE-NEXT: addv b1, v1.8b
-; BE-NEXT: mov v2.b[4], v1.b[0]
+; BE-NEXT: mov v2.s[1], v1.s[0]
; BE-NEXT: sub v0.2s, v0.2s, v2.2s
; BE-NEXT: st1 { v0.2s }, [x8]
; BE-NEXT: ret
More information about the llvm-commits
mailing list