[llvm] [AArch64] Use i32 extract from UADDV in popcount lowering. (PR #140718)
via llvm-commits
llvm-commits at lists.llvm.org
Tue May 20 04:59:44 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: David Green (davemgreen)
<details>
<summary>Changes</summary>
We need the top bits to be zero, but a v8i8->i32 EXTRACT_VECTOR_ELT will
anyext into the top bits. The result of the instruction we create (UADDV) is
known to have zeroes in its upper bits, so we can NVCAST it to a larger v2i32
vector and extract from there, similar to what is already done for i64 types.
Fixes #<!-- -->140707
---
Full diff: https://github.com/llvm/llvm-project/pull/140718.diff
2 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+4-7)
- (modified) llvm/test/CodeGen/AArch64/popcount.ll (+109)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 293292d47dd48..64a422a195437 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10852,13 +10852,10 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v8i8, CtPop);
- if (VT == MVT::i32)
- AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, AddV,
- DAG.getConstant(0, DL, MVT::i64));
- else
- AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
- DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, AddV),
- DAG.getConstant(0, DL, MVT::i64));
+ AddV = DAG.getNode(AArch64ISD::NVCAST, DL,
+ VT == MVT::i32 ? MVT::v2i32 : MVT::v1i64, AddV);
+ AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, AddV,
+ DAG.getConstant(0, DL, MVT::i64));
if (IsParity)
AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
return AddV;
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
index e664e73594923..61f221988777f 100644
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -648,4 +648,113 @@ Entry:
ret <4 x i16> %1
}
+define i32 @ctpop_into_extract(ptr %p) {
+; CHECKO0-LABEL: ctpop_into_extract:
+; CHECKO0: // %bb.0:
+; CHECKO0-NEXT: mov w8, #-1 // =0xffffffff
+; CHECKO0-NEXT: // implicit-def: $d1
+; CHECKO0-NEXT: // implicit-def: $q0
+; CHECKO0-NEXT: fmov d0, d1
+; CHECKO0-NEXT: mov v0.s[0], w8
+; CHECKO0-NEXT: fmov d2, d0
+; CHECKO0-NEXT: ldr d0, [x0]
+; CHECKO0-NEXT: fmov s1, s0
+; CHECKO0-NEXT: fmov w8, s1
+; CHECKO0-NEXT: fmov s1, w8
+; CHECKO0-NEXT: // kill: def $d1 killed $s1
+; CHECKO0-NEXT: cnt v1.8b, v1.8b
+; CHECKO0-NEXT: uaddlv h1, v1.8b
+; CHECKO0-NEXT: // kill: def $q1 killed $h1
+; CHECKO0-NEXT: // kill: def $s1 killed $s1 killed $q1
+; CHECKO0-NEXT: fmov w8, s1
+; CHECKO0-NEXT: // implicit-def: $q1
+; CHECKO0-NEXT: fmov d1, d2
+; CHECKO0-NEXT: mov v1.s[1], w8
+; CHECKO0-NEXT: // kill: def $d1 killed $d1 killed $q1
+; CHECKO0-NEXT: sub v0.2s, v0.2s, v1.2s
+; CHECKO0-NEXT: str d0, [x0]
+; CHECKO0-NEXT: mov w0, wzr
+; CHECKO0-NEXT: ret
+;
+; CHECK-LABEL: ctpop_into_extract:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: movi v2.2d, #0xffffffffffffffff
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: cnt v1.8b, v1.8b
+; CHECK-NEXT: addv b1, v1.8b
+; CHECK-NEXT: mov v2.s[1], v1.s[0]
+; CHECK-NEXT: sub v0.2s, v0.2s, v2.2s
+; CHECK-NEXT: str d0, [x8]
+; CHECK-NEXT: ret
+;
+; BE-LABEL: ctpop_into_extract:
+; BE: // %bb.0:
+; BE-NEXT: ld1 { v0.2s }, [x0]
+; BE-NEXT: movi v2.2d, #0xffffffffffffffff
+; BE-NEXT: mov x8, x0
+; BE-NEXT: mov w0, wzr
+; BE-NEXT: fmov w9, s0
+; BE-NEXT: fmov s1, w9
+; BE-NEXT: cnt v1.8b, v1.8b
+; BE-NEXT: addv b1, v1.8b
+; BE-NEXT: mov v2.s[1], v1.s[0]
+; BE-NEXT: sub v0.2s, v0.2s, v2.2s
+; BE-NEXT: st1 { v0.2s }, [x8]
+; BE-NEXT: ret
+;
+; GISEL-LABEL: ctpop_into_extract:
+; GISEL: // %bb.0:
+; GISEL-NEXT: ldr d0, [x0]
+; GISEL-NEXT: mov w9, #-1 // =0xffffffff
+; GISEL-NEXT: mov x8, x0
+; GISEL-NEXT: mov v2.s[0], w9
+; GISEL-NEXT: mov w0, wzr
+; GISEL-NEXT: fmov w10, s0
+; GISEL-NEXT: fmov s1, w10
+; GISEL-NEXT: cnt v1.8b, v1.8b
+; GISEL-NEXT: uaddlv h1, v1.8b
+; GISEL-NEXT: mov v2.s[1], v1.s[0]
+; GISEL-NEXT: sub v0.2s, v0.2s, v2.2s
+; GISEL-NEXT: str d0, [x8]
+; GISEL-NEXT: ret
+;
+; GISELO0-LABEL: ctpop_into_extract:
+; GISELO0: // %bb.0:
+; GISELO0-NEXT: mov w8, #-1 // =0xffffffff
+; GISELO0-NEXT: // implicit-def: $d1
+; GISELO0-NEXT: // implicit-def: $q0
+; GISELO0-NEXT: fmov d0, d1
+; GISELO0-NEXT: mov v0.s[0], w8
+; GISELO0-NEXT: fmov d2, d0
+; GISELO0-NEXT: ldr d0, [x0]
+; GISELO0-NEXT: fmov s1, s0
+; GISELO0-NEXT: fmov w8, s1
+; GISELO0-NEXT: fmov s1, w8
+; GISELO0-NEXT: // kill: def $d1 killed $s1
+; GISELO0-NEXT: cnt v1.8b, v1.8b
+; GISELO0-NEXT: uaddlv h1, v1.8b
+; GISELO0-NEXT: // kill: def $q1 killed $h1
+; GISELO0-NEXT: // kill: def $s1 killed $s1 killed $q1
+; GISELO0-NEXT: fmov w8, s1
+; GISELO0-NEXT: // implicit-def: $q1
+; GISELO0-NEXT: fmov d1, d2
+; GISELO0-NEXT: mov v1.s[1], w8
+; GISELO0-NEXT: // kill: def $d1 killed $d1 killed $q1
+; GISELO0-NEXT: sub v0.2s, v0.2s, v1.2s
+; GISELO0-NEXT: str d0, [x0]
+; GISELO0-NEXT: mov w0, wzr
+; GISELO0-NEXT: ret
+ %1 = load <2 x i32>, ptr %p, align 4
+ %2 = extractelement <2 x i32> %1, i64 0
+ %3 = call i32 @llvm.ctpop.i32(i32 %2)
+ %4 = insertelement <2 x i32> <i32 -1, i32 poison>, i32 %3, i64 1
+ %5 = sub <2 x i32> %1, %4
+ store <2 x i32> %5, ptr %p, align 4
+ ret i32 0
+}
+
declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>)
``````````
</details>
https://github.com/llvm/llvm-project/pull/140718
More information about the llvm-commits
mailing list