[llvm] [AArch64] Improve scalar and Neon popcount with SVE CNT. (PR #143870)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 12 04:02:55 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Ricardo Jesus (rj-jesus)
<details>
<summary>Changes</summary>
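When SVE is available, we can use its CNT instruction to improve the lowering of scalar and fixed-length vector popcount (CTPOP), since SVE's CNT operates directly on 16-, 32- and 64-bit elements, whereas the Neon variant only counts bits per byte and needs extra widening additions to form the wider results.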
For the scalar types, I see the following speedups on an NVIDIA Grace CPU:
| size (bits) | before (Gibit/s) | after (Gibit/s) | speedup |
|-----:|----------:|----------:|--------:|
| 32 | 75.20 | 86.79 | 1.15 |
| 64 | 149.87 | 173.70 | 1.16 |
| 128 | 158.56 | 164.88 | 1.04 |
---
Full diff: https://github.com/llvm/llvm-project/pull/143870.diff
3 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+33-5)
- (modified) llvm/test/CodeGen/AArch64/popcount.ll (+227-75)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll (+24-24)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9f51caef6d228..6c4ca9e4e5233 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10614,13 +10614,13 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
return SDValue();
EVT VT = Op.getValueType();
- if (VT.isScalableVector() ||
- useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
+ assert((!Subtarget->isNeonAvailable() ||
+ (VT != MVT::v8i8 && VT != MVT::v16i8)) &&
+ "Unexpected custom lowering for B vectors with Neon available.");
+ bool OverrideNEON = !Subtarget->isNeonAvailable() || VT.isFixedLengthVector();
+ if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
- if (!Subtarget->isNeonAvailable())
- return SDValue();
-
bool IsParity = Op.getOpcode() == ISD::PARITY;
SDValue Val = Op.getOperand(0);
SDLoc DL(Op);
@@ -10630,6 +10630,34 @@ SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
if (VT == MVT::i32 && IsParity)
return SDValue();
+ if (Subtarget->isSVEorStreamingSVEAvailable()) {
+ assert((VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i128) &&
+ "Unexpected type for custom ctpop lowering.");
+ if (VT == MVT::i32 || VT == MVT::i64) {
+ EVT ContainerVT = VT == MVT::i32 ? MVT::nxv4i32 : MVT::nxv2i64;
+ Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
+ DAG.getUNDEF(ContainerVT), Val,
+ DAG.getVectorIdxConstant(0, DL));
+ Val = DAG.getNode(ISD::CTPOP, DL, ContainerVT, Val);
+ Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Val,
+ DAG.getVectorIdxConstant(0, DL));
+ } else if (VT == MVT::i128) {
+ Val = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Val);
+ Val = convertToScalableVector(DAG, MVT::nxv2i64, Val);
+ Val = DAG.getNode(ISD::CTPOP, DL, MVT::nxv2i64, Val);
+ Val = convertFromScalableVector(DAG, MVT::v2i64, Val);
+ Val = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i64, Val);
+ Val = DAG.getZExtOrTrunc(Val, DL, VT);
+ } else
+ llvm_unreachable("Unexpected type!");
+ if (IsParity)
+ Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
+ return Val;
+ }
+
+ if (!Subtarget->isNeonAvailable())
+ return SDValue();
+
// If there is no CNT instruction available, GPR popcount can
// be more efficiently lowered to the following sequence that uses
// AdvSIMD registers/instructions as long as the copies to/from
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
index 61f221988777f..e4d9c62664b58 100644
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -23,15 +23,36 @@ define i8 @popcount128(ptr nocapture nonnull readonly %0) {
; CHECKO0-NEXT: fmov w0, s0
; CHECKO0-NEXT: ret
;
-; CHECK-LABEL: popcount128:
-; CHECK: // %bb.0: // %Entry
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: add x8, x0, #8
-; CHECK-NEXT: ld1 { v0.d }[1], [x8]
-; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: addv b0, v0.16b
-; CHECK-NEXT: fmov w0, s0
-; CHECK-NEXT: ret
+; NEON-LABEL: popcount128:
+; NEON: // %bb.0: // %Entry
+; NEON-NEXT: ldr d0, [x0]
+; NEON-NEXT: add x8, x0, #8
+; NEON-NEXT: ld1 { v0.d }[1], [x8]
+; NEON-NEXT: cnt v0.16b, v0.16b
+; NEON-NEXT: addv b0, v0.16b
+; NEON-NEXT: fmov w0, s0
+; NEON-NEXT: ret
+;
+; DOT-LABEL: popcount128:
+; DOT: // %bb.0: // %Entry
+; DOT-NEXT: ldr d0, [x0]
+; DOT-NEXT: add x8, x0, #8
+; DOT-NEXT: ld1 { v0.d }[1], [x8]
+; DOT-NEXT: cnt v0.16b, v0.16b
+; DOT-NEXT: addv b0, v0.16b
+; DOT-NEXT: fmov w0, s0
+; DOT-NEXT: ret
+;
+; SVE-LABEL: popcount128:
+; SVE: // %bb.0: // %Entry
+; SVE-NEXT: ldr d0, [x0]
+; SVE-NEXT: add x8, x0, #8
+; SVE-NEXT: ptrue p0.d
+; SVE-NEXT: ld1 { v0.d }[1], [x8]
+; SVE-NEXT: cnt z0.d, p0/m, z0.d
+; SVE-NEXT: addp d0, v0.2d
+; SVE-NEXT: fmov w0, s0
+; SVE-NEXT: ret
;
; BE-LABEL: popcount128:
; BE: // %bb.0: // %Entry
@@ -107,22 +128,55 @@ define i16 @popcount256(ptr nocapture nonnull readonly %0) {
; CHECKO0-NEXT: mov w0, w8
; CHECKO0-NEXT: ret
;
-; CHECK-LABEL: popcount256:
-; CHECK: // %bb.0: // %Entry
-; CHECK-NEXT: ldr d0, [x0, #16]
-; CHECK-NEXT: ldr d1, [x0]
-; CHECK-NEXT: add x8, x0, #8
-; CHECK-NEXT: add x9, x0, #24
-; CHECK-NEXT: ld1 { v0.d }[1], [x9]
-; CHECK-NEXT: ld1 { v1.d }[1], [x8]
-; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: cnt v1.16b, v1.16b
-; CHECK-NEXT: addv b0, v0.16b
-; CHECK-NEXT: addv b1, v1.16b
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: fmov w9, s1
-; CHECK-NEXT: add w0, w9, w8
-; CHECK-NEXT: ret
+; NEON-LABEL: popcount256:
+; NEON: // %bb.0: // %Entry
+; NEON-NEXT: ldr d0, [x0, #16]
+; NEON-NEXT: ldr d1, [x0]
+; NEON-NEXT: add x8, x0, #8
+; NEON-NEXT: add x9, x0, #24
+; NEON-NEXT: ld1 { v0.d }[1], [x9]
+; NEON-NEXT: ld1 { v1.d }[1], [x8]
+; NEON-NEXT: cnt v0.16b, v0.16b
+; NEON-NEXT: cnt v1.16b, v1.16b
+; NEON-NEXT: addv b0, v0.16b
+; NEON-NEXT: addv b1, v1.16b
+; NEON-NEXT: fmov w8, s0
+; NEON-NEXT: fmov w9, s1
+; NEON-NEXT: add w0, w9, w8
+; NEON-NEXT: ret
+;
+; DOT-LABEL: popcount256:
+; DOT: // %bb.0: // %Entry
+; DOT-NEXT: ldr d0, [x0, #16]
+; DOT-NEXT: ldr d1, [x0]
+; DOT-NEXT: add x8, x0, #8
+; DOT-NEXT: add x9, x0, #24
+; DOT-NEXT: ld1 { v0.d }[1], [x9]
+; DOT-NEXT: ld1 { v1.d }[1], [x8]
+; DOT-NEXT: cnt v0.16b, v0.16b
+; DOT-NEXT: cnt v1.16b, v1.16b
+; DOT-NEXT: addv b0, v0.16b
+; DOT-NEXT: addv b1, v1.16b
+; DOT-NEXT: fmov w8, s0
+; DOT-NEXT: fmov w9, s1
+; DOT-NEXT: add w0, w9, w8
+; DOT-NEXT: ret
+;
+; SVE-LABEL: popcount256:
+; SVE: // %bb.0: // %Entry
+; SVE-NEXT: ldr d0, [x0, #16]
+; SVE-NEXT: ldr d1, [x0]
+; SVE-NEXT: add x8, x0, #8
+; SVE-NEXT: add x9, x0, #24
+; SVE-NEXT: ptrue p0.d
+; SVE-NEXT: ld1 { v0.d }[1], [x9]
+; SVE-NEXT: ld1 { v1.d }[1], [x8]
+; SVE-NEXT: cnt z0.d, p0/m, z0.d
+; SVE-NEXT: cnt z1.d, p0/m, z1.d
+; SVE-NEXT: add v0.2d, v1.2d, v0.2d
+; SVE-NEXT: addp d0, v0.2d
+; SVE-NEXT: fmov w0, s0
+; SVE-NEXT: ret
;
; BE-LABEL: popcount256:
; BE: // %bb.0: // %Entry
@@ -223,15 +277,36 @@ define <1 x i128> @popcount1x128(<1 x i128> %0) {
; CHECKO0-NEXT: bfi x0, x8, #32, #32
; CHECKO0-NEXT: ret
;
-; CHECK-LABEL: popcount1x128:
-; CHECK: // %bb.0: // %Entry
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: mov v0.d[1], x1
-; CHECK-NEXT: mov x1, xzr
-; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: addv b0, v0.16b
-; CHECK-NEXT: fmov x0, d0
-; CHECK-NEXT: ret
+; NEON-LABEL: popcount1x128:
+; NEON: // %bb.0: // %Entry
+; NEON-NEXT: fmov d0, x0
+; NEON-NEXT: mov v0.d[1], x1
+; NEON-NEXT: mov x1, xzr
+; NEON-NEXT: cnt v0.16b, v0.16b
+; NEON-NEXT: addv b0, v0.16b
+; NEON-NEXT: fmov x0, d0
+; NEON-NEXT: ret
+;
+; DOT-LABEL: popcount1x128:
+; DOT: // %bb.0: // %Entry
+; DOT-NEXT: fmov d0, x0
+; DOT-NEXT: mov v0.d[1], x1
+; DOT-NEXT: mov x1, xzr
+; DOT-NEXT: cnt v0.16b, v0.16b
+; DOT-NEXT: addv b0, v0.16b
+; DOT-NEXT: fmov x0, d0
+; DOT-NEXT: ret
+;
+; SVE-LABEL: popcount1x128:
+; SVE: // %bb.0: // %Entry
+; SVE-NEXT: fmov d0, x0
+; SVE-NEXT: ptrue p0.d
+; SVE-NEXT: mov v0.d[1], x1
+; SVE-NEXT: mov x1, xzr
+; SVE-NEXT: cnt z0.d, p0/m, z0.d
+; SVE-NEXT: addp d0, v0.2d
+; SVE-NEXT: fmov x0, d0
+; SVE-NEXT: ret
;
; BE-LABEL: popcount1x128:
; BE: // %bb.0: // %Entry
@@ -305,10 +380,10 @@ define <2 x i64> @popcount2x64(<2 x i64> %0) {
;
; SVE-LABEL: popcount2x64:
; SVE: // %bb.0: // %Entry
-; SVE-NEXT: cnt v0.16b, v0.16b
-; SVE-NEXT: uaddlp v0.8h, v0.16b
-; SVE-NEXT: uaddlp v0.4s, v0.8h
-; SVE-NEXT: uaddlp v0.2d, v0.4s
+; SVE-NEXT: ptrue p0.d, vl2
+; SVE-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE-NEXT: cnt z0.d, p0/m, z0.d
+; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
; SVE-NEXT: ret
;
; BE-LABEL: popcount2x64:
@@ -374,13 +449,29 @@ define <1 x i64> @popcount1x64(<1 x i64> %0) {
; CHECKO0-NEXT: fmov d0, x8
; CHECKO0-NEXT: ret
;
-; CHECK-LABEL: popcount1x64:
-; CHECK: // %bb.0: // %Entry
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: uaddlp v0.4h, v0.8b
-; CHECK-NEXT: uaddlp v0.2s, v0.4h
-; CHECK-NEXT: uaddlp v0.1d, v0.2s
-; CHECK-NEXT: ret
+; NEON-LABEL: popcount1x64:
+; NEON: // %bb.0: // %Entry
+; NEON-NEXT: cnt v0.8b, v0.8b
+; NEON-NEXT: uaddlp v0.4h, v0.8b
+; NEON-NEXT: uaddlp v0.2s, v0.4h
+; NEON-NEXT: uaddlp v0.1d, v0.2s
+; NEON-NEXT: ret
+;
+; DOT-LABEL: popcount1x64:
+; DOT: // %bb.0: // %Entry
+; DOT-NEXT: cnt v0.8b, v0.8b
+; DOT-NEXT: uaddlp v0.4h, v0.8b
+; DOT-NEXT: uaddlp v0.2s, v0.4h
+; DOT-NEXT: uaddlp v0.1d, v0.2s
+; DOT-NEXT: ret
+;
+; SVE-LABEL: popcount1x64:
+; SVE: // %bb.0: // %Entry
+; SVE-NEXT: ptrue p0.d, vl1
+; SVE-NEXT: // kill: def $d0 killed $d0 def $z0
+; SVE-NEXT: cnt z0.d, p0/m, z0.d
+; SVE-NEXT: // kill: def $d0 killed $d0 killed $z0
+; SVE-NEXT: ret
;
; BE-LABEL: popcount1x64:
; BE: // %bb.0: // %Entry
@@ -442,9 +533,10 @@ define <4 x i32> @popcount4x32(<4 x i32> %0) {
;
; SVE-LABEL: popcount4x32:
; SVE: // %bb.0: // %Entry
-; SVE-NEXT: cnt v0.16b, v0.16b
-; SVE-NEXT: uaddlp v0.8h, v0.16b
-; SVE-NEXT: uaddlp v0.4s, v0.8h
+; SVE-NEXT: ptrue p0.s, vl4
+; SVE-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE-NEXT: cnt z0.s, p0/m, z0.s
+; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
; SVE-NEXT: ret
;
; BE-LABEL: popcount4x32:
@@ -520,9 +612,10 @@ define <2 x i32> @popcount2x32(<2 x i32> %0) {
;
; SVE-LABEL: popcount2x32:
; SVE: // %bb.0: // %Entry
-; SVE-NEXT: cnt v0.8b, v0.8b
-; SVE-NEXT: uaddlp v0.4h, v0.8b
-; SVE-NEXT: uaddlp v0.2s, v0.4h
+; SVE-NEXT: ptrue p0.s, vl2
+; SVE-NEXT: // kill: def $d0 killed $d0 def $z0
+; SVE-NEXT: cnt z0.s, p0/m, z0.s
+; SVE-NEXT: // kill: def $d0 killed $d0 killed $z0
; SVE-NEXT: ret
;
; BE-LABEL: popcount2x32:
@@ -577,11 +670,25 @@ define <8 x i16> @popcount8x16(<8 x i16> %0) {
; CHECKO0-NEXT: uaddlp v0.8h, v0.16b
; CHECKO0-NEXT: ret
;
-; CHECK-LABEL: popcount8x16:
-; CHECK: // %bb.0: // %Entry
-; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
-; CHECK-NEXT: ret
+; NEON-LABEL: popcount8x16:
+; NEON: // %bb.0: // %Entry
+; NEON-NEXT: cnt v0.16b, v0.16b
+; NEON-NEXT: uaddlp v0.8h, v0.16b
+; NEON-NEXT: ret
+;
+; DOT-LABEL: popcount8x16:
+; DOT: // %bb.0: // %Entry
+; DOT-NEXT: cnt v0.16b, v0.16b
+; DOT-NEXT: uaddlp v0.8h, v0.16b
+; DOT-NEXT: ret
+;
+; SVE-LABEL: popcount8x16:
+; SVE: // %bb.0: // %Entry
+; SVE-NEXT: ptrue p0.h, vl8
+; SVE-NEXT: // kill: def $q0 killed $q0 def $z0
+; SVE-NEXT: cnt z0.h, p0/m, z0.h
+; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
+; SVE-NEXT: ret
;
; BE-LABEL: popcount8x16:
; BE: // %bb.0: // %Entry
@@ -618,11 +725,25 @@ define <4 x i16> @popcount4x16(<4 x i16> %0) {
; CHECKO0-NEXT: uaddlp v0.4h, v0.8b
; CHECKO0-NEXT: ret
;
-; CHECK-LABEL: popcount4x16:
-; CHECK: // %bb.0: // %Entry
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: uaddlp v0.4h, v0.8b
-; CHECK-NEXT: ret
+; NEON-LABEL: popcount4x16:
+; NEON: // %bb.0: // %Entry
+; NEON-NEXT: cnt v0.8b, v0.8b
+; NEON-NEXT: uaddlp v0.4h, v0.8b
+; NEON-NEXT: ret
+;
+; DOT-LABEL: popcount4x16:
+; DOT: // %bb.0: // %Entry
+; DOT-NEXT: cnt v0.8b, v0.8b
+; DOT-NEXT: uaddlp v0.4h, v0.8b
+; DOT-NEXT: ret
+;
+; SVE-LABEL: popcount4x16:
+; SVE: // %bb.0: // %Entry
+; SVE-NEXT: ptrue p0.h, vl4
+; SVE-NEXT: // kill: def $d0 killed $d0 def $z0
+; SVE-NEXT: cnt z0.h, p0/m, z0.h
+; SVE-NEXT: // kill: def $d0 killed $d0 killed $z0
+; SVE-NEXT: ret
;
; BE-LABEL: popcount4x16:
; BE: // %bb.0: // %Entry
@@ -676,20 +797,49 @@ define i32 @ctpop_into_extract(ptr %p) {
; CHECKO0-NEXT: mov w0, wzr
; CHECKO0-NEXT: ret
;
-; CHECK-LABEL: ctpop_into_extract:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ldr d0, [x0]
-; CHECK-NEXT: movi v2.2d, #0xffffffffffffffff
-; CHECK-NEXT: mov x8, x0
-; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: fmov s1, w9
-; CHECK-NEXT: cnt v1.8b, v1.8b
-; CHECK-NEXT: addv b1, v1.8b
-; CHECK-NEXT: mov v2.s[1], v1.s[0]
-; CHECK-NEXT: sub v0.2s, v0.2s, v2.2s
-; CHECK-NEXT: str d0, [x8]
-; CHECK-NEXT: ret
+; NEON-LABEL: ctpop_into_extract:
+; NEON: // %bb.0:
+; NEON-NEXT: ldr d0, [x0]
+; NEON-NEXT: movi v2.2d, #0xffffffffffffffff
+; NEON-NEXT: mov x8, x0
+; NEON-NEXT: mov w0, wzr
+; NEON-NEXT: fmov w9, s0
+; NEON-NEXT: fmov s1, w9
+; NEON-NEXT: cnt v1.8b, v1.8b
+; NEON-NEXT: addv b1, v1.8b
+; NEON-NEXT: mov v2.s[1], v1.s[0]
+; NEON-NEXT: sub v0.2s, v0.2s, v2.2s
+; NEON-NEXT: str d0, [x8]
+; NEON-NEXT: ret
+;
+; DOT-LABEL: ctpop_into_extract:
+; DOT: // %bb.0:
+; DOT-NEXT: ldr d0, [x0]
+; DOT-NEXT: movi v2.2d, #0xffffffffffffffff
+; DOT-NEXT: mov x8, x0
+; DOT-NEXT: mov w0, wzr
+; DOT-NEXT: fmov w9, s0
+; DOT-NEXT: fmov s1, w9
+; DOT-NEXT: cnt v1.8b, v1.8b
+; DOT-NEXT: addv b1, v1.8b
+; DOT-NEXT: mov v2.s[1], v1.s[0]
+; DOT-NEXT: sub v0.2s, v0.2s, v2.2s
+; DOT-NEXT: str d0, [x8]
+; DOT-NEXT: ret
+;
+; SVE-LABEL: ctpop_into_extract:
+; SVE: // %bb.0:
+; SVE-NEXT: ldr d0, [x0]
+; SVE-NEXT: ptrue p0.s
+; SVE-NEXT: movi v2.2d, #0xffffffffffffffff
+; SVE-NEXT: mov x8, x0
+; SVE-NEXT: mov w0, wzr
+; SVE-NEXT: movprfx z1, z0
+; SVE-NEXT: cnt z1.s, p0/m, z0.s
+; SVE-NEXT: mov v2.s[1], v1.s[0]
+; SVE-NEXT: sub v0.2s, v0.2s, v2.2s
+; SVE-NEXT: str d0, [x8]
+; SVE-NEXT: ret
;
; BE-LABEL: ctpop_into_extract:
; BE: // %bb.0:
@@ -758,3 +908,5 @@ define i32 @ctpop_into_extract(ptr %p) {
}
declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll
index 1e71c4b66156c..b62b850434469 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll
@@ -457,23 +457,25 @@ define void @ctpop_v256i8(ptr %a) vscale_range(16,0) #0 {
ret void
}
-; Don't use SVE for 64-bit vectors.
define <4 x i16> @ctpop_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: ctpop_v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: uaddlp v0.4h, v0.8b
+; CHECK-NEXT: ptrue p0.h, vl4
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: cnt z0.h, p0/m, z0.h
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%res = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %op)
ret <4 x i16> %res
}
-; Don't use SVE for 128-bit vectors.
define <8 x i16> @ctpop_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: ctpop_v8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
+; CHECK-NEXT: ptrue p0.h, vl8
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: cnt z0.h, p0/m, z0.h
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%res = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %op)
ret <8 x i16> %res
@@ -547,25 +549,25 @@ define void @ctpop_v128i16(ptr %a) vscale_range(16,0) #0 {
ret void
}
-; Don't use SVE for 64-bit vectors.
define <2 x i32> @ctpop_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: ctpop_v2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: uaddlp v0.4h, v0.8b
-; CHECK-NEXT: uaddlp v0.2s, v0.4h
+; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: cnt z0.s, p0/m, z0.s
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%res = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %op)
ret <2 x i32> %res
}
-; Don't use SVE for 128-bit vectors.
define <4 x i32> @ctpop_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: ctpop_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
-; CHECK-NEXT: uaddlp v0.4s, v0.8h
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: cnt z0.s, p0/m, z0.s
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%res = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %op)
ret <4 x i32> %res
@@ -639,27 +641,25 @@ define void @ctpop_v64i32(ptr %a) vscale_range(16,0) #0 {
ret void
}
-; Don't use SVE for 64-bit vectors.
define <1 x i64> @ctpop_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: ctpop_v1i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: cnt v0.8b, v0.8b
-; CHECK-NEXT: uaddlp v0.4h, v0.8b
-; CHECK-NEXT: uaddlp v0.2s, v0.4h
-; CHECK-NEXT: uaddlp v0.1d, v0.2s
+; CHECK-NEXT: ptrue p0.d, vl1
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: cnt z0.d, p0/m, z0.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret
%res = call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %op)
ret <1 x i64> %res
}
-; Don't use SVE for 128-bit vectors.
define <2 x i64> @ctpop_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: ctpop_v2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: cnt v0.16b, v0.16b
-; CHECK-NEXT: uaddlp v0.8h, v0.16b
-; CHECK-NEXT: uaddlp v0.4s, v0.8h
-; CHECK-NEXT: uaddlp v0.2d, v0.4s
+; CHECK-NEXT: ptrue p0.d, vl2
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: cnt z0.d, p0/m, z0.d
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret
%res = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %op)
ret <2 x i64> %res
``````````
</details>
https://github.com/llvm/llvm-project/pull/143870
More information about the llvm-commits mailing list