[llvm] [AArch64][SVE] Implement demanded bits for @llvm.aarch64.sve.cntp (PR #168714)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 19 06:38:23 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Benjamin Maxwell (MacDue)
<details>
<summary>Changes</summary>
This allows DemandedBits to see that the SVE CNTP intrinsic will only ever produce small positive integers. The maximum value you could get here is 256, which corresponds to CNTP on an nxv16i1 predicate on a machine with a 2048-bit vector size (the maximum for SVE).
Using this, various redundant operations (zexts, sexts, ands, ors, etc.) can be eliminated.
---
Full diff: https://github.com/llvm/llvm-project/pull/168714.diff
3 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+27-1)
- (modified) llvm/test/CodeGen/AArch64/sve-vector-compress.ll (+5-6)
- (modified) llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll (+56)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8f41f230b5521..809c2af499958 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -19459,6 +19459,32 @@ static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
return {};
}
+// Returns the element size associated with an SVE cnt[bhwdp] intrinsic. For
+// cntp (predicate), the element size corresponds to the legal (packed) SVE
+// vector type associated with the predicate. E.g. nxv4i1 returns 32.
+static std::optional<unsigned> GetSVECntElementSize(SDValue Op) {
+ if (auto ElementSize = IsSVECntIntrinsic(Op))
+ return ElementSize;
+ Intrinsic::ID IID = getIntrinsicID(Op.getNode());
+ if (IID != Intrinsic::aarch64_sve_cntp)
+ return {};
+ EVT PredVT = Op.getOperand(Op.getNumOperands() - 1).getValueType();
+ switch (PredVT.getSimpleVT().SimpleTy) {
+ case MVT::nxv1i1:
+ return 128;
+ case MVT::nxv2i1:
+ return 64;
+ case MVT::nxv4i1:
+ return 32;
+ case MVT::nxv8i1:
+ return 16;
+ case MVT::nxv16i1:
+ return 8;
+ default:
+ llvm_unreachable("unexpected predicate type");
+ }
+}
+
/// Calculates what the pre-extend type is, based on the extension
/// operation node provided by \p Extend.
///
@@ -31666,7 +31692,7 @@ bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
return false;
}
case ISD::INTRINSIC_WO_CHAIN: {
- if (auto ElementSize = IsSVECntIntrinsic(Op)) {
+ if (auto ElementSize = GetSVECntElementSize(Op)) {
unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
if (!MaxSVEVectorSizeInBits)
MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
index cc3a3734a9721..f700dee0fb2e4 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
@@ -143,20 +143,19 @@ define <vscale x 8 x i32> @test_compress_large(<vscale x 8 x i32> %vec, <vscale
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: punpklo p1.h, p0.b
; CHECK-NEXT: cnth x9
-; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ptrue p2.s
; CHECK-NEXT: sub x9, x9, #1
; CHECK-NEXT: punpkhi p0.h, p0.b
-; CHECK-NEXT: compact z0.s, p2, z0.s
-; CHECK-NEXT: cntp x8, p1, p2.s
+; CHECK-NEXT: compact z0.s, p1, z0.s
+; CHECK-NEXT: cntp x8, p2, p1.s
; CHECK-NEXT: compact z1.s, p0, z1.s
; CHECK-NEXT: str z0, [sp]
-; CHECK-NEXT: mov w8, w8
; CHECK-NEXT: cmp x8, x9
; CHECK-NEXT: csel x8, x8, x9, lo
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: st1w { z1.s }, p1, [x9, x8, lsl #2]
+; CHECK-NEXT: st1w { z1.s }, p2, [x9, x8, lsl #2]
; CHECK-NEXT: ldr z0, [sp]
; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #2
diff --git a/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll b/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll
index 9572778484f8d..568abe718ad9b 100644
--- a/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll
+++ b/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll
@@ -80,6 +80,62 @@ define i64 @cntd_and_elimination() {
ret i64 %result
}
+define i64 @cntp_nxv16i1_and_elimination(<vscale x 16 x i1> %p) {
+; CHECK-LABEL: cntp_nxv16i1_and_elimination:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntp x8, p0, p0.b
+; CHECK-NEXT: and x9, x8, #0x1fc
+; CHECK-NEXT: add x0, x8, x9
+; CHECK-NEXT: ret
+ %cntp = tail call i64 @llvm.aarch64.sve.cntp.nxv16i1(<vscale x 16 x i1> %p, <vscale x 16 x i1> %p)
+ %and_redundant = and i64 %cntp, 511
+ %and_required = and i64 %cntp, 17179869180
+ %result = add i64 %and_redundant, %and_required
+ ret i64 %result
+}
+
+define i64 @cntp_nxv8i1_and_elimination(<vscale x 8 x i1> %p) {
+; CHECK-LABEL: cntp_nxv8i1_and_elimination:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntp x8, p0, p0.h
+; CHECK-NEXT: and x9, x8, #0xfc
+; CHECK-NEXT: add x0, x8, x9
+; CHECK-NEXT: ret
+ %cntp = tail call i64 @llvm.aarch64.sve.cntp.nxv8i1(<vscale x 8 x i1> %p, <vscale x 8 x i1> %p)
+ %and_redundant = and i64 %cntp, 1023
+ %and_required = and i64 %cntp, 17179869180
+ %result = add i64 %and_redundant, %and_required
+ ret i64 %result
+}
+
+define i64 @cntp_nxv4i1_and_elimination(<vscale x 4 x i1> %p) {
+; CHECK-LABEL: cntp_nxv4i1_and_elimination:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntp x8, p0, p0.s
+; CHECK-NEXT: and x9, x8, #0x7c
+; CHECK-NEXT: add x0, x8, x9
+; CHECK-NEXT: ret
+ %cntp = tail call i64 @llvm.aarch64.sve.cntp.nxv4i1(<vscale x 4 x i1> %p, <vscale x 4 x i1> %p)
+ %and_redundant = and i64 %cntp, 127
+ %and_required = and i64 %cntp, 17179869180
+ %result = add i64 %and_redundant, %and_required
+ ret i64 %result
+}
+
+define i64 @cntp_nxv2i1_and_elimination(<vscale x 2 x i1> %p) {
+; CHECK-LABEL: cntp_nxv2i1_and_elimination:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntp x8, p0, p0.d
+; CHECK-NEXT: and x9, x8, #0x3c
+; CHECK-NEXT: add x0, x8, x9
+; CHECK-NEXT: ret
+ %cntp = tail call i64 @llvm.aarch64.sve.cntp.nxv2i1(<vscale x 2 x i1> %p, <vscale x 2 x i1> %p)
+ %and_redundant = and i64 %cntp, 63
+ %and_required = and i64 %cntp, 17179869180
+ %result = add i64 %and_redundant, %and_required
+ ret i64 %result
+}
+
define i64 @vscale_trunc_zext() vscale_range(1,16) {
; CHECK-LABEL: vscale_trunc_zext:
; CHECK: // %bb.0:
``````````
</details>
https://github.com/llvm/llvm-project/pull/168714
More information about the llvm-commits
mailing list