[llvm] [AArch64][SVE] Implement demanded bits for @llvm.aarch64.sve.cntp (PR #168714)
Benjamin Maxwell via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 19 08:59:34 PST 2025
https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/168714
>From 5e42dbb2cbd63ca4922e0bdaf3ab75c46b29e5ea Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 19 Nov 2025 14:29:19 +0000
Subject: [PATCH 1/4] Precommit tests
---
.../vscale-and-sve-cnt-demandedbits.ll | 60 +++++++++++++++++++
1 file changed, 60 insertions(+)
diff --git a/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll b/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll
index 9572778484f8d..62290c31f464d 100644
--- a/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll
+++ b/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll
@@ -80,6 +80,66 @@ define i64 @cntd_and_elimination() {
ret i64 %result
}
+define i64 @cntp_nxv16i1_and_elimination(<vscale x 16 x i1> %p) {
+; CHECK-LABEL: cntp_nxv16i1_and_elimination:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntp x8, p0, p0.b
+; CHECK-NEXT: and x9, x8, #0x1ff
+; CHECK-NEXT: and x8, x8, #0x3fffffffc
+; CHECK-NEXT: add x0, x9, x8
+; CHECK-NEXT: ret
+ %cntp = tail call i64 @llvm.aarch64.sve.cntp.nxv16i1(<vscale x 16 x i1> %p, <vscale x 16 x i1> %p)
+ %and_redundant = and i64 %cntp, 511
+ %and_required = and i64 %cntp, 17179869180
+ %result = add i64 %and_redundant, %and_required
+ ret i64 %result
+}
+
+define i64 @cntp_nxv8i1_and_elimination(<vscale x 8 x i1> %p) {
+; CHECK-LABEL: cntp_nxv8i1_and_elimination:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntp x8, p0, p0.h
+; CHECK-NEXT: and x9, x8, #0x3ff
+; CHECK-NEXT: and x8, x8, #0x3fffffffc
+; CHECK-NEXT: add x0, x9, x8
+; CHECK-NEXT: ret
+ %cntp = tail call i64 @llvm.aarch64.sve.cntp.nxv8i1(<vscale x 8 x i1> %p, <vscale x 8 x i1> %p)
+ %and_redundant = and i64 %cntp, 1023
+ %and_required = and i64 %cntp, 17179869180
+ %result = add i64 %and_redundant, %and_required
+ ret i64 %result
+}
+
+define i64 @cntp_nxv4i1_and_elimination(<vscale x 4 x i1> %p) {
+; CHECK-LABEL: cntp_nxv4i1_and_elimination:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntp x8, p0, p0.s
+; CHECK-NEXT: and x9, x8, #0x7f
+; CHECK-NEXT: and x8, x8, #0x3fffffffc
+; CHECK-NEXT: add x0, x9, x8
+; CHECK-NEXT: ret
+ %cntp = tail call i64 @llvm.aarch64.sve.cntp.nxv4i1(<vscale x 4 x i1> %p, <vscale x 4 x i1> %p)
+ %and_redundant = and i64 %cntp, 127
+ %and_required = and i64 %cntp, 17179869180
+ %result = add i64 %and_redundant, %and_required
+ ret i64 %result
+}
+
+define i64 @cntp_nxv2i1_and_elimination(<vscale x 2 x i1> %p) {
+; CHECK-LABEL: cntp_nxv2i1_and_elimination:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntp x8, p0, p0.d
+; CHECK-NEXT: and x9, x8, #0x3f
+; CHECK-NEXT: and x8, x8, #0x3fffffffc
+; CHECK-NEXT: add x0, x9, x8
+; CHECK-NEXT: ret
+ %cntp = tail call i64 @llvm.aarch64.sve.cntp.nxv2i1(<vscale x 2 x i1> %p, <vscale x 2 x i1> %p)
+ %and_redundant = and i64 %cntp, 63
+ %and_required = and i64 %cntp, 17179869180
+ %result = add i64 %and_redundant, %and_required
+ ret i64 %result
+}
+
define i64 @vscale_trunc_zext() vscale_range(1,16) {
; CHECK-LABEL: vscale_trunc_zext:
; CHECK: // %bb.0:
>From e3224e441f982878106cb07dab7a99353780576e Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 19 Nov 2025 14:31:24 +0000
Subject: [PATCH 2/4] [AArch64][SVE] Implement demanded bits for
@llvm.aarch64.sve.cntp
This allows DemandedBits to see that the SVE CNTP intrinsic will only
ever produce small positive integers. The maximum value you could get
here is 256, which is CNTP on an nxv16i1 on a machine with a 2048-bit
vector size (the maximum for SVE).
Using this, various redundant operations (zexts, sexts, ands, ors, etc.)
can be eliminated.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 28 ++++++++++++++++++-
.../CodeGen/AArch64/sve-vector-compress.ll | 11 ++++----
.../vscale-and-sve-cnt-demandedbits.ll | 20 ++++++-------
3 files changed, 40 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8f41f230b5521..809c2af499958 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -19459,6 +19459,32 @@ static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
return {};
}
+// Returns the element size associated with an SVE cnt[bhwdp] intrinsic. For
+// cntp (predicate), the element size corresponds to the legal (packed) SVE
+// vector type associated with the predicate. E.g. nxv4i1 returns 32.
+static std::optional<unsigned> GetSVECntElementSize(SDValue Op) {
+ if (auto ElementSize = IsSVECntIntrinsic(Op))
+ return ElementSize;
+ Intrinsic::ID IID = getIntrinsicID(Op.getNode());
+ if (IID != Intrinsic::aarch64_sve_cntp)
+ return {};
+ EVT PredVT = Op.getOperand(Op.getNumOperands() - 1).getValueType();
+ switch (PredVT.getSimpleVT().SimpleTy) {
+ case MVT::nxv1i1:
+ return 128;
+ case MVT::nxv2i1:
+ return 64;
+ case MVT::nxv4i1:
+ return 32;
+ case MVT::nxv8i1:
+ return 16;
+ case MVT::nxv16i1:
+ return 8;
+ default:
+ llvm_unreachable("unexpected predicate type");
+ }
+}
+
/// Calculates what the pre-extend type is, based on the extension
/// operation node provided by \p Extend.
///
@@ -31666,7 +31692,7 @@ bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
return false;
}
case ISD::INTRINSIC_WO_CHAIN: {
- if (auto ElementSize = IsSVECntIntrinsic(Op)) {
+ if (auto ElementSize = GetSVECntElementSize(Op)) {
unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
if (!MaxSVEVectorSizeInBits)
MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
index cc3a3734a9721..f700dee0fb2e4 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
@@ -143,20 +143,19 @@ define <vscale x 8 x i32> @test_compress_large(<vscale x 8 x i32> %vec, <vscale
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: punpklo p1.h, p0.b
; CHECK-NEXT: cnth x9
-; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ptrue p2.s
; CHECK-NEXT: sub x9, x9, #1
; CHECK-NEXT: punpkhi p0.h, p0.b
-; CHECK-NEXT: compact z0.s, p2, z0.s
-; CHECK-NEXT: cntp x8, p1, p2.s
+; CHECK-NEXT: compact z0.s, p1, z0.s
+; CHECK-NEXT: cntp x8, p2, p1.s
; CHECK-NEXT: compact z1.s, p0, z1.s
; CHECK-NEXT: str z0, [sp]
-; CHECK-NEXT: mov w8, w8
; CHECK-NEXT: cmp x8, x9
; CHECK-NEXT: csel x8, x8, x9, lo
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: st1w { z1.s }, p1, [x9, x8, lsl #2]
+; CHECK-NEXT: st1w { z1.s }, p2, [x9, x8, lsl #2]
; CHECK-NEXT: ldr z0, [sp]
; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #2
diff --git a/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll b/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll
index 62290c31f464d..568abe718ad9b 100644
--- a/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll
+++ b/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll
@@ -84,9 +84,8 @@ define i64 @cntp_nxv16i1_and_elimination(<vscale x 16 x i1> %p) {
; CHECK-LABEL: cntp_nxv16i1_and_elimination:
; CHECK: // %bb.0:
; CHECK-NEXT: cntp x8, p0, p0.b
-; CHECK-NEXT: and x9, x8, #0x1ff
-; CHECK-NEXT: and x8, x8, #0x3fffffffc
-; CHECK-NEXT: add x0, x9, x8
+; CHECK-NEXT: and x9, x8, #0x1fc
+; CHECK-NEXT: add x0, x8, x9
; CHECK-NEXT: ret
%cntp = tail call i64 @llvm.aarch64.sve.cntp.nxv16i1(<vscale x 16 x i1> %p, <vscale x 16 x i1> %p)
%and_redundant = and i64 %cntp, 511
@@ -99,9 +98,8 @@ define i64 @cntp_nxv8i1_and_elimination(<vscale x 8 x i1> %p) {
; CHECK-LABEL: cntp_nxv8i1_and_elimination:
; CHECK: // %bb.0:
; CHECK-NEXT: cntp x8, p0, p0.h
-; CHECK-NEXT: and x9, x8, #0x3ff
-; CHECK-NEXT: and x8, x8, #0x3fffffffc
-; CHECK-NEXT: add x0, x9, x8
+; CHECK-NEXT: and x9, x8, #0xfc
+; CHECK-NEXT: add x0, x8, x9
; CHECK-NEXT: ret
%cntp = tail call i64 @llvm.aarch64.sve.cntp.nxv8i1(<vscale x 8 x i1> %p, <vscale x 8 x i1> %p)
%and_redundant = and i64 %cntp, 1023
@@ -114,9 +112,8 @@ define i64 @cntp_nxv4i1_and_elimination(<vscale x 4 x i1> %p) {
; CHECK-LABEL: cntp_nxv4i1_and_elimination:
; CHECK: // %bb.0:
; CHECK-NEXT: cntp x8, p0, p0.s
-; CHECK-NEXT: and x9, x8, #0x7f
-; CHECK-NEXT: and x8, x8, #0x3fffffffc
-; CHECK-NEXT: add x0, x9, x8
+; CHECK-NEXT: and x9, x8, #0x7c
+; CHECK-NEXT: add x0, x8, x9
; CHECK-NEXT: ret
%cntp = tail call i64 @llvm.aarch64.sve.cntp.nxv4i1(<vscale x 4 x i1> %p, <vscale x 4 x i1> %p)
%and_redundant = and i64 %cntp, 127
@@ -129,9 +126,8 @@ define i64 @cntp_nxv2i1_and_elimination(<vscale x 2 x i1> %p) {
; CHECK-LABEL: cntp_nxv2i1_and_elimination:
; CHECK: // %bb.0:
; CHECK-NEXT: cntp x8, p0, p0.d
-; CHECK-NEXT: and x9, x8, #0x3f
-; CHECK-NEXT: and x8, x8, #0x3fffffffc
-; CHECK-NEXT: add x0, x9, x8
+; CHECK-NEXT: and x9, x8, #0x3c
+; CHECK-NEXT: add x0, x8, x9
; CHECK-NEXT: ret
%cntp = tail call i64 @llvm.aarch64.sve.cntp.nxv2i1(<vscale x 2 x i1> %p, <vscale x 2 x i1> %p)
%and_redundant = and i64 %cntp, 63
>From 62c06b30709c3b33ebddba8c5ace85ca91e0781d Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 19 Nov 2025 16:21:39 +0000
Subject: [PATCH 3/4] Rework changes
---
.../Target/AArch64/AArch64ISelLowering.cpp | 89 ++++++++++---------
1 file changed, 48 insertions(+), 41 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 809c2af499958..48f3bf77851ab 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -19443,45 +19443,50 @@ AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
return CSNeg;
}
-static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
+static bool IsSVECntIntrinsic(SDValue S) {
switch(getIntrinsicID(S.getNode())) {
default:
break;
case Intrinsic::aarch64_sve_cntb:
- return 8;
case Intrinsic::aarch64_sve_cnth:
- return 16;
case Intrinsic::aarch64_sve_cntw:
- return 32;
case Intrinsic::aarch64_sve_cntd:
- return 64;
+ case Intrinsic::aarch64_sve_cntp:
+ return true;
}
return {};
}
-// Returns the element size associated with an SVE cnt[bhwdp] intrinsic. For
-// cntp (predicate), the element size corresponds to the legal (packed) SVE
-// vector type associated with the predicate. E.g. nxv4i1 returns 32.
-static std::optional<unsigned> GetSVECntElementSize(SDValue Op) {
- if (auto ElementSize = IsSVECntIntrinsic(Op))
- return ElementSize;
+// Creates a constexpr (IID, VT) pair that can be used in switch cases.
+static constexpr uint64_t intrinsicWithType(Intrinsic::ID IID, MVT VT) {
+ static_assert(sizeof(VT.SimpleTy) <= sizeof(uint32_t) &&
+ sizeof(IID) <= sizeof(uint32_t),
+ "IID and MVT should fit in 64 bits");
+ return (uint64_t(IID) << 32) | uint64_t(VT.SimpleTy);
+}
+
+// Returns the maximum (scalable) value that can be returned by an SVE count
+// intrinsic. The supported intrinsics are covered by IsSVECntIntrinsic.
+static ElementCount getMaxValueForSVECntIntrinsic(SDValue Op) {
Intrinsic::ID IID = getIntrinsicID(Op.getNode());
- if (IID != Intrinsic::aarch64_sve_cntp)
- return {};
- EVT PredVT = Op.getOperand(Op.getNumOperands() - 1).getValueType();
- switch (PredVT.getSimpleVT().SimpleTy) {
- case MVT::nxv1i1:
- return 128;
- case MVT::nxv2i1:
- return 64;
- case MVT::nxv4i1:
- return 32;
- case MVT::nxv8i1:
- return 16;
- case MVT::nxv16i1:
- return 8;
+ MVT VT = IID == Intrinsic::aarch64_sve_cntp
+ ? Op.getOperand(1).getValueType().getSimpleVT()
+ : MVT::Untyped;
+ switch (intrinsicWithType(IID, VT)) {
+ case intrinsicWithType(Intrinsic::aarch64_sve_cntd, MVT::Untyped):
+ case intrinsicWithType(Intrinsic::aarch64_sve_cntp, MVT::nxv2i1):
+ return ElementCount::getScalable(2);
+ case intrinsicWithType(Intrinsic::aarch64_sve_cntw, MVT::Untyped):
+ case intrinsicWithType(Intrinsic::aarch64_sve_cntp, MVT::nxv4i1):
+ return ElementCount::getScalable(4);
+ case intrinsicWithType(Intrinsic::aarch64_sve_cnth, MVT::Untyped):
+ case intrinsicWithType(Intrinsic::aarch64_sve_cntp, MVT::nxv8i1):
+ return ElementCount::getScalable(8);
+ case intrinsicWithType(Intrinsic::aarch64_sve_cntb, MVT::Untyped):
+ case intrinsicWithType(Intrinsic::aarch64_sve_cntp, MVT::nxv16i1):
+ return ElementCount::getScalable(16);
default:
- llvm_unreachable("unexpected predicate type");
+ llvm_unreachable("unexpected intrininc type pair");
}
}
@@ -31692,22 +31697,24 @@ bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
return false;
}
case ISD::INTRINSIC_WO_CHAIN: {
- if (auto ElementSize = GetSVECntElementSize(Op)) {
- unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
- if (!MaxSVEVectorSizeInBits)
- MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
- unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
- // The SVE count intrinsics don't support the multiplier immediate so we
- // don't have to account for that here. The value returned may be slightly
- // over the true required bits, as this is based on the "ALL" pattern. The
- // other patterns are also exposed by these intrinsics, but they all
- // return a value that's strictly less than "ALL".
- unsigned RequiredBits = llvm::bit_width(MaxElements);
- unsigned BitWidth = Known.Zero.getBitWidth();
- if (RequiredBits < BitWidth)
- Known.Zero.setHighBits(BitWidth - RequiredBits);
+ if (!IsSVECntIntrinsic(Op))
return false;
- }
+ unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
+ if (!MaxSVEVectorSizeInBits)
+ MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
+ unsigned VscaleMax = MaxSVEVectorSizeInBits / 128;
+ unsigned MaxCount =
+ getMaxValueForSVECntIntrinsic(Op).getKnownMinValue() * VscaleMax;
+ // The SVE count intrinsics don't support the multiplier immediate so we
+ // don't have to account for that here. The value returned may be slightly
+ // over the true required bits, as this is based on the "ALL" pattern. The
+ // other patterns are also exposed by these intrinsics, but they all
+ // return a value that's strictly less than "ALL".
+ unsigned RequiredBits = llvm::bit_width(MaxCount);
+ unsigned BitWidth = Known.Zero.getBitWidth();
+ if (RequiredBits < BitWidth)
+ Known.Zero.setHighBits(BitWidth - RequiredBits);
+ return false;
}
}
>From d95927cac98d5ca4ccb4cf46b63bea90fd67d194 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 19 Nov 2025 16:58:47 +0000
Subject: [PATCH 4/4] Make less silly
---
.../Target/AArch64/AArch64ISelLowering.cpp | 31 ++++++-------------
1 file changed, 9 insertions(+), 22 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 48f3bf77851ab..a938bf33505f2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -19454,39 +19454,26 @@ static bool IsSVECntIntrinsic(SDValue S) {
case Intrinsic::aarch64_sve_cntp:
return true;
}
- return {};
-}
-
-// Creates a constexpr (IID, VT) pair that can be used in switch cases.
-static constexpr uint64_t intrinsicWithType(Intrinsic::ID IID, MVT VT) {
- static_assert(sizeof(VT.SimpleTy) <= sizeof(uint32_t) &&
- sizeof(IID) <= sizeof(uint32_t),
- "IID and MVT should fit in 64 bits");
- return (uint64_t(IID) << 32) | uint64_t(VT.SimpleTy);
+ return false;
}
// Returns the maximum (scalable) value that can be returned by an SVE count
// intrinsic. The supported intrinsics are covered by IsSVECntIntrinsic.
static ElementCount getMaxValueForSVECntIntrinsic(SDValue Op) {
Intrinsic::ID IID = getIntrinsicID(Op.getNode());
- MVT VT = IID == Intrinsic::aarch64_sve_cntp
- ? Op.getOperand(1).getValueType().getSimpleVT()
- : MVT::Untyped;
- switch (intrinsicWithType(IID, VT)) {
- case intrinsicWithType(Intrinsic::aarch64_sve_cntd, MVT::Untyped):
- case intrinsicWithType(Intrinsic::aarch64_sve_cntp, MVT::nxv2i1):
+ if (IID == Intrinsic::aarch64_sve_cntp)
+ return Op.getOperand(1).getValueType().getVectorElementCount();
+ switch (IID) {
+ case Intrinsic::aarch64_sve_cntd:
return ElementCount::getScalable(2);
- case intrinsicWithType(Intrinsic::aarch64_sve_cntw, MVT::Untyped):
- case intrinsicWithType(Intrinsic::aarch64_sve_cntp, MVT::nxv4i1):
+ case Intrinsic::aarch64_sve_cntw:
return ElementCount::getScalable(4);
- case intrinsicWithType(Intrinsic::aarch64_sve_cnth, MVT::Untyped):
- case intrinsicWithType(Intrinsic::aarch64_sve_cntp, MVT::nxv8i1):
+ case Intrinsic::aarch64_sve_cnth:
return ElementCount::getScalable(8);
- case intrinsicWithType(Intrinsic::aarch64_sve_cntb, MVT::Untyped):
- case intrinsicWithType(Intrinsic::aarch64_sve_cntp, MVT::nxv16i1):
+ case Intrinsic::aarch64_sve_cntb:
return ElementCount::getScalable(16);
default:
- llvm_unreachable("unexpected intrininc type pair");
+    llvm_unreachable("unexpected intrinsic");
}
}
More information about the llvm-commits
mailing list