[llvm] [AArch64] Refine reduction VT selection in CTPOP -> VECREDUCE combine (PR #183025)

Benjamin Maxwell via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 4 09:56:36 PST 2026


https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/183025

>From 78c716a011eecf3626d6b82e8d2064a5bee6d393 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 24 Feb 2026 10:15:31 +0000
Subject: [PATCH 1/4] [AArch64] Refine reduction VT selection in CTPOP ->
 VECREDUCE combine

Use the same VT as the SETcc source, or fall back to using the VT of
the unextended operand of the CTPOP if the element size of the SETcc
is too small to fit the negative popcount.
---
 llvm/include/llvm/CodeGen/SDPatternMatch.h    | 20 +++-
 .../Target/AArch64/AArch64ISelLowering.cpp    | 22 ++++-
 llvm/test/CodeGen/AArch64/popcount_vmask.ll   | 99 +++++++++++--------
 3 files changed, 88 insertions(+), 53 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index b41443f85c3a9..62762d0c50d6a 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -332,19 +332,29 @@ inline SwitchContext<MatchContext, Pattern> m_Context(const MatchContext &Ctx,
 }
 
 // === Value type ===
-struct ValueType_bind {
+
+template <typename Pattern> struct ValueType_bind {
   EVT &BindVT;
+  Pattern P;
 
-  explicit ValueType_bind(EVT &Bind) : BindVT(Bind) {}
+  explicit ValueType_bind(EVT &Bind, const Pattern &P) : BindVT(Bind), P(P) {}
 
-  template <typename MatchContext> bool match(const MatchContext &, SDValue N) {
+  template <typename MatchContext>
+  bool match(const MatchContext &Ctx, SDValue N) {
     BindVT = N.getValueType();
-    return true;
+    return P.match(Ctx, N);
   }
 };
 
+template <typename Pattern>
+ValueType_bind(const Pattern &P) -> ValueType_bind<Pattern>;
+
 /// Retrieve the ValueType of the current SDValue.
-inline ValueType_bind m_VT(EVT &VT) { return ValueType_bind(VT); }
+inline auto m_VT(EVT &VT) { return ValueType_bind(VT, m_Value()); }
+
+template <typename Pattern> inline auto m_VT(EVT &VT, const Pattern &P) {
+  return ValueType_bind(VT, P);
+}
 
 template <typename Pattern, typename PredFuncT> struct ValueType_match {
   PredFuncT PredFunc;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2cd78493d2c23..501e8e1b9d84b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -29212,8 +29212,10 @@ static SDValue performCTPOPCombine(SDNode *N,
     return SDValue();
 
   // ctpop(zext(bitcast(vector_mask))) -> neg(signed_reduce_add(vector_mask))
+  EVT SrcVT;
   SDValue Mask;
-  if (!sd_match(N->getOperand(0), m_ZExtOrSelf(m_BitCast(m_Value(Mask)))))
+  if (!sd_match(N->getOperand(0),
+                m_ZExtOrSelf(m_VT(SrcVT, m_BitCast(m_Value(Mask))))))
     return SDValue();
 
   EVT VT = N->getValueType(0);
@@ -29223,14 +29225,24 @@ static SDValue performCTPOPCombine(SDNode *N,
       MaskVT.getVectorElementType() != MVT::i1)
     return SDValue();
 
-  EVT ReduceInVT =
-      EVT::getVectorVT(*DAG.getContext(), VT, MaskVT.getVectorElementCount());
+  EVT ReduceInVT = EVT::getVectorVT(*DAG.getContext(), SrcVT,
+                                    MaskVT.getVectorElementCount());
+
+  EVT CmpVT;
+  // Use the same VT as the SETcc if the -CTPOP would not overflow.
+  if (sd_match(Mask, m_SetCC(m_VT(CmpVT), m_Value(), m_Value()))) {
+    CmpVT = CmpVT.changeVectorElementTypeToInteger();
+    if (MaskVT.getSizeInBits() <= (1 << (CmpVT.getScalarSizeInBits() - 1)))
+      ReduceInVT = CmpVT;
+  }
 
   SDLoc DL(N);
+  EVT PopVT = ReduceInVT.getScalarType();
   // Sign extend to best fit ZeroOrNegativeOneBooleanContent.
   SDValue ExtMask = DAG.getNode(ISD::SIGN_EXTEND, DL, ReduceInVT, Mask);
-  SDValue NegPopCount = DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, ExtMask);
-  return DAG.getNegative(NegPopCount, DL, VT);
+  SDValue NegPopCount = DAG.getNode(ISD::VECREDUCE_ADD, DL, PopVT, ExtMask);
+  SDValue ExtPopCount = DAG.getSExtOrTrunc(NegPopCount, DL, VT);
+  return DAG.getNegative(ExtPopCount, DL, VT);
 }
 
 static unsigned getReductionForOpcode(unsigned Op) {
diff --git a/llvm/test/CodeGen/AArch64/popcount_vmask.ll b/llvm/test/CodeGen/AArch64/popcount_vmask.ll
index 746b0bac0e5dc..5896a79bf7f64 100644
--- a/llvm/test/CodeGen/AArch64/popcount_vmask.ll
+++ b/llvm/test/CodeGen/AArch64/popcount_vmask.ll
@@ -7,10 +7,10 @@ define i32 @vmask_popcount_i32_v8i8(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-LABEL: vmask_popcount_i32_v8i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.8b, v1.8b, v0.8b
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    saddlv s0, v0.8h
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    neg w0, w8
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    addv b0, v0.8b
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    sub w0, w8, w9, sxtb
 ; CHECK-NEXT:    ret
   %mask = icmp slt <8 x i8> %a, %b
   %t1 = bitcast <8 x i1> %mask to i8
@@ -23,14 +23,10 @@ define i32 @vmask_popcount_i32_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: vmask_popcount_i32_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    sshll2 v1.8h, v0.16b, #0
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    saddl2 v2.4s, v0.8h, v1.8h
-; CHECK-NEXT:    saddl v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    addv s0, v0.4s
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    neg w0, w8
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    addv b0, v0.16b
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    sub w0, w8, w9, sxtb
 ; CHECK-NEXT:    ret
   %mask = icmp slt <16 x i8> %a, %b
   %t1 = bitcast <16 x i1> %mask to i16
@@ -43,9 +39,10 @@ define i32 @vmask_popcount_i32_v4i16(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK-LABEL: vmask_popcount_i32_v4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.4h, v1.4h, v0.4h
-; CHECK-NEXT:    saddlv s0, v0.4h
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    neg w0, w8
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    addv h0, v0.4h
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    sub w0, w8, w9, sxth
 ; CHECK-NEXT:    ret
   %mask = icmp slt <4 x i16> %a, %b
   %t1 = bitcast <4 x i1> %mask to i4
@@ -58,9 +55,10 @@ define i32 @vmask_popcount_i32_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: vmask_popcount_i32_v8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.8h, v1.8h, v0.8h
-; CHECK-NEXT:    saddlv s0, v0.8h
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    neg w0, w8
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    addv h0, v0.8h
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    sub w0, w8, w9, sxth
 ; CHECK-NEXT:    ret
   %mask = icmp slt <8 x i16> %a, %b
   %t1 = bitcast <8 x i1> %mask to i8
@@ -120,8 +118,7 @@ define i32 @vmask_popcount_i32_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: vmask_popcount_i32_v2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.2d, v1.2d, v0.2d
-; CHECK-NEXT:    xtn v0.2s, v0.2d
-; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    addp d0, v0.2d
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    neg w0, w8
 ; CHECK-NEXT:    ret
@@ -136,10 +133,10 @@ define i64 @vmask_popcount_i64_v8i8(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-LABEL: vmask_popcount_i64_v8i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.8b, v1.8b, v0.8b
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    saddlv s0, v0.8h
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    neg w0, w8
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    addv b0, v0.8b
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    sub w0, w8, w9, sxtb
 ; CHECK-NEXT:    ret
   %mask = icmp slt <8 x i8> %a, %b
   %t1 = bitcast <8 x i1> %mask to i8
@@ -152,14 +149,10 @@ define i64 @vmask_popcount_i64_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: vmask_popcount_i64_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    sshll2 v1.8h, v0.16b, #0
-; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
-; CHECK-NEXT:    saddl2 v2.4s, v0.8h, v1.8h
-; CHECK-NEXT:    saddl v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    addv s0, v0.4s
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    neg w0, w8
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    addv b0, v0.16b
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    sub w0, w8, w9, sxtb
 ; CHECK-NEXT:    ret
   %mask = icmp slt <16 x i8> %a, %b
   %t1 = bitcast <16 x i1> %mask to i16
@@ -172,9 +165,10 @@ define i64 @vmask_popcount_i64_v4i16(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK-LABEL: vmask_popcount_i64_v4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.4h, v1.4h, v0.4h
-; CHECK-NEXT:    saddlv s0, v0.4h
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    neg w0, w8
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    addv h0, v0.4h
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    sub w0, w8, w9, sxth
 ; CHECK-NEXT:    ret
   %mask = icmp slt <4 x i16> %a, %b
   %t1 = bitcast <4 x i1> %mask to i4
@@ -187,9 +181,10 @@ define i64 @vmask_popcount_i64_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: vmask_popcount_i64_v8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.8h, v1.8h, v0.8h
-; CHECK-NEXT:    saddlv s0, v0.8h
-; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    neg w0, w8
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    addv h0, v0.8h
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    sub w0, w8, w9, sxth
 ; CHECK-NEXT:    ret
   %mask = icmp slt <8 x i16> %a, %b
   %t1 = bitcast <8 x i1> %mask to i8
@@ -249,8 +244,7 @@ define i64 @vmask_popcount_i64_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: vmask_popcount_i64_v2i64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.2d, v1.2d, v0.2d
-; CHECK-NEXT:    xtn v0.2s, v0.2d
-; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT:    addp d0, v0.2d
 ; CHECK-NEXT:    fmov w8, s0
 ; CHECK-NEXT:    neg w0, w8
 ; CHECK-NEXT:    ret
@@ -318,9 +312,8 @@ define i32 @vmask_negate_popcount(<16 x i8> %a, <16 x i8> %b)  {
 ; CHECK-LABEL: vmask_negate_popcount:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmeq v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    saddlp v0.8h, v0.16b
-; CHECK-NEXT:    addv h0, v0.8h
-; CHECK-NEXT:    smov w0, v0.h[0]
+; CHECK-NEXT:    addv b0, v0.16b
+; CHECK-NEXT:    smov w0, v0.b[0]
 ; CHECK-NEXT:    ret
   %mask = icmp eq <16 x i8> %a, %b
   %t1 = bitcast <16 x i1> %mask to i16
@@ -329,3 +322,23 @@ define i32 @vmask_negate_popcount(<16 x i8> %a, <16 x i8> %b)  {
   %t4 = sext i16 %t3 to i32
   ret i32 %t4
 }
+
+define i32 @vmask_popcount_i32_v8i1(<8 x i1> %a, <8 x i1> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.8b, #1
+; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    eor v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    shl v0.8b, v0.8b, #7
+; CHECK-NEXT:    cmlt v0.8b, v0.8b, #0
+; CHECK-NEXT:    addv b0, v0.8b
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    sub w0, w8, w9, sxtb
+; CHECK-NEXT:    ret
+  %mask = icmp eq <8 x i1> %a, %b
+  %t1 = bitcast <8 x i1> %mask to i8
+  %t2 = call i8 @llvm.ctpop(i8 %t1)
+  %t3 = zext i8 %t2 to i32
+  ret i32 %t3
+}

>From 6c4d56685efb4f6ae58817d6a862cfbd6fe2c03d Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 24 Feb 2026 10:32:16 +0000
Subject: [PATCH 2/4] Fix typo

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 501e8e1b9d84b..0f59f65b74c34 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -29229,7 +29229,7 @@ static SDValue performCTPOPCombine(SDNode *N,
                                     MaskVT.getVectorElementCount());
 
   EVT CmpVT;
-  // Use the same VT as the SETcc if the -CTPOP would not overflow.
+  // Use the same VT as the SETcc if -CTPOP would not overflow.
   if (sd_match(Mask, m_SetCC(m_VT(CmpVT), m_Value(), m_Value()))) {
     CmpVT = CmpVT.changeVectorElementTypeToInteger();
     if (MaskVT.getSizeInBits() <= (1 << (CmpVT.getScalarSizeInBits() - 1)))

>From e60ade9c70d84e3383c2e27f1e3d3a9a73d8b076 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 4 Mar 2026 10:26:54 +0000
Subject: [PATCH 3/4] Rebase tests

---
 llvm/test/CodeGen/AArch64/popcount_vmask.ll | 45 +++++++++------------
 1 file changed, 18 insertions(+), 27 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/popcount_vmask.ll b/llvm/test/CodeGen/AArch64/popcount_vmask.ll
index 5896a79bf7f64..57350d0fa06fa 100644
--- a/llvm/test/CodeGen/AArch64/popcount_vmask.ll
+++ b/llvm/test/CodeGen/AArch64/popcount_vmask.ll
@@ -7,10 +7,9 @@ define i32 @vmask_popcount_i32_v8i8(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-LABEL: vmask_popcount_i32_v8i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.8b, v1.8b, v0.8b
-; CHECK-NEXT:    mov w8, wzr
 ; CHECK-NEXT:    addv b0, v0.8b
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    sub w0, w8, w9, sxtb
+; CHECK-NEXT:    smov w8, v0.b[0]
+; CHECK-NEXT:    neg w0, w8
 ; CHECK-NEXT:    ret
   %mask = icmp slt <8 x i8> %a, %b
   %t1 = bitcast <8 x i1> %mask to i8
@@ -23,10 +22,9 @@ define i32 @vmask_popcount_i32_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: vmask_popcount_i32_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    mov w8, wzr
 ; CHECK-NEXT:    addv b0, v0.16b
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    sub w0, w8, w9, sxtb
+; CHECK-NEXT:    smov w8, v0.b[0]
+; CHECK-NEXT:    neg w0, w8
 ; CHECK-NEXT:    ret
   %mask = icmp slt <16 x i8> %a, %b
   %t1 = bitcast <16 x i1> %mask to i16
@@ -39,10 +37,9 @@ define i32 @vmask_popcount_i32_v4i16(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK-LABEL: vmask_popcount_i32_v4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.4h, v1.4h, v0.4h
-; CHECK-NEXT:    mov w8, wzr
 ; CHECK-NEXT:    addv h0, v0.4h
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    sub w0, w8, w9, sxth
+; CHECK-NEXT:    smov w8, v0.h[0]
+; CHECK-NEXT:    neg w0, w8
 ; CHECK-NEXT:    ret
   %mask = icmp slt <4 x i16> %a, %b
   %t1 = bitcast <4 x i1> %mask to i4
@@ -55,10 +52,9 @@ define i32 @vmask_popcount_i32_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: vmask_popcount_i32_v8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.8h, v1.8h, v0.8h
-; CHECK-NEXT:    mov w8, wzr
 ; CHECK-NEXT:    addv h0, v0.8h
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    sub w0, w8, w9, sxth
+; CHECK-NEXT:    smov w8, v0.h[0]
+; CHECK-NEXT:    neg w0, w8
 ; CHECK-NEXT:    ret
   %mask = icmp slt <8 x i16> %a, %b
   %t1 = bitcast <8 x i1> %mask to i8
@@ -133,10 +129,9 @@ define i64 @vmask_popcount_i64_v8i8(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-LABEL: vmask_popcount_i64_v8i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.8b, v1.8b, v0.8b
-; CHECK-NEXT:    mov w8, wzr
 ; CHECK-NEXT:    addv b0, v0.8b
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    sub w0, w8, w9, sxtb
+; CHECK-NEXT:    smov w8, v0.b[0]
+; CHECK-NEXT:    neg w0, w8
 ; CHECK-NEXT:    ret
   %mask = icmp slt <8 x i8> %a, %b
   %t1 = bitcast <8 x i1> %mask to i8
@@ -149,10 +144,9 @@ define i64 @vmask_popcount_i64_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: vmask_popcount_i64_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    mov w8, wzr
 ; CHECK-NEXT:    addv b0, v0.16b
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    sub w0, w8, w9, sxtb
+; CHECK-NEXT:    smov w8, v0.b[0]
+; CHECK-NEXT:    neg w0, w8
 ; CHECK-NEXT:    ret
   %mask = icmp slt <16 x i8> %a, %b
   %t1 = bitcast <16 x i1> %mask to i16
@@ -165,10 +159,9 @@ define i64 @vmask_popcount_i64_v4i16(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK-LABEL: vmask_popcount_i64_v4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.4h, v1.4h, v0.4h
-; CHECK-NEXT:    mov w8, wzr
 ; CHECK-NEXT:    addv h0, v0.4h
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    sub w0, w8, w9, sxth
+; CHECK-NEXT:    smov w8, v0.h[0]
+; CHECK-NEXT:    neg w0, w8
 ; CHECK-NEXT:    ret
   %mask = icmp slt <4 x i16> %a, %b
   %t1 = bitcast <4 x i1> %mask to i4
@@ -181,10 +174,9 @@ define i64 @vmask_popcount_i64_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: vmask_popcount_i64_v8i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    cmgt v0.8h, v1.8h, v0.8h
-; CHECK-NEXT:    mov w8, wzr
 ; CHECK-NEXT:    addv h0, v0.8h
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    sub w0, w8, w9, sxth
+; CHECK-NEXT:    smov w8, v0.h[0]
+; CHECK-NEXT:    neg w0, w8
 ; CHECK-NEXT:    ret
   %mask = icmp slt <8 x i16> %a, %b
   %t1 = bitcast <8 x i1> %mask to i8
@@ -328,13 +320,12 @@ define i32 @vmask_popcount_i32_v8i1(<8 x i1> %a, <8 x i1> %b) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v2.8b, #1
 ; CHECK-NEXT:    eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    mov w8, wzr
 ; CHECK-NEXT:    eor v0.8b, v0.8b, v2.8b
 ; CHECK-NEXT:    shl v0.8b, v0.8b, #7
 ; CHECK-NEXT:    cmlt v0.8b, v0.8b, #0
 ; CHECK-NEXT:    addv b0, v0.8b
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    sub w0, w8, w9, sxtb
+; CHECK-NEXT:    smov w8, v0.b[0]
+; CHECK-NEXT:    neg w0, w8
 ; CHECK-NEXT:    ret
   %mask = icmp eq <8 x i1> %a, %b
   %t1 = bitcast <8 x i1> %mask to i8

>From a8da30b2c1786c772337d957ad8947baf021d137 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Wed, 4 Mar 2026 17:55:47 +0000
Subject: [PATCH 4/4] Fixups

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 0f59f65b74c34..47bea0ba795e6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -29232,7 +29232,7 @@ static SDValue performCTPOPCombine(SDNode *N,
   // Use the same VT as the SETcc if -CTPOP would not overflow.
   if (sd_match(Mask, m_SetCC(m_VT(CmpVT), m_Value(), m_Value()))) {
     CmpVT = CmpVT.changeVectorElementTypeToInteger();
-    if (MaskVT.getSizeInBits() <= (1 << (CmpVT.getScalarSizeInBits() - 1)))
+    if (Log2_64_Ceil(MaskVT.getSizeInBits()) <= CmpVT.getScalarSizeInBits() - 1)
       ReduceInVT = CmpVT;
   }
 



More information about the llvm-commits mailing list