[llvm] 9c35a7b - [AArch64] Refine reduction VT selection in CTPOP -> VECREDUCE combine (#183025)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 5 01:02:24 PST 2026
Author: Benjamin Maxwell
Date: 2026-03-05T09:02:20Z
New Revision: 9c35a7bfbc832fe357d03cc801ec25a811e52dc9
URL: https://github.com/llvm/llvm-project/commit/9c35a7bfbc832fe357d03cc801ec25a811e52dc9
DIFF: https://github.com/llvm/llvm-project/commit/9c35a7bfbc832fe357d03cc801ec25a811e52dc9.diff
LOG: [AArch64] Refine reduction VT selection in CTPOP -> VECREDUCE combine (#183025)
Use the same VT as the SETcc source, or fall back to using the VT of the
unextended operand of the CTPOP if the element size of the SETcc is too
small to fit the negative popcount.
Added:
Modified:
llvm/include/llvm/CodeGen/SDPatternMatch.h
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/popcount_vmask.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index b41443f85c3a9..62762d0c50d6a 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -332,19 +332,29 @@ inline SwitchContext<MatchContext, Pattern> m_Context(const MatchContext &Ctx,
}
// === Value type ===
-struct ValueType_bind {
+
+template <typename Pattern> struct ValueType_bind {
EVT &BindVT;
+ Pattern P;
- explicit ValueType_bind(EVT &Bind) : BindVT(Bind) {}
+ explicit ValueType_bind(EVT &Bind, const Pattern &P) : BindVT(Bind), P(P) {}
- template <typename MatchContext> bool match(const MatchContext &, SDValue N) {
+ template <typename MatchContext>
+ bool match(const MatchContext &Ctx, SDValue N) {
BindVT = N.getValueType();
- return true;
+ return P.match(Ctx, N);
}
};
+template <typename Pattern>
+ValueType_bind(const Pattern &P) -> ValueType_bind<Pattern>;
+
/// Retrieve the ValueType of the current SDValue.
-inline ValueType_bind m_VT(EVT &VT) { return ValueType_bind(VT); }
+inline auto m_VT(EVT &VT) { return ValueType_bind(VT, m_Value()); }
+
+template <typename Pattern> inline auto m_VT(EVT &VT, const Pattern &P) {
+ return ValueType_bind(VT, P);
+}
template <typename Pattern, typename PredFuncT> struct ValueType_match {
PredFuncT PredFunc;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8497a97aaf5fa..cd9de6c729649 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -29255,8 +29255,10 @@ static SDValue performCTPOPCombine(SDNode *N,
return SDValue();
// ctpop(zext(bitcast(vector_mask))) -> neg(signed_reduce_add(vector_mask))
+ EVT SrcVT;
SDValue Mask;
- if (!sd_match(N->getOperand(0), m_ZExtOrSelf(m_BitCast(m_Value(Mask)))))
+ if (!sd_match(N->getOperand(0),
+ m_ZExtOrSelf(m_VT(SrcVT, m_BitCast(m_Value(Mask))))))
return SDValue();
EVT VT = N->getValueType(0);
@@ -29266,14 +29268,24 @@ static SDValue performCTPOPCombine(SDNode *N,
MaskVT.getVectorElementType() != MVT::i1)
return SDValue();
- EVT ReduceInVT =
- EVT::getVectorVT(*DAG.getContext(), VT, MaskVT.getVectorElementCount());
+ EVT ReduceInVT = EVT::getVectorVT(*DAG.getContext(), SrcVT,
+ MaskVT.getVectorElementCount());
+
+ EVT CmpVT;
+ // Use the same VT as the SETcc if -CTPOP would not overflow.
+ if (sd_match(Mask, m_SetCC(m_VT(CmpVT), m_Value(), m_Value()))) {
+ CmpVT = CmpVT.changeVectorElementTypeToInteger();
+ if (Log2_64_Ceil(MaskVT.getSizeInBits()) <= CmpVT.getScalarSizeInBits() - 1)
+ ReduceInVT = CmpVT;
+ }
SDLoc DL(N);
+ EVT PopVT = ReduceInVT.getScalarType();
// Sign extend to best fit ZeroOrNegativeOneBooleanContent.
SDValue ExtMask = DAG.getNode(ISD::SIGN_EXTEND, DL, ReduceInVT, Mask);
- SDValue NegPopCount = DAG.getNode(ISD::VECREDUCE_ADD, DL, VT, ExtMask);
- return DAG.getNegative(NegPopCount, DL, VT);
+ SDValue NegPopCount = DAG.getNode(ISD::VECREDUCE_ADD, DL, PopVT, ExtMask);
+ SDValue ExtPopCount = DAG.getSExtOrTrunc(NegPopCount, DL, VT);
+ return DAG.getNegative(ExtPopCount, DL, VT);
}
static unsigned getReductionForOpcode(unsigned Op) {
diff --git a/llvm/test/CodeGen/AArch64/popcount_vmask.ll b/llvm/test/CodeGen/AArch64/popcount_vmask.ll
index 746b0bac0e5dc..57350d0fa06fa 100644
--- a/llvm/test/CodeGen/AArch64/popcount_vmask.ll
+++ b/llvm/test/CodeGen/AArch64/popcount_vmask.ll
@@ -7,9 +7,8 @@ define i32 @vmask_popcount_i32_v8i8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: vmask_popcount_i32_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: cmgt v0.8b, v1.8b, v0.8b
-; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: saddlv s0, v0.8h
-; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: addv b0, v0.8b
+; CHECK-NEXT: smov w8, v0.b[0]
; CHECK-NEXT: neg w0, w8
; CHECK-NEXT: ret
%mask = icmp slt <8 x i8> %a, %b
@@ -23,13 +22,8 @@ define i32 @vmask_popcount_i32_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: vmask_popcount_i32_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: cmgt v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: sshll2 v1.8h, v0.16b, #0
-; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: saddl2 v2.4s, v0.8h, v1.8h
-; CHECK-NEXT: saddl v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: addv b0, v0.16b
+; CHECK-NEXT: smov w8, v0.b[0]
; CHECK-NEXT: neg w0, w8
; CHECK-NEXT: ret
%mask = icmp slt <16 x i8> %a, %b
@@ -43,8 +37,8 @@ define i32 @vmask_popcount_i32_v4i16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: vmask_popcount_i32_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: cmgt v0.4h, v1.4h, v0.4h
-; CHECK-NEXT: saddlv s0, v0.4h
-; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: addv h0, v0.4h
+; CHECK-NEXT: smov w8, v0.h[0]
; CHECK-NEXT: neg w0, w8
; CHECK-NEXT: ret
%mask = icmp slt <4 x i16> %a, %b
@@ -58,8 +52,8 @@ define i32 @vmask_popcount_i32_v8i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: vmask_popcount_i32_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: cmgt v0.8h, v1.8h, v0.8h
-; CHECK-NEXT: saddlv s0, v0.8h
-; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: smov w8, v0.h[0]
; CHECK-NEXT: neg w0, w8
; CHECK-NEXT: ret
%mask = icmp slt <8 x i16> %a, %b
@@ -120,8 +114,7 @@ define i32 @vmask_popcount_i32_v2i64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: vmask_popcount_i32_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: cmgt v0.2d, v1.2d, v0.2d
-; CHECK-NEXT: xtn v0.2s, v0.2d
-; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: neg w0, w8
; CHECK-NEXT: ret
@@ -136,9 +129,8 @@ define i64 @vmask_popcount_i64_v8i8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-LABEL: vmask_popcount_i64_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: cmgt v0.8b, v1.8b, v0.8b
-; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: saddlv s0, v0.8h
-; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: addv b0, v0.8b
+; CHECK-NEXT: smov w8, v0.b[0]
; CHECK-NEXT: neg w0, w8
; CHECK-NEXT: ret
%mask = icmp slt <8 x i8> %a, %b
@@ -152,13 +144,8 @@ define i64 @vmask_popcount_i64_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: vmask_popcount_i64_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: cmgt v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: sshll2 v1.8h, v0.16b, #0
-; CHECK-NEXT: sshll v0.8h, v0.8b, #0
-; CHECK-NEXT: saddl2 v2.4s, v0.8h, v1.8h
-; CHECK-NEXT: saddl v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
-; CHECK-NEXT: addv s0, v0.4s
-; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: addv b0, v0.16b
+; CHECK-NEXT: smov w8, v0.b[0]
; CHECK-NEXT: neg w0, w8
; CHECK-NEXT: ret
%mask = icmp slt <16 x i8> %a, %b
@@ -172,8 +159,8 @@ define i64 @vmask_popcount_i64_v4i16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: vmask_popcount_i64_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: cmgt v0.4h, v1.4h, v0.4h
-; CHECK-NEXT: saddlv s0, v0.4h
-; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: addv h0, v0.4h
+; CHECK-NEXT: smov w8, v0.h[0]
; CHECK-NEXT: neg w0, w8
; CHECK-NEXT: ret
%mask = icmp slt <4 x i16> %a, %b
@@ -187,8 +174,8 @@ define i64 @vmask_popcount_i64_v8i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: vmask_popcount_i64_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: cmgt v0.8h, v1.8h, v0.8h
-; CHECK-NEXT: saddlv s0, v0.8h
-; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: addv h0, v0.8h
+; CHECK-NEXT: smov w8, v0.h[0]
; CHECK-NEXT: neg w0, w8
; CHECK-NEXT: ret
%mask = icmp slt <8 x i16> %a, %b
@@ -249,8 +236,7 @@ define i64 @vmask_popcount_i64_v2i64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: vmask_popcount_i64_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: cmgt v0.2d, v1.2d, v0.2d
-; CHECK-NEXT: xtn v0.2s, v0.2d
-; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
+; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: neg w0, w8
; CHECK-NEXT: ret
@@ -318,9 +304,8 @@ define i32 @vmask_negate_popcount(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: vmask_negate_popcount:
; CHECK: // %bb.0:
; CHECK-NEXT: cmeq v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: saddlp v0.8h, v0.16b
-; CHECK-NEXT: addv h0, v0.8h
-; CHECK-NEXT: smov w0, v0.h[0]
+; CHECK-NEXT: addv b0, v0.16b
+; CHECK-NEXT: smov w0, v0.b[0]
; CHECK-NEXT: ret
%mask = icmp eq <16 x i8> %a, %b
%t1 = bitcast <16 x i1> %mask to i16
@@ -329,3 +314,22 @@ define i32 @vmask_negate_popcount(<16 x i8> %a, <16 x i8> %b) {
%t4 = sext i16 %t3 to i32
ret i32 %t4
}
+
+define i32 @vmask_popcount_i32_v8i1(<8 x i1> %a, <8 x i1> %b) {
+; CHECK-LABEL: vmask_popcount_i32_v8i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v2.8b, #1
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: eor v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: shl v0.8b, v0.8b, #7
+; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
+; CHECK-NEXT: addv b0, v0.8b
+; CHECK-NEXT: smov w8, v0.b[0]
+; CHECK-NEXT: neg w0, w8
+; CHECK-NEXT: ret
+ %mask = icmp eq <8 x i1> %a, %b
+ %t1 = bitcast <8 x i1> %mask to i8
+ %t2 = call i8 @llvm.ctpop(i8 %t1)
+ %t3 = zext i8 %t2 to i32
+ ret i32 %t3
+}
More information about the llvm-commits
mailing list