[compiler-rt] [llvm] Adjust and optimize CTPOP lowering (PR #181499)

Sat Feb 14 11:01:39 PST 2026

https://github.com/SiliconA-Z created https://github.com/llvm/llvm-project/pull/181499

Unify and correct popcount builtin implementations and optimize CTPOP lowering.

- compiler-rt: Simplify variable names in popcountdi2/popcountsi2/popcountti2, and fix the sequence of shifts, adds, and masks so that byte-wise sums are accumulated correctly and the final result is masked to 8 bits. Remove incorrect intermediate casts and comments, and clarify the per-stage widths.
- LLVM GlobalISel & SelectionDAG: Add a cheap shift-add lowering for scalar 16- and 32-bit CTPOP to avoid the multiplication when shift-add is cheaper, use logical right shifts when summing bytes horizontally, and mask the final result with 0xFF. Also fix the generic horizontal-sum path to use LShr instead of Shl and to AND the final result with 0xFF.

These changes address correctness in the popcount reductions and provide a more efficient lowering path for common small scalar sizes.

>From f7063ee3fdf558cd7186e6605048204d382532b2 Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Sat, 14 Feb 2026 13:14:15 -0500
Subject: [PATCH] Adjust and optimize CTPOP lowering

Unify and correct popcount builtin implementations and optimize CTPOP lowering.

- compiler-rt: Simplify variable names in popcountdi2/popcountsi2/popcountti2, and fix the sequence of shifts, adds, and masks so that byte-wise sums are accumulated correctly and the final result is masked to 8 bits. Remove incorrect intermediate casts and comments, and clarify the per-stage widths.
- LLVM GlobalISel & SelectionDAG: Add a cheap shift-add lowering for scalar 16- and 32-bit CTPOP to avoid the multiplication when shift-add is cheaper, use logical right shifts when summing bytes horizontally, and mask the final result with 0xFF. Also fix the generic horizontal-sum path to use LShr instead of Shl and to AND the final result with 0xFF.

These changes address correctness in the popcount reductions and provide a more efficient lowering path for common small scalar sizes.
---
 compiler-rt/lib/builtins/popcountdi2.c        | 21 ++++++------
 compiler-rt/lib/builtins/popcountsi2.c        |  8 ++---
 compiler-rt/lib/builtins/popcountti2.c        | 33 +++++++++---------
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    | 34 ++++++++++++++++---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   | 33 ++++++++++++------
 5 files changed, 83 insertions(+), 46 deletions(-)

diff --git a/compiler-rt/lib/builtins/popcountdi2.c b/compiler-rt/lib/builtins/popcountdi2.c
index 20dd0b0239efc..662c501f136f8 100644
--- a/compiler-rt/lib/builtins/popcountdi2.c
+++ b/compiler-rt/lib/builtins/popcountdi2.c
@@ -15,18 +15,17 @@
 // Returns: count of 1 bits
 
 COMPILER_RT_ABI int __popcountdi2(di_int a) {
-  du_int x2 = (du_int)a;
-  x2 = x2 - ((x2 >> 1) & 0x5555555555555555uLL);
+  du_int x = (du_int)a;
+  x = x - ((x >> 1) & 0x5555555555555555uLL);
   // Every 2 bits holds the sum of every pair of bits (32)
-  x2 = ((x2 >> 2) & 0x3333333333333333uLL) + (x2 & 0x3333333333333333uLL);
+  x = ((x >> 2) & 0x3333333333333333uLL) + (x & 0x3333333333333333uLL);
   // Every 4 bits holds the sum of every 4-set of bits (3 significant bits) (16)
-  x2 = (x2 + (x2 >> 4)) & 0x0F0F0F0F0F0F0F0FuLL;
-  // Every 8 bits holds the sum of every 8-set of bits (4 significant bits) (8)
-  su_int x = (su_int)(x2 + (x2 >> 32));
-  // The lower 32 bits hold four 16 bit sums (5 significant bits).
-  //   Upper 32 bits are garbage
+  x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FuLL;
+  // Every 8 bits holds the sum of every 8-set of bits (4 significant bits)
+  x = x + (x >> 8);
+  // Every 16 bits holds the sum of every 16-set of bits (5 significant bits)
   x = x + (x >> 16);
-  // The lower 16 bits hold two 32 bit sums (6 significant bits).
-  //   Upper 16 bits are garbage
-  return (x + (x >> 8)) & 0x0000007F; // (7 significant bits)
+  // Every 32 bits holds the sum of every 32-set of bits (6 significant bits)
+  // Upper bits are garbage
+  return (x + (x >> 32)) & 0xFF; // (7 significant bits)
 }
diff --git a/compiler-rt/lib/builtins/popcountsi2.c b/compiler-rt/lib/builtins/popcountsi2.c
index 4d346c45d9cee..8c84046850fb3 100644
--- a/compiler-rt/lib/builtins/popcountsi2.c
+++ b/compiler-rt/lib/builtins/popcountsi2.c
@@ -22,8 +22,8 @@ COMPILER_RT_ABI int __popcountsi2(si_int a) {
   // Every 4 bits holds the sum of every 4-set of bits (3 significant bits)
   x = (x + (x >> 4)) & 0x0F0F0F0F;
   // Every 8 bits holds the sum of every 8-set of bits (4 significant bits)
-  x = (x + (x >> 16));
-  // The lower 16 bits hold two 8 bit sums (5 significant bits).
-  //    Upper 16 bits are garbage
-  return (x + (x >> 8)) & 0x0000003F; // (6 significant bits)
+  x = x + (x >> 8);
+  // Every 16 bits holds the sum of every 16-set of bits (5 significant bits)
+  // Upper bits are garbage
+  return (x + (x >> 16)) & 0xFF;
 }
diff --git a/compiler-rt/lib/builtins/popcountti2.c b/compiler-rt/lib/builtins/popcountti2.c
index 79cbb2fb34c00..4c69a7586cd30 100644
--- a/compiler-rt/lib/builtins/popcountti2.c
+++ b/compiler-rt/lib/builtins/popcountti2.c
@@ -18,26 +18,25 @@
 // Returns: count of 1 bits
 
 COMPILER_RT_ABI int __popcountti2(ti_int a) {
-  tu_int x3 = (tu_int)a;
-  x3 = x3 - ((x3 >> 1) &
-             (((tu_int)0x5555555555555555uLL << 64) | 0x5555555555555555uLL));
+  tu_int x = (tu_int)a;
+  x = x - ((x >> 1) &
+           (((tu_int)0x5555555555555555uLL << 64) | 0x5555555555555555uLL));
   // Every 2 bits holds the sum of every pair of bits (64)
-  x3 = ((x3 >> 2) &
-        (((tu_int)0x3333333333333333uLL << 64) | 0x3333333333333333uLL)) +
-       (x3 & (((tu_int)0x3333333333333333uLL << 64) | 0x3333333333333333uLL));
+  x = ((x >> 2) &
+       (((tu_int)0x3333333333333333uLL << 64) | 0x3333333333333333uLL)) +
+      (x & (((tu_int)0x3333333333333333uLL << 64) | 0x3333333333333333uLL));
   // Every 4 bits holds the sum of every 4-set of bits (3 significant bits) (32)
-  x3 = (x3 + (x3 >> 4)) &
-       (((tu_int)0x0F0F0F0F0F0F0F0FuLL << 64) | 0x0F0F0F0F0F0F0F0FuLL);
-  // Every 8 bits holds the sum of every 8-set of bits (4 significant bits) (16)
-  du_int x2 = (du_int)(x3 + (x3 >> 64));
-  // Every 8 bits holds the sum of every 8-set of bits (5 significant bits) (8)
-  su_int x = (su_int)(x2 + (x2 >> 32));
-  // Every 8 bits holds the sum of every 8-set of bits (6 significant bits) (4)
+  x = (x + (x >> 4)) &
+      (((tu_int)0x0F0F0F0F0F0F0F0FuLL << 64) | 0x0F0F0F0F0F0F0F0FuLL);
+  // Every 8 bits holds the sum of every 8-set of bits (4 significant bits)
+  x = x + (x >> 8);
+  // Every 16 bits holds the sum of every 16-set of bits (5 significant bits)
   x = x + (x >> 16);
-  // Every 8 bits holds the sum of every 8-set of bits (7 significant bits) (2)
-  //
-  // Upper 16 bits are garbage
-  return (x + (x >> 8)) & 0xFF; // (8 significant bits)
+  // Every 32 bits holds the sum of every 32-set of bits (6 significant bits)
+  x = x + (x >> 32);
+  // Every 64 bits holds the sum of every 64-set of bits (7 significant bits)
+  // Upper bits are garbage
+  return (x + (x >> 64)) & 0xFF;
 }
 
 #endif // CRT_HAS_128BIT
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 41b79ccb4e4d1..244e43337d1fa 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -7799,7 +7799,33 @@ LegalizerHelper::lowerBitCount(MachineInstr &MI) {
     auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
     auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);
 
-    assert(Size<=128 && "Scalar size is too large for CTPOP lower algorithm");
+    assert(Size <= 128 && "Scalar size is too large for CTPOP lower algorithm");
+
+    // Avoid the multiply when shift-add is cheaper.
+    if (!Ty.isVector()) {
+      if (Size == 16) {
+        // v = (v + (v >> 8)) & 0xFF;
+        auto C_8 = B.buildConstant(Ty, 8);
+        auto HighSum = B.buildLShr(Ty, B8Count, C_8);
+        auto Res = B.buildAdd(Ty, B8Count, HighSum);
+        B.buildAnd(MI.getOperand(0).getReg(), Res, B.buildConstant(Ty, 0xFF));
+        MI.eraseFromParent();
+        return Legalized;
+      }
+      if (Size == 32) {
+        // v = v + (v >> 8);
+        // v = v + (v >> 16);
+        // return v & 0xFF;
+        auto C_8 = B.buildConstant(Ty, 8);
+        auto C_16 = B.buildConstant(Ty, 16);
+        auto Sum8 = B.buildAdd(Ty, B8Count, B.buildLShr(Ty, B8Count, C_8));
+        auto Sum16 = B.buildAdd(Ty, Sum8, B.buildLShr(Ty, Sum8, C_16));
+        B.buildAnd(MI.getOperand(0).getReg(), Sum16, B.buildConstant(Ty, 0xFF));
+        MI.eraseFromParent();
+        return Legalized;
+      }
+    }
+
     // 8 bits can hold CTPOP result of 128 bit int or smaller. Mul with this
     // bitmask will set 8 msb in ResTmp to sum of all B8Counts in 8 bit blocks.
     auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));
@@ -7818,10 +7844,10 @@ LegalizerHelper::lowerBitCount(MachineInstr &MI) {
       auto ResTmp = B8Count;
       for (unsigned Shift = 8; Shift < Size; Shift *= 2) {
         auto ShiftC = B.buildConstant(Ty, Shift);
-        auto Shl = B.buildShl(Ty, ResTmp, ShiftC);
-        ResTmp = B.buildAdd(Ty, ResTmp, Shl);
+        auto LShr = B.buildLShr(Ty, ResTmp, ShiftC);
+        ResTmp = B.buildAdd(Ty, ResTmp, LShr);
       }
-      B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);
+      B.buildAnd(MI.getOperand(0).getReg(), ResTmp, B.buildConstant(Ty, 0xFF));
     }
     MI.eraseFromParent();
     return Legalized;
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index e4b4d80896fa7..1e2b44443ce35 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -9454,22 +9454,35 @@ SDValue TargetLowering::expandCTPOP(SDNode *Node, SelectionDAG &DAG) const {
                      DAG.getConstant(0xFF, dl, VT));
   }
 
+  if (Len == 32 && !VT.isVector()) {
+    // v = v + (v >> 8);
+    // v = v + (v >> 16);
+    // return v & 0x000000FF;
+    Op = DAG.getNode(
+        ISD::ADD, dl, VT, Op,
+        DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(8, dl, ShVT)));
+    Op = DAG.getNode(
+        ISD::ADD, dl, VT, Op,
+        DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(16, dl, ShVT)));
+    return DAG.getNode(ISD::AND, dl, VT, Op, DAG.getConstant(0xFF, dl, VT));
+  }
+
   // v = (v * 0x01010101...) >> (Len - 8)
-  SDValue V;
   if (isOperationLegalOrCustomOrPromote(
           ISD::MUL, getTypeToTransformTo(*DAG.getContext(), VT))) {
     SDValue Mask01 =
         DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)), dl, VT);
-    V = DAG.getNode(ISD::MUL, dl, VT, Op, Mask01);
-  } else {
-    V = Op;
-    for (unsigned Shift = 8; Shift < Len; Shift *= 2) {
-      SDValue ShiftC = DAG.getShiftAmountConstant(Shift, VT, dl);
-      V = DAG.getNode(ISD::ADD, dl, VT, V,
-                      DAG.getNode(ISD::SHL, dl, VT, V, ShiftC));
-    }
+    SDValue V = DAG.getNode(ISD::MUL, dl, VT, Op, Mask01);
+    return DAG.getNode(ISD::SRL, dl, VT, V, DAG.getConstant(Len - 8, dl, ShVT));
+  }
+
+  // If multiply is not supported, sum the bytes horizontally into the LSB.
+  for (unsigned Shift = 8; Shift < Len; Shift *= 2) {
+    SDValue ShiftC = DAG.getShiftAmountConstant(Shift, VT, dl);
+    Op = DAG.getNode(ISD::ADD, dl, VT, Op,
+                     DAG.getNode(ISD::SRL, dl, VT, Op, ShiftC));
   }
-  return DAG.getNode(ISD::SRL, dl, VT, V, DAG.getConstant(Len - 8, dl, ShVT));
+  return DAG.getNode(ISD::AND, dl, VT, Op, DAG.getConstant(0xFF, dl, VT));
 }
 
 SDValue TargetLowering::expandVPCTPOP(SDNode *Node, SelectionDAG &DAG) const {