[compiler-rt] [llvm] Adjust and optimize CTPOP lowering (PR #181499)

Sat Feb 14 14:53:39 PST 2026

https://github.com/SiliconA-Z updated https://github.com/llvm/llvm-project/pull/181499

>From e9be47ac83461b79a7456c2f9921d9b870816304 Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Sat, 14 Feb 2026 13:14:15 -0500
Subject: [PATCH] Adjust and optimize CTPOP lowering

Unify and correct popcount builtin implementations and optimize CTPOP lowering.

- compiler-rt: Simplify variable names in popcountdi2/popcountsi2/popcountti2, and fix the sequence of shifts, adds, and masks so that byte-wise sums are accumulated correctly and the final result is masked to 8 bits. Remove incorrect intermediate casts and comments, and clarify the per-stage widths.
- LLVM GlobalISel & SelectionDAG: Add a cheap shift-add lowering for scalar 16- and 32-bit CTPOP to avoid the multiplication when shift-add is cheaper, use logical right shifts when summing bytes horizontally, and mask the final result to 0xFF. Also fix the generic horizontal-sum path to use LShr instead of Shl and to AND the final byte.

These changes address correctness issues in the popcount reductions and provide a more efficient lowering path for common small scalar sizes.
---
 compiler-rt/lib/builtins/popcountdi2.c        | 21 ++++++------
 compiler-rt/lib/builtins/popcountsi2.c        |  8 ++---
 compiler-rt/lib/builtins/popcountti2.c        | 33 +++++++++----------
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    | 14 +++++++-
 4 files changed, 43 insertions(+), 33 deletions(-)

diff --git a/compiler-rt/lib/builtins/popcountdi2.c b/compiler-rt/lib/builtins/popcountdi2.c
index 20dd0b0239efc..662c501f136f8 100644
--- a/compiler-rt/lib/builtins/popcountdi2.c
+++ b/compiler-rt/lib/builtins/popcountdi2.c
@@ -15,18 +15,17 @@
 // Returns: count of 1 bits
 
 COMPILER_RT_ABI int __popcountdi2(di_int a) {
-  du_int x2 = (du_int)a;
-  x2 = x2 - ((x2 >> 1) & 0x5555555555555555uLL);
+  du_int x = (du_int)a;
+  x = x - ((x >> 1) & 0x5555555555555555uLL);
   // Every 2 bits holds the sum of every pair of bits (32)
-  x2 = ((x2 >> 2) & 0x3333333333333333uLL) + (x2 & 0x3333333333333333uLL);
+  x = ((x >> 2) & 0x3333333333333333uLL) + (x & 0x3333333333333333uLL);
   // Every 4 bits holds the sum of every 4-set of bits (3 significant bits) (16)
-  x2 = (x2 + (x2 >> 4)) & 0x0F0F0F0F0F0F0F0FuLL;
-  // Every 8 bits holds the sum of every 8-set of bits (4 significant bits) (8)
-  su_int x = (su_int)(x2 + (x2 >> 32));
-  // The lower 32 bits hold four 16 bit sums (5 significant bits).
-  //   Upper 32 bits are garbage
+  x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FuLL;
+  // Every 8 bits holds the sum of every 8-set of bits (4 significant bits)
+  x = x + (x >> 8);
+  // Every 16 bits holds the sum of every 16-set of bits (5 significant bits)
   x = x + (x >> 16);
-  // The lower 16 bits hold two 32 bit sums (6 significant bits).
-  //   Upper 16 bits are garbage
-  return (x + (x >> 8)) & 0x0000007F; // (7 significant bits)
+  // Every 32 bits holds the sum of every 32-set of bits (6 significant bits)
+  // Upper bits are garbage
+  return (x + (x >> 32)) & 0xFF; // (7 significant bits)
 }
diff --git a/compiler-rt/lib/builtins/popcountsi2.c b/compiler-rt/lib/builtins/popcountsi2.c
index 4d346c45d9cee..8c84046850fb3 100644
--- a/compiler-rt/lib/builtins/popcountsi2.c
+++ b/compiler-rt/lib/builtins/popcountsi2.c
@@ -22,8 +22,8 @@ COMPILER_RT_ABI int __popcountsi2(si_int a) {
   // Every 4 bits holds the sum of every 4-set of bits (3 significant bits)
   x = (x + (x >> 4)) & 0x0F0F0F0F;
   // Every 8 bits holds the sum of every 8-set of bits (4 significant bits)
-  x = (x + (x >> 16));
-  // The lower 16 bits hold two 8 bit sums (5 significant bits).
-  //    Upper 16 bits are garbage
-  return (x + (x >> 8)) & 0x0000003F; // (6 significant bits)
+  x = x + (x >> 8);
+  // Every 16 bits holds the sum of every 16-set of bits (5 significant bits)
+  // Upper bits are garbage
+  return (x + (x >> 16)) & 0xFF;
 }
diff --git a/compiler-rt/lib/builtins/popcountti2.c b/compiler-rt/lib/builtins/popcountti2.c
index 79cbb2fb34c00..4c69a7586cd30 100644
--- a/compiler-rt/lib/builtins/popcountti2.c
+++ b/compiler-rt/lib/builtins/popcountti2.c
@@ -18,26 +18,25 @@
 // Returns: count of 1 bits
 
 COMPILER_RT_ABI int __popcountti2(ti_int a) {
-  tu_int x3 = (tu_int)a;
-  x3 = x3 - ((x3 >> 1) &
-             (((tu_int)0x5555555555555555uLL << 64) | 0x5555555555555555uLL));
+  tu_int x = (tu_int)a;
+  x = x - ((x >> 1) &
+           (((tu_int)0x5555555555555555uLL << 64) | 0x5555555555555555uLL));
   // Every 2 bits holds the sum of every pair of bits (64)
-  x3 = ((x3 >> 2) &
-        (((tu_int)0x3333333333333333uLL << 64) | 0x3333333333333333uLL)) +
-       (x3 & (((tu_int)0x3333333333333333uLL << 64) | 0x3333333333333333uLL));
+  x = ((x >> 2) &
+       (((tu_int)0x3333333333333333uLL << 64) | 0x3333333333333333uLL)) +
+      (x & (((tu_int)0x3333333333333333uLL << 64) | 0x3333333333333333uLL));
   // Every 4 bits holds the sum of every 4-set of bits (3 significant bits) (32)
-  x3 = (x3 + (x3 >> 4)) &
-       (((tu_int)0x0F0F0F0F0F0F0F0FuLL << 64) | 0x0F0F0F0F0F0F0F0FuLL);
-  // Every 8 bits holds the sum of every 8-set of bits (4 significant bits) (16)
-  du_int x2 = (du_int)(x3 + (x3 >> 64));
-  // Every 8 bits holds the sum of every 8-set of bits (5 significant bits) (8)
-  su_int x = (su_int)(x2 + (x2 >> 32));
-  // Every 8 bits holds the sum of every 8-set of bits (6 significant bits) (4)
+  x = (x + (x >> 4)) &
+      (((tu_int)0x0F0F0F0F0F0F0F0FuLL << 64) | 0x0F0F0F0F0F0F0F0FuLL);
+  // Every 8 bits holds the sum of every 8-set of bits (4 significant bits)
+  x = x + (x >> 8);
+  // Every 16 bits holds the sum of every 16-set of bits (5 significant bits)
   x = x + (x >> 16);
-  // Every 8 bits holds the sum of every 8-set of bits (7 significant bits) (2)
-  //
-  // Upper 16 bits are garbage
-  return (x + (x >> 8)) & 0xFF; // (8 significant bits)
+  // Every 32 bits holds the sum of every 32-set of bits (6 significant bits)
+  x = x + (x >> 32);
+  // Every 64 bits holds the sum of every 64-set of bits (7 significant bits)
+  // Upper bits are garbage
+  return (x + (x >> 64)) & 0xFF;
 }
 
 #endif // CRT_HAS_128BIT
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 41b79ccb4e4d1..58e46f4f897e5 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -7799,7 +7799,19 @@ LegalizerHelper::lowerBitCount(MachineInstr &MI) {
     auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
     auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);
 
-    assert(Size<=128 && "Scalar size is too large for CTPOP lower algorithm");
+    assert(Size <= 128 && "Scalar size is too large for CTPOP lower algorithm");
+
+    // Avoid the multiply when shift-add is cheaper.
+    if (Size == 16 && !Ty.isVector()) {
+      // v = (v + (v >> 8)) & 0xFF;
+      auto C_8 = B.buildConstant(Ty, 8);
+      auto HighSum = B.buildLShr(Ty, B8Count, C_8);
+      auto Res = B.buildAdd(Ty, B8Count, HighSum);
+      B.buildAnd(MI.getOperand(0).getReg(), Res, B.buildConstant(Ty, 0xFF));
+      MI.eraseFromParent();
+      return Legalized;
+    }
+
     // 8 bits can hold CTPOP result of 128 bit int or smaller. Mul with this
     // bitmask will set 8 msb in ResTmp to sum of all B8Counts in 8 bit blocks.
     auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));