[compiler-rt] [llvm] Adjust and optimize CTPOP lowering (PR #181499)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Feb 14 14:53:39 PST 2026
https://github.com/SiliconA-Z updated https://github.com/llvm/llvm-project/pull/181499
>From e9be47ac83461b79a7456c2f9921d9b870816304 Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Sat, 14 Feb 2026 13:14:15 -0500
Subject: [PATCH] Adjust and optimize CTPOP lowering
Unify and correct popcount builtin implementations and optimize CTPOP lowering.
- compiler-rt: Simplify variable names in popcountdi2/popcountsi2/popcountti2, and fix the sequence of shifts, adds, and masks so that byte-wise sums are accumulated correctly and the final result is masked to 8 bits. Remove incorrect intermediate casts and comments, and clarify the per-stage widths.
- LLVM GlobalISel & SelectionDAG: Add a cheap shift-add lowering for scalar 16- and 32-bit CTPOP that avoids the multiplication when shift-add is cheaper, use logical right shifts when summing bytes horizontally, and mask the final result to 0xFF. Also fix the generic horizontal-sum path to use LShr instead of Shl and to mask the result down to its low byte with an AND.
These changes address correctness in the popcount reductions and provide a more efficient lowering path for common small scalar sizes.
---
compiler-rt/lib/builtins/popcountdi2.c | 21 ++++++------
compiler-rt/lib/builtins/popcountsi2.c | 8 ++---
compiler-rt/lib/builtins/popcountti2.c | 33 +++++++++----------
.../CodeGen/GlobalISel/LegalizerHelper.cpp | 14 +++++++-
4 files changed, 43 insertions(+), 33 deletions(-)
diff --git a/compiler-rt/lib/builtins/popcountdi2.c b/compiler-rt/lib/builtins/popcountdi2.c
index 20dd0b0239efc..662c501f136f8 100644
--- a/compiler-rt/lib/builtins/popcountdi2.c
+++ b/compiler-rt/lib/builtins/popcountdi2.c
@@ -15,18 +15,17 @@
// Returns: count of 1 bits
COMPILER_RT_ABI int __popcountdi2(di_int a) {
- du_int x2 = (du_int)a;
- x2 = x2 - ((x2 >> 1) & 0x5555555555555555uLL);
+ du_int x = (du_int)a;
+ x = x - ((x >> 1) & 0x5555555555555555uLL);
// Every 2 bits holds the sum of every pair of bits (32)
- x2 = ((x2 >> 2) & 0x3333333333333333uLL) + (x2 & 0x3333333333333333uLL);
+ x = ((x >> 2) & 0x3333333333333333uLL) + (x & 0x3333333333333333uLL);
// Every 4 bits holds the sum of every 4-set of bits (3 significant bits) (16)
- x2 = (x2 + (x2 >> 4)) & 0x0F0F0F0F0F0F0F0FuLL;
- // Every 8 bits holds the sum of every 8-set of bits (4 significant bits) (8)
- su_int x = (su_int)(x2 + (x2 >> 32));
- // The lower 32 bits hold four 16 bit sums (5 significant bits).
- // Upper 32 bits are garbage
+ x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FuLL;
+ // Every 8 bits holds the sum of every 8-set of bits (4 significant bits)
+ x = x + (x >> 8);
+ // Every 16 bits holds the sum of every 16-set of bits (5 significant bits)
x = x + (x >> 16);
- // The lower 16 bits hold two 32 bit sums (6 significant bits).
- // Upper 16 bits are garbage
- return (x + (x >> 8)) & 0x0000007F; // (7 significant bits)
+ // Every 32 bits holds the sum of every 32-set of bits (6 significant bits)
+ // Upper bits are garbage
+ return (x + (x >> 32)) & 0xFF; // (7 significant bits)
}
diff --git a/compiler-rt/lib/builtins/popcountsi2.c b/compiler-rt/lib/builtins/popcountsi2.c
index 4d346c45d9cee..8c84046850fb3 100644
--- a/compiler-rt/lib/builtins/popcountsi2.c
+++ b/compiler-rt/lib/builtins/popcountsi2.c
@@ -22,8 +22,8 @@ COMPILER_RT_ABI int __popcountsi2(si_int a) {
// Every 4 bits holds the sum of every 4-set of bits (3 significant bits)
x = (x + (x >> 4)) & 0x0F0F0F0F;
// Every 8 bits holds the sum of every 8-set of bits (4 significant bits)
- x = (x + (x >> 16));
- // The lower 16 bits hold two 8 bit sums (5 significant bits).
- // Upper 16 bits are garbage
- return (x + (x >> 8)) & 0x0000003F; // (6 significant bits)
+ x = x + (x >> 8);
+ // Every 16 bits holds the sum of every 16-set of bits (5 significant bits)
+ // Upper bits are garbage
+ return (x + (x >> 16)) & 0xFF;
}
diff --git a/compiler-rt/lib/builtins/popcountti2.c b/compiler-rt/lib/builtins/popcountti2.c
index 79cbb2fb34c00..4c69a7586cd30 100644
--- a/compiler-rt/lib/builtins/popcountti2.c
+++ b/compiler-rt/lib/builtins/popcountti2.c
@@ -18,26 +18,25 @@
// Returns: count of 1 bits
COMPILER_RT_ABI int __popcountti2(ti_int a) {
- tu_int x3 = (tu_int)a;
- x3 = x3 - ((x3 >> 1) &
- (((tu_int)0x5555555555555555uLL << 64) | 0x5555555555555555uLL));
+ tu_int x = (tu_int)a;
+ x = x - ((x >> 1) &
+ (((tu_int)0x5555555555555555uLL << 64) | 0x5555555555555555uLL));
// Every 2 bits holds the sum of every pair of bits (64)
- x3 = ((x3 >> 2) &
- (((tu_int)0x3333333333333333uLL << 64) | 0x3333333333333333uLL)) +
- (x3 & (((tu_int)0x3333333333333333uLL << 64) | 0x3333333333333333uLL));
+ x = ((x >> 2) &
+ (((tu_int)0x3333333333333333uLL << 64) | 0x3333333333333333uLL)) +
+ (x & (((tu_int)0x3333333333333333uLL << 64) | 0x3333333333333333uLL));
// Every 4 bits holds the sum of every 4-set of bits (3 significant bits) (32)
- x3 = (x3 + (x3 >> 4)) &
- (((tu_int)0x0F0F0F0F0F0F0F0FuLL << 64) | 0x0F0F0F0F0F0F0F0FuLL);
- // Every 8 bits holds the sum of every 8-set of bits (4 significant bits) (16)
- du_int x2 = (du_int)(x3 + (x3 >> 64));
- // Every 8 bits holds the sum of every 8-set of bits (5 significant bits) (8)
- su_int x = (su_int)(x2 + (x2 >> 32));
- // Every 8 bits holds the sum of every 8-set of bits (6 significant bits) (4)
+ x = (x + (x >> 4)) &
+ (((tu_int)0x0F0F0F0F0F0F0F0FuLL << 64) | 0x0F0F0F0F0F0F0F0FuLL);
+ // Every 8 bits holds the sum of every 8-set of bits (4 significant bits)
+ x = x + (x >> 8);
+ // Every 16 bits holds the sum of every 16-set of bits (5 significant bits)
x = x + (x >> 16);
- // Every 8 bits holds the sum of every 8-set of bits (7 significant bits) (2)
- //
- // Upper 16 bits are garbage
- return (x + (x >> 8)) & 0xFF; // (8 significant bits)
+ // Every 32 bits holds the sum of every 32-set of bits (6 significant bits)
+ x = x + (x >> 32);
+ // Every 64 bits holds the sum of every 64-set of bits (7 significant bits)
+ // Upper bits are garbage
+ return (x + (x >> 64)) & 0xFF;
}
#endif // CRT_HAS_128BIT
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 41b79ccb4e4d1..58e46f4f897e5 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -7799,7 +7799,19 @@ LegalizerHelper::lowerBitCount(MachineInstr &MI) {
auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);
- assert(Size<=128 && "Scalar size is too large for CTPOP lower algorithm");
+ assert(Size <= 128 && "Scalar size is too large for CTPOP lower algorithm");
+
+ // Avoid the multiply when shift-add is cheaper.
+ if (Size == 16 && !Ty.isVector()) {
+ // v = (v + (v >> 8)) & 0xFF;
+ auto C_8 = B.buildConstant(Ty, 8);
+ auto HighSum = B.buildLShr(Ty, B8Count, C_8);
+ auto Res = B.buildAdd(Ty, B8Count, HighSum);
+ B.buildAnd(MI.getOperand(0).getReg(), Res, B.buildConstant(Ty, 0xFF));
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
// 8 bits can hold CTPOP result of 128 bit int or smaller. Mul with this
// bitmask will set 8 msb in ResTmp to sum of all B8Counts in 8 bit blocks.
auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));
More information about the llvm-commits
mailing list