[llvm] [AggressiveInstCombine] POPCNT generation for bit-count pattern (PR #180917)
Rohit Aggarwal via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 11 03:15:32 PST 2026
https://github.com/rohitaggarwal007 created https://github.com/llvm/llvm-project/pull/180917
The proposal is to enhance LLVM's AggressiveInstCombine pass to recognize the Hacker's Delight bit-counting sequence and replace it with the llvm.ctpop intrinsic, which lowers to the hardware POPCNT instruction where available.
PR #177109 added the first pattern of this popcount fold; this patch handles two further variants of the sequence.
From b1ae92139795d550cc94d425432c90cab1501241 Mon Sep 17 00:00:00 2001
From: Rohit Aggarwal <Rohit.Aggarwal at amd.com>
Date: Wed, 21 Jan 2026 12:52:41 +0530
Subject: [PATCH] [AggressiveInstCombine] POPCNT generation for bit-count
pattern
---
.../AggressiveInstCombine.cpp | 90 +++++++++++++++++++
.../AggressiveInstCombine/popcount.ll | 48 ++++++++++
2 files changed, 138 insertions(+)
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 3341368208c24..5ca2fa14e1533 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -372,6 +372,95 @@ static bool tryToRecognizePopCount(Instruction &I) {
return false;
}
+// Try to recognize the two functions below as a population count and replace
+// them with the ctpop intrinsic.  See Hacker's Delight (2nd ed.), chapter 5;
+// TargetLowering::expandCTPOP() emits the same sequence.
+//
+// int popcnt(unsigned x) {
+//   x = x - ((x >> 1) & 0x55555555);
+//   x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
+//   x = (x + (x >> 4)) & 0x0F0F0F0F;
+//   x = x + (x >> 8);
+//   x = x + (x >> 16);
+//   return x & 0x0000003F;
+// }
+//
+// int popcnt(unsigned x) {
+//   x = x - ((x >> 1) & 0x55555555);
+//   x = x + (-3) * ((x >> 2) & 0x33333333); // i.e. x - 3*((x >> 2) & ...)
+//   x = (x + (x >> 4)) & 0x0F0F0F0F;
+//   x = x + (x >> 8);
+//   x = x + (x >> 16);
+//   return x & 0x0000003F;
+// }
+static bool tryToRecognizePopCount2n3(Instruction &I) {
+  if (I.getOpcode() != Instruction::And)
+    return false;
+
+  Type *Ty = I.getType();
+  if (!Ty->isIntOrIntVectorTy())
+    return false;
+
+  // The magic constants below (0x55555555, 0x33333333, 0x0F0F0F0F) are
+  // written for a 32-bit word and the reduction stops after the "x >> 16"
+  // step, so this sequence only computes a population count for exactly
+  // 32-bit elements.  Wider types need different masks and extra steps, and
+  // matching them here would be a miscompile.
+  if (Ty->getScalarSizeInBits() != 32)
+    return false;
+
+  // Match bottom-up, starting from "(x + (x >> 16)) & 63".  The operand
+  // added to the shift must be the very value being shifted; m_Deferred
+  // enforces that (binding it with a second m_Value would accept arbitrary
+  // unrelated values).
+  Value *Add16;
+  if (!match(&I, m_And(m_c_Add(m_LShr(m_Value(Add16), m_SpecificInt(16)),
+                               m_Deferred(Add16)),
+                       m_SpecificInt(63))))
+    return false;
+
+  // Match "x = x + (x >> 8)" where x is the 0x0F0F0F0F-masked value.
+  Value *MaskedNibbles;
+  if (!match(Add16,
+             m_c_Add(m_LShr(m_Value(MaskedNibbles), m_SpecificInt(8)),
+                     m_Deferred(MaskedNibbles))))
+    return false;
+
+  // Match "x = (x + (x >> 4)) & 0x0F0F0F0F".
+  Value *Add2Bit;
+  if (!match(MaskedNibbles,
+             m_c_And(m_c_Add(m_LShr(m_Value(Add2Bit), m_SpecificInt(4)),
+                             m_Deferred(Add2Bit)),
+                     m_SpecificInt(0x0F0F0F0F))))
+    return false;
+
+  // Match either form of the 2-bit accumulation step:
+  //   "x = (x & 0x33333333) + ((x >> 2) & 0x33333333)"  (first variant), or
+  //   "x = x + (-3) * ((x >> 2) & 0x33333333)"          (second variant).
+  Value *Sub1;
+  const APInt NegThree(/*numBits=*/32, /*val=*/-3, /*isSigned=*/true);
+  if (!(match(Add2Bit,
+              m_c_Add(m_c_And(m_LShr(m_Value(Sub1), m_SpecificInt(2)),
+                              m_SpecificInt(0x33333333)),
+                      m_c_And(m_Deferred(Sub1), m_SpecificInt(0x33333333)))) ||
+        match(Add2Bit,
+              m_c_Add(m_c_Mul(m_c_And(m_LShr(m_Value(Sub1), m_SpecificInt(2)),
+                                      m_SpecificInt(0x33333333)),
+                              m_SpecificInt(NegThree)),
+                      m_Deferred(Sub1)))))
+    return false;
+
+  // Match the first step, "x = x - ((x >> 1) & 0x55555555)", which also
+  // identifies the root value whose set bits are being counted.
+  Value *Root;
+  if (!match(Sub1, m_Sub(m_Value(Root),
+                         m_c_And(m_LShr(m_Deferred(Root), m_SpecificInt(1)),
+                                 m_SpecificInt(0x55555555)))))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "Recognized popcount intrinsic\n");
+  IRBuilder<> Builder(&I);
+  I.replaceAllUsesWith(
+      Builder.CreateIntrinsic(Intrinsic::ctpop, Ty, {Root}));
+  ++NumPopCountRecognized;
+  return true;
+}
+
/// Fold smin(smax(fptosi(x), C1), C2) to llvm.fptosi.sat(x), providing C1 and
/// C2 saturate the value of the fp conversion. The transform is not reversable
/// as the fptosi.sat is more defined than the input - all values produce a
@@ -1826,6 +1915,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
MadeChange |= foldAnyOrAllBitsSet(I);
MadeChange |= foldGuardedFunnelShift(I, DT);
MadeChange |= tryToRecognizePopCount(I);
+ MadeChange |= tryToRecognizePopCount2n3(I);
MadeChange |= tryToFPToSat(I, TTI);
MadeChange |= tryToRecognizeTableBasedCttz(I, DL);
MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
diff --git a/llvm/test/Transforms/AggressiveInstCombine/popcount.ll b/llvm/test/Transforms/AggressiveInstCombine/popcount.ll
index f56cab1503531..2c9fde6608984 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/popcount.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/popcount.ll
@@ -239,3 +239,51 @@ define i32 @popcount64_mask(i64 %x) {
%13 = trunc nuw nsw i64 %12 to i32
ret i32 %13
}
+
+; Variant 1 of the Hacker's Delight popcount: the 2-bit accumulation step
+; masks both halves with 0x33333333 separately.  Must fold to a single
+; llvm.ctpop.i32 call.
+define dso_local noundef range(i32 0, 64) i32 @popcnt2(i32 noundef %0) local_unnamed_addr {
+; CHECK-LABEL: define dso_local noundef range(i32 0, 64) i32 @popcnt2(
+; CHECK-SAME: i32 noundef [[TMP0:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP0]])
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+ %2 = lshr i32 %0, 1
+ %3 = and i32 %2, 1431655765
+ %4 = sub i32 %0, %3
+ %5 = and i32 %4, 858993459
+ %6 = lshr i32 %4, 2
+ %7 = and i32 %6, 858993459
+ %8 = add nuw nsw i32 %7, %5
+ %9 = lshr i32 %8, 4
+ %10 = add nuw nsw i32 %9, %8
+ %11 = and i32 %10, 252645135
+ %12 = lshr i32 %11, 8
+ %13 = add nuw nsw i32 %12, %11
+ %14 = lshr i32 %13, 16
+ %15 = add nuw nsw i32 %14, %13
+ %16 = and i32 %15, 63
+ ret i32 %16
+}
+
+; Variant 2: the 2-bit accumulation step is expressed as
+; x + (-3) * ((x >> 2) & 0x33333333).  Must also fold to llvm.ctpop.i32.
+define dso_local noundef range(i32 0, 64) i32 @popcnt3(i32 noundef %0) local_unnamed_addr {
+; CHECK-LABEL: define dso_local noundef range(i32 0, 64) i32 @popcnt3(
+; CHECK-SAME: i32 noundef [[TMP0:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP0]])
+; CHECK-NEXT: ret i32 [[TMP16]]
+;
+ %2 = lshr i32 %0, 1
+ %3 = and i32 %2, 1431655765
+ %4 = sub i32 %0, %3
+ %5 = lshr i32 %4, 2
+ %6 = and i32 %5, 858993459
+ %7 = mul i32 %6, -3
+ %8 = add i32 %7, %4
+ %9 = lshr i32 %8, 4
+ %10 = add i32 %9, %8
+ %11 = and i32 %10, 252645135
+ %12 = lshr i32 %11, 8
+ %13 = add nuw nsw i32 %12, %11
+ %14 = lshr i32 %13, 16
+ %15 = add nuw nsw i32 %14, %13
+ %16 = and i32 %15, 63
+ ret i32 %16
+}
More information about the llvm-commits
mailing list