[llvm] [AggressiveInstCombine] Match the pattern and generate ctlz function call (PR #177110)

Tue Jan 20 23:47:05 PST 2026

https://github.com/rohitaggarwal007 created https://github.com/llvm/llvm-project/pull/177110

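Recognize the De Bruijn table-based algorithm (the DeBruijnClz lookup table) for counting leading zeros and replace it with a call to the `llvm.ctlz` intrinsic, which can be lowered to a hardware ctlz instruction.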

>From b5001efa5b78af797c390414d8bce94bfdf7adce Mon Sep 17 00:00:00 2001
From: Rohit Aggarwal <Rohit.Aggarwal at amd.com>
Date: Wed, 21 Jan 2026 13:08:03 +0530
Subject: [PATCH] [AggressiveInstCombine] Match the pattern and generate ctlz
 function call

---
 .../AggressiveInstCombine.cpp                 | 159 ++++++++++++++++++
 .../lower-table-based-ctlz.ll                 |  42 +++++
 2 files changed, 201 insertions(+)
 create mode 100644 llvm/test/Transforms/AggressiveInstCombine/lower-table-based-ctlz.ll

diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 3341368208c24..3733f13d0cc0d 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -482,6 +482,48 @@ static bool isCTTZTable(Constant *Table, const APInt &Mul, const APInt &Shift,
   return true;
 }
 
+struct CtlzConstants { // Constants matched from a table-based ctlz candidate.
+  uint64_t Mul;         // Multiplier applied to the or-ed value.
+  uint64_t ShiftConst1; // Shift of the innermost `x | (x >> C)` step.
+  uint64_t ShiftConst2; // Shift of the second or-step.
+  uint64_t ShiftConst3; // Shift of the third or-step.
+  uint64_t ShiftConst4; // Shift of the fourth or-step.
+  uint64_t ShiftConst5; // Shift of the outermost or-step.
+  uint64_t ShiftConst6; // Shift applied to the product to form the index.
+};
+
+// Check if this array of constants is the De Bruijn table of the 32-bit ctlz
+// idiom and if \p TableConstant holds the constants that go with it: the
+// multiplier 0x07C4ACDD, or-cascade shifts 1, 2, 4, 8, 16 and a final shift of
+// 27 that keeps the top 5 bits of the product as the table index.
+static bool isCTLZTable(const ConstantDataArray &Table,
+                        CtlzConstants &TableConstant, uint64_t InputBits) {
+  constexpr unsigned TableSize = 32;
+  static const uint32_t DeBruijnClz[TableSize] = {
+      0, 9,  1,  10, 13, 21, 2,  29, 11, 14, 16, 18, 22, 25, 3, 30,
+      8, 12, 20, 28, 15, 17, 24, 7,  19, 27, 23, 6,  26, 5,  4, 31};
+
+  if (InputBits != 32)
+    return false;
+
+  if (TableConstant.Mul != 130329821 || TableConstant.ShiftConst6 != 27 ||
+      TableConstant.ShiftConst5 != 16 || TableConstant.ShiftConst4 != 8 ||
+      TableConstant.ShiftConst3 != 4 || TableConstant.ShiftConst2 != 2 ||
+      TableConstant.ShiftConst1 != 1)
+    return false;
+
+  // Require exactly TableSize integer entries: a longer table would read past
+  // the end of DeBruijnClz, and a non-integer one cannot be read as integers.
+  if (!Table.getElementType()->isIntegerTy() ||
+      Table.getNumElements() != TableSize)
+    return false;
+
+  for (unsigned I = 0; I != TableSize; ++I)
+    if (Table.getElementAsInteger(I) != DeBruijnClz[I])
+      return false;
+  return true;
+}
+
 // Try to recognize table-based ctz implementation.
 // E.g., an example in C (for more cases please see the llvm/tests):
 // int f(unsigned x) {
@@ -626,6 +668,122 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I, const DataLayout &DL) {
   return true;
 }
 
+// Try to recognize table-based ctlz implementation.
+// E.g., an example in C (for more cases please see the llvm/tests):
+// int f(unsigned val) {
+//   static const unsigned DeBruijnClz[32] = {0,  9,  1,  10, 13, 21, 2,  29,
+//                                            11, 14, 16, 18, 22, 25, 3,  30,
+//                                            8,  12, 20, 28, 15, 17, 24, 7,
+//                                            19, 27, 23, 6,  26, 5,  4,  31};
+//   val |= val >> 1;
+//   val |= val >> 2;
+//   val |= val >> 4;
+//   val |= val >> 8;
+//   val |= val >> 16;
+//   return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27];
+// }
+// The or-cascade smears the highest set bit downwards, so the table lookup
+// yields floor(log2(val)) and the subtraction turns it into ctlz(val).
+// For val == 0 the idiom reads table element 0 and returns 31 - table[0],
+// which differs from ctlz(0) == 32, so that case is preserved with a select.
+//
+// Only the 32-bit form is recognized; it is lowered to @llvm.ctlz.i32.
+static bool tryToRecognizeTableBasedCtlz(Instruction &I) {
+  // The root of the idiom is `SubConst - load(table[index])`.
+  uint64_t SubConst;
+  Value *LV;
+  if (!match(&I, m_Sub(m_ConstantInt(SubConst), m_Value(LV))))
+    return false;
+
+  LoadInst *LI = dyn_cast<LoadInst>(LV);
+  if (!LI)
+    return false;
+
+  Type *AccessType = LI->getType();
+  if (!AccessType->isIntegerTy())
+    return false;
+
+  // The address must be `gep inbounds [32 x T], @table, 0, index` so that the
+  // second index selects a table element directly.
+  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getPointerOperand());
+  if (!GEP || !GEP->isInBounds() || GEP->getNumIndices() != 2)
+    return false;
+
+  Type *TableType = GEP->getSourceElementType();
+  if (!TableType->isArrayTy() || TableType->getArrayNumElements() != 32)
+    return false;
+
+  // Only a constant global with a visible initializer can be folded away.
+  GlobalVariable *GVTable = dyn_cast<GlobalVariable>(GEP->getPointerOperand());
+  if (!GVTable || !GVTable->hasInitializer() || !GVTable->isConstant())
+    return false;
+
+  // The GEP and the load must view the global exactly as it is defined;
+  // otherwise element N of the initializer is not what index N loads.
+  if (GVTable->getValueType() != TableType ||
+      TableType->getArrayElementType() != AccessType)
+    return false;
+
+  ConstantDataArray *ConstData =
+      dyn_cast<ConstantDataArray>(GVTable->getInitializer());
+  if (!ConstData)
+    return false;
+
+  // The first GEP index only steps over the pointer and must be zero.
+  if (!match(GEP->idx_begin()->get(), m_ZeroInt()))
+    return false;
+
+  // The table index is `(Smeared * MulConst) >> IndexShift`.
+  Value *Idx2 = std::next(GEP->idx_begin())->get();
+  uint64_t MulConst, IndexShift;
+  Value *Smeared;
+  if (!match(Idx2, m_ZExtOrSelf(m_LShr(
+                       m_Mul(m_Value(Smeared), m_ConstantInt(MulConst)),
+                       m_ConstantInt(IndexShift)))))
+    return false;
+
+  // Walk the or-cascade from the outermost `x | (x >> 16)` down to the
+  // innermost `x | (x >> 1)`. Or is commutative, so accept either order.
+  uint64_t OrShift[5];
+  Value *X = Smeared;
+  for (int Step = 4; Step >= 0; --Step) {
+    Value *Src;
+    if (!match(X, m_c_Or(m_LShr(m_Value(Src), m_ConstantInt(OrShift[Step])),
+                         m_Deferred(Src))))
+      return false;
+    X = Src;
+  }
+
+  // Only the 32-bit idiom is supported, which fixes the index shift at 27
+  // (top 5 bits of the product) and the minuend constant at 31.
+  Type *XType = X->getType();
+  unsigned InputBits = XType->getScalarSizeInBits();
+  if (!XType->isIntegerTy(32) || IndexShift != 27 || SubConst != 31)
+    return false;
+
+  // The table contents and the remaining constants are validated together.
+  CtlzConstants TableConstants = {MulConst,   OrShift[0], OrShift[1],
+                                  OrShift[2], OrShift[3], OrShift[4],
+                                  IndexShift};
+  if (!isCTLZTable(*ConstData, TableConstants, InputBits))
+    return false;
+
+  // For a zero input the cascade and the product stay zero, so the idiom loads
+  // table element 0 and yields SubConst - table[0] rather than ctlz(0).
+  uint64_t ZeroResult = SubConst - ConstData->getElementAsInteger(0);
+
+  IRBuilder<> B(LI);
+  // Zero is handled by the select below, so ctlz may be poison for it.
+  auto *Ctlz = B.CreateIntrinsic(Intrinsic::ctlz, {XType}, {X, B.getTrue()});
+  auto *IsZero = B.CreateICmpEQ(X, ConstantInt::get(XType, 0));
+  auto *Select =
+      B.CreateSelect(IsZero, ConstantInt::get(XType, ZeroResult), Ctlz);
+
+  // The sub has the load's type, which may differ from the input type.
+  I.replaceAllUsesWith(B.CreateZExtOrTrunc(Select, AccessType));
+  return true;
+}
+
 /// This is used by foldLoadsRecursive() to capture a Root Load node which is
 /// of type or(load, load) and recursively build the wide load. Also capture the
 /// shift amount, zero extend type and loadSize.
@@ -1828,6 +1986,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
       MadeChange |= tryToRecognizePopCount(I);
       MadeChange |= tryToFPToSat(I, TTI);
       MadeChange |= tryToRecognizeTableBasedCttz(I, DL);
+      MadeChange |= tryToRecognizeTableBasedCtlz(I);
       MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
       MadeChange |= foldPatternedLoads(I, DL);
       MadeChange |= foldICmpOrChain(I, DL, TTI, AA, DT);
diff --git a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-ctlz.ll b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-ctlz.ll
new file mode 100644
index 0000000000000..3b1bf9a345165
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-ctlz.ll
@@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt < %s -passes=aggressive-instcombine -S | FileCheck %s
+
+@ZSTD_countLeadingZeros32_fallback.DeBruijnClz = internal unnamed_addr constant [32 x i32] [i32 0, i32 9, i32 1, i32 10, i32 13, i32 21, i32 2, i32 29, i32 11, i32 14, i32 16, i32 18, i32 22, i32 25, i32 3, i32 30, i32 8, i32 12, i32 20, i32 28, i32 15, i32 17, i32 24, i32 7, i32 19, i32 27, i32 23, i32 6, i32 26, i32 5, i32 4, i32 31], align 16
+
+; ZSTD's De Bruijn ctlz fallback. For %val == 0 it returns 31 - table[0] = 31.
+define dso_local i32 @ZSTD_countLeadingZeros32_fallback(i32 noundef %val) local_unnamed_addr {
+; CHECK-LABEL: define dso_local i32 @ZSTD_countLeadingZeros32_fallback
+; CHECK-SAME: (i32 noundef [[VAL:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[VAL]], 0
+; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    [[CALL:%.*]] = tail call i32 (i32, ...) @assert(i32 noundef [[CONV]])
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[VAL]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[VAL]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 31, i32 [[TMP0]]
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+entry:
+  %cmp = icmp ne i32 %val, 0
+  %conv = zext i1 %cmp to i32
+  %call = tail call i32 (i32, ...) @assert(i32 noundef %conv)
+  %shr = lshr i32 %val, 1
+  %or = or i32 %shr, %val
+  %shr1 = lshr i32 %or, 2
+  %or2 = or i32 %shr1, %or
+  %shr3 = lshr i32 %or2, 4
+  %or4 = or i32 %shr3, %or2
+  %shr5 = lshr i32 %or4, 8
+  %or6 = or i32 %shr5, %or4
+  %shr7 = lshr i32 %or6, 16
+  %or8 = or i32 %shr7, %or6
+  %mul = mul i32 %or8, 130329821
+  %shr9 = lshr i32 %mul, 27
+  %idxprom = zext i32 %shr9 to i64
+  %arrayidx = getelementptr inbounds [32 x i32], ptr @ZSTD_countLeadingZeros32_fallback.DeBruijnClz, i64 0, i64 %idxprom
+  %0 = load i32, ptr %arrayidx, align 4
+  %sub = sub i32 31, %0
+  ret i32 %sub
+}
+
+declare i32 @assert(...) local_unnamed_addr