[llvm] [AggressiveInstCombine] Recognize table based log2 and replace with ctlz+sub. (PR #185160)

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 12 08:26:29 PDT 2026


https://github.com/topperc updated https://github.com/llvm/llvm-project/pull/185160

>From 67a2c5d87fb4cfbbf3e616a72556452307c8085e Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Fri, 6 Mar 2026 21:13:42 -0800
Subject: [PATCH 1/4] [AggressiveInstCombine] Recognize table based log2 and
 replace with ctlz+sub.

Recognize table based log2 implementations like

unsigned log2(unsigned v) {
  static const unsigned char table[] = {
    0,  9,  1, 10, 13, 21,  2, 29, 11, 14, 16, 18, 22, 25,  3, 30,
    8, 12, 20, 28, 15, 17, 24,  7, 19, 27, 23,  6, 26,  5,  4, 31
  };

  v |= v >> 1;
  v |= v >> 2;
  v |= v >> 4;
  v |= v >> 8;
  v |= v >> 16;

  return table[(unsigned)(v * 0x07C4ACDDU) >> 27];
}

and replaces with 31 - llvm.ctlz(v).

Similar for i64 log2. Other sizes can be supported with correct
multiply constant and table values, but I have not found examples yet.

This code is based on the existing tryToRecognizeTableBasedCttz.
It handles the same pattern as #177110, but does not match the
outer subtract from that patch. It is assumed that InstCombine or
other optimizations can combine (sub 31 (sub 31, ctlz V)) later.

I have limited this to targets that have a fast ctlz. The backend
does not yet have a table based lowering for ctlz so this reduces
the chance of regressions.
---
 .../AggressiveInstCombine.cpp                 | 200 +++++++++++++
 .../X86/lower-table-based-log2-basics.ll      | 141 ++++++++++
 .../X86/lower-table-based-log2-negative.ll    | 264 ++++++++++++++++++
 3 files changed, 605 insertions(+)
 create mode 100644 llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-basics.ll
 create mode 100644 llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-negative.ll

diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 797fabde3f25c..aea8fc607ea6f 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -626,6 +626,205 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I, const DataLayout &DL) {
   return true;
 }
 
+// Check if this array of constants represents a log2 table.
+// Iterate over the elements from \p Table by trying to find/match all
+// the numbers from 0 to \p InputBits that should represent log2 results.
+static bool isLog2Table(Constant *Table, const APInt &Mul, const APInt &Shift,
+                        const APInt &AndMask, Type *AccessTy,
+                        unsigned InputBits, const APInt &GEPIdxFactor,
+                        const DataLayout &DL) {
+  for (unsigned Idx = 0; Idx < InputBits; Idx++) {
+    APInt Index =
+        (APInt::getLowBitsSet(InputBits, Idx + 1) * Mul).lshr(Shift) & AndMask;
+    ConstantInt *C = dyn_cast_or_null<ConstantInt>(
+        ConstantFoldLoadFromConst(Table, AccessTy, Index * GEPIdxFactor, DL));
+    if (!C || C->getValue() != Idx)
+      return false;
+  }
+
+  // Verify that an input of zero will select table index 0.
+  APInt ZeroIndex = Mul.lshr(Shift) & AndMask;
+  if (!ZeroIndex.isZero())
+    return false;
+
+  return true;
+}
+
+// Try to recognize table-based log2 implementation.
+// E.g., an exmapel in C (for more cases please the llvm/tests):
+// int f(unsigned x) {
+//    static const char table[32] =
+//    {0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
+//     8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31};
+//
+//    v |= v >> 1; // first round down to one less than a power of 2
+//    v |= v >> 2;
+//    v |= v >> 4;
+//    v |= v >> 8;
+//    v |= v >> 16;
+//
+//    return table[(unsigned)(v * 0x07C4ACDDU) >> 27];
+// }
+// this can be lowered to `ctlz` instruction.
+// There is also a special case when the element is 0.
+//
+// The >> and |= sequence sets all bits below the most significant set bit. The
+// multiply is a de-bruijn sequence that contains each pattern of bits in it.
+// The shift extracts the top bits after the multiply, and that index into the
+// table should represent the floor log base 2 of the original number.
+//
+// Here are some examples of LLVM IR for a 64-bit target.
+//
+// CASE 1:
+// %shr = lshr i32 %v, 1
+// %or = or i32 %shr, %v
+// %shr1 = lshr i32 %or, 2
+// %or2 = or i32 %shr1, %or
+// %shr3 = lshr i32 %or2, 4
+// %or4 = or i32 %shr3, %or2
+// %shr5 = lshr i32 %or4, 8
+// %or6 = or i32 %shr5, %or4
+// %shr7 = lshr i32 %or6, 16
+// %or8 = or i32 %shr7, %or6
+// %mul = mul i32 %or8, 130329821
+// %shr9 = lshr i32 %mul, 27
+// %idxprom = zext nneg i32 %shr9 to i64
+// %arrayidx = getelementptr inbounds i8, ptr @table, i64 %idxprom
+// %0 = load i8, ptr %arrayidx, align 1
+//
+// CASE 2:
+// %shr = lshr i64 %v, 1
+// %or = or i64 %shr, %v
+// %shr1 = lshr i64 %or, 2
+// %or2 = or i64 %shr1, %or
+// %shr3 = lshr i64 %or2, 4
+// %or4 = or i64 %shr3, %or2
+// %shr5 = lshr i64 %or4, 8
+// %or6 = or i64 %shr5, %or4
+// %shr7 = lshr i64 %or6, 16
+// %or8 = or i64 %shr7, %or6
+// %shr9 = lshr i64 %or8, 32
+// %or10 = or i64 %shr9, %or8
+// %mul = mul i64 %or10, 285870213051386505
+// %shr11 = lshr i64 %mul, 58
+// %arrayidx = getelementptr inbounds i8, ptr @table, i64 %shr11
+// %0 = load i8, ptr %arrayidx, align 1/
+//
+// All these can be lowered to @llvm.cttz.i32/64 intrinsics and a subtract.
+static bool tryToRecognizeTableBasedLog2(Instruction &I, const DataLayout &DL,
+                                         TargetTransformInfo &TTI) {
+  LoadInst *LI = dyn_cast<LoadInst>(&I);
+  if (!LI)
+    return false;
+
+  Type *AccessType = LI->getType();
+  if (!AccessType->isIntegerTy())
+    return false;
+
+  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getPointerOperand());
+  if (!GEP || !GEP->hasNoUnsignedSignedWrap())
+    return false;
+
+  GlobalVariable *GVTable = dyn_cast<GlobalVariable>(GEP->getPointerOperand());
+  if (!GVTable || !GVTable->hasInitializer() || !GVTable->isConstant())
+    return false;
+
+  unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
+  APInt ModOffset(BW, 0);
+  SmallMapVector<Value *, APInt, 4> VarOffsets;
+  if (!GEP->collectOffset(DL, BW, VarOffsets, ModOffset) ||
+      VarOffsets.size() != 1 || ModOffset != 0)
+    return false;
+  auto [GepIdx, GEPScale] = VarOffsets.front();
+
+  Value *X;
+  const APInt *MulConst, *ShiftConst, *AndCst = nullptr;
+  // Check that the gep variable index is (x * MulConst) >> ShiftConst.
+  auto MatchInner =
+      m_LShr(m_Mul(m_Value(X), m_APInt(MulConst)), m_APInt(ShiftConst));
+  if (!match(GepIdx, m_CastOrSelf(MatchInner)))
+    return false;
+
+  unsigned InputBits = X->getType()->getScalarSizeInBits();
+  // TODO: Support more sizes.
+  if (InputBits != 32 && InputBits != 64)
+    return false;
+
+  // Verify shift amount.
+  // TODO: Allow other shift amounts when we have proper test coverage.
+  if (*ShiftConst != InputBits - Log2_32(InputBits))
+    return false;
+
+  if (InputBits >= 64) {
+    Value *Y;
+    if (!match(X, m_c_Or(m_LShr(m_Value(Y), m_SpecificInt(32)), m_Deferred(Y))))
+      return false;
+    X = Y;
+  }
+
+  Value *Y1, *Y2, *Y3, *Y4, *Y5;
+  if (!match(X,
+             m_c_Or(m_LShr(m_Value(Y1), m_SpecificInt(16)), m_Deferred(Y1))) ||
+      !match(Y1,
+             m_c_Or(m_LShr(m_Value(Y2), m_SpecificInt(8)), m_Deferred(Y2))) ||
+      !match(Y2,
+             m_c_Or(m_LShr(m_Value(Y3), m_SpecificInt(4)), m_Deferred(Y3))) ||
+      !match(Y3,
+             m_c_Or(m_LShr(m_Value(Y4), m_SpecificInt(2)), m_Deferred(Y4))) ||
+      !match(Y4, m_c_Or(m_LShr(m_Value(Y5), m_SpecificInt(1)), m_Deferred(Y5))))
+    return false;
+
+  X = Y5;
+
+  if (!GEPScale.isIntN(InputBits) ||
+      !isLog2Table(GVTable->getInitializer(), *MulConst, *ShiftConst,
+                   AndCst ? *AndCst : APInt::getAllOnes(InputBits), AccessType,
+                   InputBits, GEPScale.zextOrTrunc(InputBits), DL))
+    return false;
+
+  ConstantInt *ZeroTableElem = cast<ConstantInt>(
+      ConstantFoldLoadFromConst(GVTable->getInitializer(), AccessType, DL));
+
+  // Use InputBits - 1 - ctlz(X) to compute log2(X).
+  IRBuilder<> B(LI);
+  ConstantInt *BoolConst = B.getTrue();
+  Type *XType = X->getType();
+
+  // Check that the backend has an efficient ctlz instruction.
+  // FIXME: Teach the backend to emit the original code when ctlz isn't
+  // supported like we do for cttz.
+  IntrinsicCostAttributes Attrs(
+      Intrinsic::ctlz, XType,
+      {PoisonValue::get(XType), /*is_zero_poison=*/BoolConst});
+  InstructionCost Cost =
+      TTI.getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
+  if (Cost > TargetTransformInfo::TCC_Basic)
+    return false;
+
+  Value *Ctlz = B.CreateIntrinsic(Intrinsic::ctlz, {XType}, {X, BoolConst});
+
+  Constant *InputBitsM1 = ConstantInt::get(XType, InputBits - 1);
+  Value *Sub = B.CreateSub(InputBitsM1, Ctlz);
+
+  // The table won't produce a sensible result for 0.
+  Value *Cmp = B.CreateICmpEQ(X, ConstantInt::get(XType, 0));
+  Value *Select = B.CreateSelect(Cmp, B.CreateZExt(ZeroTableElem, XType), Sub);
+
+  // The true branch of select handles the log2(0) case, which is rare.
+  if (!ProfcheckDisableMetadataFixes) {
+    if (Instruction *SelectI = dyn_cast<Instruction>(Select))
+      SelectI->setMetadata(
+          LLVMContext::MD_prof,
+          MDBuilder(SelectI->getContext()).createUnlikelyBranchWeights());
+  }
+
+  Value *ZExtOrTrunc = B.CreateZExtOrTrunc(Select, AccessType);
+
+  LI->replaceAllUsesWith(ZExtOrTrunc);
+
+  return true;
+}
+
 /// This is used by foldLoadsRecursive() to capture a Root Load node which is
 /// of type or(load, load) and recursively build the wide load. Also capture the
 /// shift amount, zero extend type and loadSize.
@@ -1828,6 +2027,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
       MadeChange |= tryToRecognizePopCount(I);
       MadeChange |= tryToFPToSat(I, TTI);
       MadeChange |= tryToRecognizeTableBasedCttz(I, DL);
+      MadeChange |= tryToRecognizeTableBasedLog2(I, DL, TTI);
       MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
       MadeChange |= foldPatternedLoads(I, DL);
       MadeChange |= foldICmpOrChain(I, DL, TTI, AA, DT);
diff --git a/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-basics.ll b/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-basics.ll
new file mode 100644
index 0000000000000..c163b9768408b
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-basics.ll
@@ -0,0 +1,141 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=aggressive-instcombine -mtriple=x86_64 -mattr=+lzcnt -S < %s | FileCheck %s
+
+;; These cases test lowering of various implementations of table-based log2
+;; algorithms to the llvm.ctlz instruction.
+
+;; C reproducers:
+;; int log2(unsigned v) {
+;;   static const unsigned char table[] = {
+;;     0,  9,  1, 10, 13, 21,  2, 29, 11, 14, 16, 18, 22, 25,  3, 30,
+;;     8, 12, 20, 28, 15, 17, 24,  7, 19, 27, 23,  6, 26,  5,  4, 31
+;;   };
+;;
+;;   v |= v >> 1;
+;;   v |= v >> 2;
+;;   v |= v >> 4;
+;;   v |= v >> 8;
+;;   v |= v >> 16;
+;;
+;;   return table[(unsigned)(v * 0x07C4ACDDU) >> 27];
+;; }
+;; int log2_64(unsigned long long v) {
+;;   static const unsigned char table[] = {
+;;      0, 47,  1, 56, 48, 27,  2, 60, 57, 49, 41, 37, 28, 16,  3, 61,
+;;     54, 58, 35, 52, 50, 42, 21, 44, 38, 32, 29, 23, 17, 11,  4, 62,
+;;     46, 55, 26, 59, 40, 36, 15, 53, 34, 51, 20, 43, 31, 22, 10, 45,
+;;     25, 39, 14, 33, 19, 30,  9, 24, 13, 18,  8, 12,  7,  6,  5, 63
+;;   };
+;;
+;;   v |= v >> 1;
+;;   v |= v >> 2;
+;;   v |= v >> 4;
+;;   v |= v >> 8;
+;;   v |= v >> 16;
+;;   v |= v >> 32;
+;;
+;;   return table[(v * 0x03F79D71B4CB0A89ULL) >> 58];
+;; }
+
+ at log2.table = internal unnamed_addr constant [32 x i8] c"\00\09\01\0A\0D\15\02\1D\0B\0E\10\12\16\19\03\1E\08\0C\14\1C\0F\11\18\07\13\1B\17\06\1A\05\04\1F", align 1
+
+define i32 @log2_32(i32 %v) !prof !0 {
+; CHECK-LABEL: @log2_32(
+; CHECK: !prof [[PROF_0:![0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[V:%.*]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 31, [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[V]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]], !prof [[PROF_1:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP4]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %shr = lshr i32 %v, 1
+  %or = or i32 %shr, %v
+  %shr1 = lshr i32 %or, 2
+  %or2 = or i32 %shr1, %or
+  %shr3 = lshr i32 %or2, 4
+  %or4 = or i32 %shr3, %or2
+  %shr5 = lshr i32 %or4, 8
+  %or6 = or i32 %shr5, %or4
+  %shr7 = lshr i32 %or6, 16
+  %or8 = or i32 %shr7, %or6
+  %mul = mul i32 %or8, 130329821
+  %shr9 = lshr i32 %mul, 27
+  %idxprom = zext nneg i32 %shr9 to i64
+  %arrayidx = getelementptr inbounds [32 x i8], ptr @log2.table, i64 0, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
+
+define i32 @log2_32_nusw(i32 %v) {
+; CHECK-LABEL: @log2_32_nusw(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[V:%.*]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 31, [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[V]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP4]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %shr = lshr i32 %v, 1
+  %or = or i32 %shr, %v
+  %shr1 = lshr i32 %or, 2
+  %or2 = or i32 %shr1, %or
+  %shr3 = lshr i32 %or2, 4
+  %or4 = or i32 %shr3, %or2
+  %shr5 = lshr i32 %or4, 8
+  %or6 = or i32 %shr5, %or4
+  %shr7 = lshr i32 %or6, 16
+  %or8 = or i32 %shr7, %or6
+  %mul = mul i32 %or8, 130329821
+  %shr9 = lshr i32 %mul, 27
+  %idxprom = zext nneg i32 %shr9 to i64
+  %arrayidx = getelementptr nusw [32 x i8], ptr @log2.table, i64 0, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
+
+ at log2_64.table = internal unnamed_addr constant [64 x i8] c"\00/\0180\1B\02<91)%\1C\10\03=6:#42*\15,& \1D\17\11\0B\04>.7\1A;($\0F5\223\14+\1F\16\0A-\19'\0E!\13\1E\09\18\0D\12\08\0C\07\06\05?", align 1
+
+define i32 @log2_64(i64 noundef %v) {
+; CHECK-LABEL: @log2_64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[V:%.*]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 63, [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[V]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i64 0, i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i8
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP4]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %shr = lshr i64 %v, 1
+  %or = or i64 %shr, %v
+  %shr1 = lshr i64 %or, 2
+  %or2 = or i64 %shr1, %or
+  %shr3 = lshr i64 %or2, 4
+  %or4 = or i64 %shr3, %or2
+  %shr5 = lshr i64 %or4, 8
+  %or6 = or i64 %shr5, %or4
+  %shr7 = lshr i64 %or6, 16
+  %or8 = or i64 %shr7, %or6
+  %shr9 = lshr i64 %or8, 32
+  %or10 = or i64 %shr9, %or8
+  %mul = mul i64 %or10, 285870213051386505
+  %shr11 = lshr i64 %mul, 58
+  %arrayidx = getelementptr inbounds i8, ptr @log2_64.table, i64 %shr11
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
+
+!0 = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF_0]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF_1]] = !{!"branch_weights", i32 1, i32 1048575}
diff --git a/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-negative.ll b/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-negative.ll
new file mode 100644
index 0000000000000..4968b01eceee1
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-negative.ll
@@ -0,0 +1,264 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=aggressive-instcombine -mtriple=x86_64 -mattr=+lzcnt -S < %s | FileCheck %s
+
+ at log2.table = internal unnamed_addr constant [32 x i8] c"\05\09\01\0A\0D\15\02\1D\0B\0E\10\12\16\19\03\1E\08\0C\14\1C\0F\11\18\07\13\1B\17\06\1A\05\04\1F", align 1
+
+;; This is a negative test with a wrong table constant.
+
+define i32 @log2_32(i32 %v) {
+; CHECK-LABEL: @log2_32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[V:%.*]], 1
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHR]], [[V]]
+; CHECK-NEXT:    [[SHR1:%.*]] = lshr i32 [[OR]], 2
+; CHECK-NEXT:    [[OR2:%.*]] = or i32 [[SHR1]], [[OR]]
+; CHECK-NEXT:    [[SHR3:%.*]] = lshr i32 [[OR2]], 4
+; CHECK-NEXT:    [[OR4:%.*]] = or i32 [[SHR3]], [[OR2]]
+; CHECK-NEXT:    [[SHR5:%.*]] = lshr i32 [[OR4]], 8
+; CHECK-NEXT:    [[OR6:%.*]] = or i32 [[SHR5]], [[OR4]]
+; CHECK-NEXT:    [[SHR7:%.*]] = lshr i32 [[OR6]], 16
+; CHECK-NEXT:    [[OR8:%.*]] = or i32 [[SHR7]], [[OR6]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[OR8]], 130329821
+; CHECK-NEXT:    [[SHR9:%.*]] = lshr i32 [[MUL]], 27
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext nneg i32 [[SHR9]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @log2.table, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP4]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %shr = lshr i32 %v, 1
+  %or = or i32 %shr, %v
+  %shr1 = lshr i32 %or, 2
+  %or2 = or i32 %shr1, %or
+  %shr3 = lshr i32 %or2, 4
+  %or4 = or i32 %shr3, %or2
+  %shr5 = lshr i32 %or4, 8
+  %or6 = or i32 %shr5, %or4
+  %shr7 = lshr i32 %or6, 16
+  %or8 = or i32 %shr7, %or6
+  %mul = mul i32 %or8, 130329821
+  %shr9 = lshr i32 %mul, 27
+  %idxprom = zext nneg i32 %shr9 to i64
+  %arrayidx = getelementptr inbounds [32 x i8], ptr @log2.table, i64 0, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
+
+ at log2_2.table = internal unnamed_addr constant [32 x i8] c"\00\09\01\0A\0D\15\02\1D\0B\0E\10\12\16\19\03\1E\08\0C\14\1C\0F\11\18\07\13\1B\17\06\1A\05\04\1F", align 1
+
+;; These are some negative tests with a wrong instruction sequences.
+
+define i32 @log2_32_2(i32 %v) {
+; CHECK-LABEL: @log2_32_2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[V:%.*]], 1
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHR]], [[V]]
+; CHECK-NEXT:    [[SHR1:%.*]] = lshr i32 [[OR]], 2
+; CHECK-NEXT:    [[OR2:%.*]] = or i32 [[SHR1]], [[OR]]
+; CHECK-NEXT:    [[SHR3:%.*]] = lshr i32 [[OR2]], 4
+; CHECK-NEXT:    [[OR4:%.*]] = or i32 [[SHR3]], [[OR2]]
+; CHECK-NEXT:    [[SHR5:%.*]] = lshr i32 [[OR4]], 8
+; CHECK-NEXT:    [[OR6:%.*]] = or i32 [[SHR5]], [[OR4]]
+; CHECK-NEXT:    [[SHR7:%.*]] = lshr i32 [[OR6]], 15
+; CHECK-NEXT:    [[OR8:%.*]] = or i32 [[SHR7]], [[OR6]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[OR8]], 130329821
+; CHECK-NEXT:    [[SHR9:%.*]] = lshr i32 [[MUL]], 27
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext nneg i32 [[SHR9]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @log2_2.table, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %shr = lshr i32 %v, 1
+  %or = or i32 %shr, %v
+  %shr1 = lshr i32 %or, 2
+  %or2 = or i32 %shr1, %or
+  %shr3 = lshr i32 %or2, 4
+  %or4 = or i32 %shr3, %or2
+  %shr5 = lshr i32 %or4, 8
+  %or6 = or i32 %shr5, %or4
+  %shr7 = lshr i32 %or6, 15
+  %or8 = or i32 %shr7, %or6
+  %mul = mul i32 %or8, 130329821
+  %shr9 = lshr i32 %mul, 27
+  %idxprom = zext nneg i32 %shr9 to i64
+  %arrayidx = getelementptr inbounds [32 x i8], ptr @log2_2.table, i64 0, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
+
+define i32 @log2_32_3(i32 %v) {
+; CHECK-LABEL: @log2_32_3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[V:%.*]], 1
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHR]], [[V]]
+; CHECK-NEXT:    [[SHR1:%.*]] = lshr i32 [[OR]], 2
+; CHECK-NEXT:    [[OR2:%.*]] = or i32 [[SHR1]], [[OR]]
+; CHECK-NEXT:    [[SHR3:%.*]] = lshr i32 [[OR2]], 4
+; CHECK-NEXT:    [[OR4:%.*]] = or i32 [[SHR3]], [[OR2]]
+; CHECK-NEXT:    [[SHR5:%.*]] = lshr i32 [[OR4]], 8
+; CHECK-NEXT:    [[OR6:%.*]] = or i32 [[SHR5]], [[OR4]]
+; CHECK-NEXT:    [[SHR7:%.*]] = lshr i32 [[OR6]], 16
+; CHECK-NEXT:    [[OR8:%.*]] = or i32 [[SHR7]], [[OR6]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[OR8]], 130329822
+; CHECK-NEXT:    [[SHR9:%.*]] = lshr i32 [[MUL]], 27
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext nneg i32 [[SHR9]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @log2_2.table, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %shr = lshr i32 %v, 1
+  %or = or i32 %shr, %v
+  %shr1 = lshr i32 %or, 2
+  %or2 = or i32 %shr1, %or
+  %shr3 = lshr i32 %or2, 4
+  %or4 = or i32 %shr3, %or2
+  %shr5 = lshr i32 %or4, 8
+  %or6 = or i32 %shr5, %or4
+  %shr7 = lshr i32 %or6, 16
+  %or8 = or i32 %shr7, %or6
+  %mul = mul i32 %or8, 130329822
+  %shr9 = lshr i32 %mul, 27
+  %idxprom = zext nneg i32 %shr9 to i64
+  %arrayidx = getelementptr inbounds [32 x i8], ptr @log2_2.table, i64 0, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
+
+define i32 @log2_32_4(i32 %v) {
+; CHECK-LABEL: @log2_32_4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[V:%.*]], 1
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHR]], [[V]]
+; CHECK-NEXT:    [[SHR1:%.*]] = lshr i32 [[OR]], 2
+; CHECK-NEXT:    [[OR2:%.*]] = or i32 [[SHR1]], [[OR]]
+; CHECK-NEXT:    [[SHR3:%.*]] = lshr i32 [[OR2]], 4
+; CHECK-NEXT:    [[OR4:%.*]] = or i32 [[SHR3]], [[OR2]]
+; CHECK-NEXT:    [[SHR5:%.*]] = lshr i32 [[OR4]], 8
+; CHECK-NEXT:    [[OR6:%.*]] = or i32 [[SHR5]], [[OR4]]
+; CHECK-NEXT:    [[SHR7:%.*]] = lshr i32 [[OR6]], 15
+; CHECK-NEXT:    [[OR8:%.*]] = or i32 [[SHR7]], [[OR6]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[OR8]], 130329821
+; CHECK-NEXT:    [[SHR9:%.*]] = lshr i32 [[MUL]], 26
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext nneg i32 [[SHR9]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @log2_2.table, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %shr = lshr i32 %v, 1
+  %or = or i32 %shr, %v
+  %shr1 = lshr i32 %or, 2
+  %or2 = or i32 %shr1, %or
+  %shr3 = lshr i32 %or2, 4
+  %or4 = or i32 %shr3, %or2
+  %shr5 = lshr i32 %or4, 8
+  %or6 = or i32 %shr5, %or4
+  %shr7 = lshr i32 %or6, 15
+  %or8 = or i32 %shr7, %or6
+  %mul = mul i32 %or8, 130329821
+  %shr9 = lshr i32 %mul, 26
+  %idxprom = zext nneg i32 %shr9 to i64
+  %arrayidx = getelementptr inbounds [32 x i8], ptr @log2_2.table, i64 0, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
+
+;; A test with an extern global variable representing the table.
+
+ at table = external global [32 x i8], align 1
+define i32 @log2_32_5(i32 %v) {
+; CHECK-LABEL: @log2_32_5(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[V:%.*]], 1
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHR]], [[V]]
+; CHECK-NEXT:    [[SHR1:%.*]] = lshr i32 [[OR]], 2
+; CHECK-NEXT:    [[OR2:%.*]] = or i32 [[SHR1]], [[OR]]
+; CHECK-NEXT:    [[SHR3:%.*]] = lshr i32 [[OR2]], 4
+; CHECK-NEXT:    [[OR4:%.*]] = or i32 [[SHR3]], [[OR2]]
+; CHECK-NEXT:    [[SHR5:%.*]] = lshr i32 [[OR4]], 8
+; CHECK-NEXT:    [[OR6:%.*]] = or i32 [[SHR5]], [[OR4]]
+; CHECK-NEXT:    [[SHR7:%.*]] = lshr i32 [[OR6]], 15
+; CHECK-NEXT:    [[OR8:%.*]] = or i32 [[SHR7]], [[OR6]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[OR8]], 130329821
+; CHECK-NEXT:    [[SHR9:%.*]] = lshr i32 [[MUL]], 26
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext nneg i32 [[SHR9]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @table, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %shr = lshr i32 %v, 1
+  %or = or i32 %shr, %v
+  %shr1 = lshr i32 %or, 2
+  %or2 = or i32 %shr1, %or
+  %shr3 = lshr i32 %or2, 4
+  %or4 = or i32 %shr3, %or2
+  %shr5 = lshr i32 %or4, 8
+  %or6 = or i32 %shr5, %or4
+  %shr7 = lshr i32 %or6, 15
+  %or8 = or i32 %shr7, %or6
+  %mul = mul i32 %or8, 130329821
+  %shr9 = lshr i32 %mul, 26
+  %idxprom = zext nneg i32 %shr9 to i64
+  %arrayidx = getelementptr inbounds [32 x i8], ptr @table, i64 0, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
+
+;; We want only constant tables to be considered as LOG2 ones.
+ at log2_3.table = global [32 x i8] c"\00\09\01\0A\0D\15\02\1D\0B\0E\10\12\16\19\03\1E\08\0C\14\1C\0F\11\18\07\13\1B\17\06\1A\05\04\1F", align 1
+
+;; These are some negative tests with a wrong instruction sequences.
+
+define i32 @log2_32_6(i32 %v) {
+; CHECK-LABEL: @log2_32_6(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[V:%.*]], 1
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHR]], [[V]]
+; CHECK-NEXT:    [[SHR1:%.*]] = lshr i32 [[OR]], 2
+; CHECK-NEXT:    [[OR2:%.*]] = or i32 [[SHR1]], [[OR]]
+; CHECK-NEXT:    [[SHR3:%.*]] = lshr i32 [[OR2]], 4
+; CHECK-NEXT:    [[OR4:%.*]] = or i32 [[SHR3]], [[OR2]]
+; CHECK-NEXT:    [[SHR5:%.*]] = lshr i32 [[OR4]], 8
+; CHECK-NEXT:    [[OR6:%.*]] = or i32 [[SHR5]], [[OR4]]
+; CHECK-NEXT:    [[SHR7:%.*]] = lshr i32 [[OR6]], 15
+; CHECK-NEXT:    [[OR8:%.*]] = or i32 [[SHR7]], [[OR6]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[OR8]], 130329821
+; CHECK-NEXT:    [[SHR9:%.*]] = lshr i32 [[MUL]], 27
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext nneg i32 [[SHR9]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @log2_3.table, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %shr = lshr i32 %v, 1
+  %or = or i32 %shr, %v
+  %shr1 = lshr i32 %or, 2
+  %or2 = or i32 %shr1, %or
+  %shr3 = lshr i32 %or2, 4
+  %or4 = or i32 %shr3, %or2
+  %shr5 = lshr i32 %or4, 8
+  %or6 = or i32 %shr5, %or4
+  %shr7 = lshr i32 %or6, 15
+  %or8 = or i32 %shr7, %or6
+  %mul = mul i32 %or8, 130329821
+  %shr9 = lshr i32 %mul, 27
+  %idxprom = zext nneg i32 %shr9 to i64
+  %arrayidx = getelementptr inbounds [32 x i8], ptr @log2_3.table, i64 0, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}

>From 493ffcd870ece5c40d2356026c74dd70ea27b980 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Wed, 11 Mar 2026 14:14:38 -0700
Subject: [PATCH 2/4] fixup! Address review comments

---
 .../AggressiveInstCombine.cpp                 | 28 ++++++-------------
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index aea8fc607ea6f..7b877a27d8848 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -651,8 +651,8 @@ static bool isLog2Table(Constant *Table, const APInt &Mul, const APInt &Shift,
 }
 
 // Try to recognize table-based log2 implementation.
-// E.g., an exmapel in C (for more cases please the llvm/tests):
-// int f(unsigned x) {
+// E.g., an example in C (for more cases please see the llvm/tests):
+// int f(unsigned v) {
 //    static const char table[32] =
 //    {0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
 //     8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31};
@@ -708,9 +708,9 @@ static bool isLog2Table(Constant *Table, const APInt &Mul, const APInt &Shift,
 // %mul = mul i64 %or10, 285870213051386505
 // %shr11 = lshr i64 %mul, 58
 // %arrayidx = getelementptr inbounds i8, ptr @table, i64 %shr11
-// %0 = load i8, ptr %arrayidx, align 1/
+// %0 = load i8, ptr %arrayidx, align 1
 //
-// All these can be lowered to @llvm.cttz.i32/64 intrinsics and a subtract.
+// All these can be lowered to @llvm.ctlz.i32/64 intrinsics and a subtract.
 static bool tryToRecognizeTableBasedLog2(Instruction &I, const DataLayout &DL,
                                          TargetTransformInfo &TTI) {
   LoadInst *LI = dyn_cast<LoadInst>(&I);
@@ -755,27 +755,15 @@ static bool tryToRecognizeTableBasedLog2(Instruction &I, const DataLayout &DL,
   if (*ShiftConst != InputBits - Log2_32(InputBits))
     return false;
 
-  if (InputBits >= 64) {
+  // Match the sequence of OR operations with right shifts by powers of 2.
+  for (unsigned ShiftAmt = InputBits / 2; ShiftAmt != 0; ShiftAmt /= 2) {
     Value *Y;
-    if (!match(X, m_c_Or(m_LShr(m_Value(Y), m_SpecificInt(32)), m_Deferred(Y))))
+    if (!match(X, m_c_Or(m_LShr(m_Value(Y), m_SpecificInt(ShiftAmt)),
+                         m_Deferred(Y))))
       return false;
     X = Y;
   }
 
-  Value *Y1, *Y2, *Y3, *Y4, *Y5;
-  if (!match(X,
-             m_c_Or(m_LShr(m_Value(Y1), m_SpecificInt(16)), m_Deferred(Y1))) ||
-      !match(Y1,
-             m_c_Or(m_LShr(m_Value(Y2), m_SpecificInt(8)), m_Deferred(Y2))) ||
-      !match(Y2,
-             m_c_Or(m_LShr(m_Value(Y3), m_SpecificInt(4)), m_Deferred(Y3))) ||
-      !match(Y3,
-             m_c_Or(m_LShr(m_Value(Y4), m_SpecificInt(2)), m_Deferred(Y4))) ||
-      !match(Y4, m_c_Or(m_LShr(m_Value(Y5), m_SpecificInt(1)), m_Deferred(Y5))))
-    return false;
-
-  X = Y5;
-
   if (!GEPScale.isIntN(InputBits) ||
       !isLog2Table(GVTable->getInitializer(), *MulConst, *ShiftConst,
                    AndCst ? *AndCst : APInt::getAllOnes(InputBits), AccessType,

>From 77d80f1b34c12e841c5d38f8ee1569fdc4b460d2 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Wed, 11 Mar 2026 14:37:38 -0700
Subject: [PATCH 3/4] fixup! Remove unused AndCst.

---
 .../AggressiveInstCombine.cpp                     | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 7b877a27d8848..03f56f1d4b6eb 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -630,12 +630,10 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I, const DataLayout &DL) {
 // Iterate over the elements from \p Table by trying to find/match all
 // the numbers from 0 to \p InputBits that should represent log2 results.
 static bool isLog2Table(Constant *Table, const APInt &Mul, const APInt &Shift,
-                        const APInt &AndMask, Type *AccessTy,
-                        unsigned InputBits, const APInt &GEPIdxFactor,
-                        const DataLayout &DL) {
+                        Type *AccessTy, unsigned InputBits,
+                        const APInt &GEPIdxFactor, const DataLayout &DL) {
   for (unsigned Idx = 0; Idx < InputBits; Idx++) {
-    APInt Index =
-        (APInt::getLowBitsSet(InputBits, Idx + 1) * Mul).lshr(Shift) & AndMask;
+    APInt Index = (APInt::getLowBitsSet(InputBits, Idx + 1) * Mul).lshr(Shift);
     ConstantInt *C = dyn_cast_or_null<ConstantInt>(
         ConstantFoldLoadFromConst(Table, AccessTy, Index * GEPIdxFactor, DL));
     if (!C || C->getValue() != Idx)
@@ -643,7 +641,7 @@ static bool isLog2Table(Constant *Table, const APInt &Mul, const APInt &Shift,
   }
 
   // Verify that an input of zero will select table index 0.
-  APInt ZeroIndex = Mul.lshr(Shift) & AndMask;
+  APInt ZeroIndex = Mul.lshr(Shift);
   if (!ZeroIndex.isZero())
     return false;
 
@@ -738,7 +736,7 @@ static bool tryToRecognizeTableBasedLog2(Instruction &I, const DataLayout &DL,
   auto [GepIdx, GEPScale] = VarOffsets.front();
 
   Value *X;
-  const APInt *MulConst, *ShiftConst, *AndCst = nullptr;
+  const APInt *MulConst, *ShiftConst;
   // Check that the gep variable index is (x * MulConst) >> ShiftConst.
   auto MatchInner =
       m_LShr(m_Mul(m_Value(X), m_APInt(MulConst)), m_APInt(ShiftConst));
@@ -766,8 +764,7 @@ static bool tryToRecognizeTableBasedLog2(Instruction &I, const DataLayout &DL,
 
   if (!GEPScale.isIntN(InputBits) ||
       !isLog2Table(GVTable->getInitializer(), *MulConst, *ShiftConst,
-                   AndCst ? *AndCst : APInt::getAllOnes(InputBits), AccessType,
-                   InputBits, GEPScale.zextOrTrunc(InputBits), DL))
+                   AccessType, InputBits, GEPScale.zextOrTrunc(InputBits), DL))
     return false;
 
   ConstantInt *ZeroTableElem = cast<ConstantInt>(

>From 4b94b5087e37bd5dc726cfa0961f32b807278fc3 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Thu, 12 Mar 2026 08:26:03 -0700
Subject: [PATCH 4/4] fixup! Add i16 support

---
 .../AggressiveInstCombine.cpp                 |  2 +-
 .../X86/lower-table-based-log2-basics.ll      | 33 +++++++++++++++++--
 2 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 03f56f1d4b6eb..43ec326e1d0b4 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -745,7 +745,7 @@ static bool tryToRecognizeTableBasedLog2(Instruction &I, const DataLayout &DL,
 
   unsigned InputBits = X->getType()->getScalarSizeInBits();
   // TODO: Support more sizes.
-  if (InputBits != 32 && InputBits != 64)
+  if (InputBits != 16 && InputBits != 32 && InputBits != 64)
     return false;
 
   // Verify shift amount.
diff --git a/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-basics.ll b/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-basics.ll
index c163b9768408b..de6225451b3bb 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-basics.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-basics.ll
@@ -77,7 +77,7 @@ define i32 @log2_32_nusw(i32 %v) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[V:%.*]], i1 true)
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 31, [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[V]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]], !prof [[PROF1]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
 ; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP4]] to i32
 ; CHECK-NEXT:    ret i32 [[CONV]]
@@ -110,7 +110,7 @@ define i32 @log2_64(i64 noundef %v) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[V:%.*]], i1 true)
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 63, [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[V]], 0
-; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i64 0, i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i64 0, i64 [[TMP1]], !prof [[PROF1]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i8
 ; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP4]] to i32
 ; CHECK-NEXT:    ret i32 [[CONV]]
@@ -136,6 +136,35 @@ entry:
   ret i32 %conv
 }
 
+ at log2_16.table = internal unnamed_addr constant [16 x i8] c"\00\07\01\0D\08\0A\02\0E\06\0C\09\05\0B\04\03\0F", align 1
+
+define i32 @log2_16(i16 noundef %0) {
+; CHECK-LABEL: @log2_16(
+; CHECK-NEXT:    [[TMP2:%.*]] = call i16 @llvm.ctlz.i16(i16 [[TMP0:%.*]], i1 true)
+; CHECK-NEXT:    [[TMP3:%.*]] = sub i16 15, [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i16 [[TMP0]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = select i1 [[TMP4]], i16 0, i16 [[TMP3]], !prof [[PROF1]]
+; CHECK-NEXT:    [[TMP14:%.*]] = trunc i16 [[TMP5]] to i8
+; CHECK-NEXT:    [[TMP15:%.*]] = zext i8 [[TMP14]] to i32
+; CHECK-NEXT:    ret i32 [[TMP15]]
+;
+  %2 = lshr i16 %0, 1
+  %3 = or i16 %2, %0
+  %4 = lshr i16 %3, 2
+  %5 = or i16 %4, %3
+  %6 = lshr i16 %5, 4
+  %7 = or i16 %6, %5
+  %8 = lshr i16 %7, 8
+  %9 = or i16 %8, %7
+  %10 = mul i16 %9, 3885
+  %11 = lshr i16 %10, 12
+  %12 = zext nneg i16 %11 to i64
+  %13 = getelementptr inbounds nuw i8, ptr @log2_16.table, i64 %12
+  %14 = load i8, ptr %13, align 1
+  %15 = zext i8 %14 to i32
+  ret i32 %15
+}
+
 !0 = !{!"function_entry_count", i64 1000}
 ; CHECK: [[PROF_0]] = !{!"function_entry_count", i64 1000}
 ; CHECK: [[PROF_1]] = !{!"branch_weights", i32 1, i32 1048575}



More information about the llvm-commits mailing list