[llvm] [AggressiveInstCombine] Recognize table based log2 and replace with ctlz+sub. (PR #185160)

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Fri Mar 6 23:29:35 PST 2026


https://github.com/topperc created https://github.com/llvm/llvm-project/pull/185160

Recognize table based log2 implementations like

unsigned log2(unsigned v) {
  static const unsigned char table[] = {
    0,  9,  1, 10, 13, 21,  2, 29, 11, 14, 16, 18, 22, 25,  3, 30,
    8, 12, 20, 28, 15, 17, 24,  7, 19, 27, 23,  6, 26,  5,  4, 31
  };

  v |= v >> 1;
  v |= v >> 2;
  v |= v >> 4;
  v |= v >> 8;
  v |= v >> 16;

  return table[(unsigned)(v * 0x07C4ACDDU) >> 27];
}

and replaces with 31 - llvm.ctlz(v).

Similar for i64 log2. Other sizes can be supported with correct multiply constant and table values, but I have not found examples yet.

This code is based on the existing tryToRecognizeTableBasedCttz. Like that function, we support
any combination of multiply constant and table values that produce the correct result.

It handles the same pattern as #177110, but does not match the outer subtract from that patch. It is assumed that InstCombine or other optimizations can combine (sub 31 (sub 31, ctlz V)) later.

I have limited this to targets that have a fast ctlz. The backend does not yet have a table based lowering for ctlz so this reduces the chance of regressions.

CC: @rohitaggarwal007

>From 67a2c5d87fb4cfbbf3e616a72556452307c8085e Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Fri, 6 Mar 2026 21:13:42 -0800
Subject: [PATCH] [AggressiveInstCombine] Recognize table based log2 and
 replace with ctlz+sub.

Recognize table based log2 implementations like

unsigned log2(unsigned v) {
  static const unsigned char table[] = {
    0,  9,  1, 10, 13, 21,  2, 29, 11, 14, 16, 18, 22, 25,  3, 30,
    8, 12, 20, 28, 15, 17, 24,  7, 19, 27, 23,  6, 26,  5,  4, 31
  };

  v |= v >> 1;
  v |= v >> 2;
  v |= v >> 4;
  v |= v >> 8;
  v |= v >> 16;

  return table[(unsigned)(v * 0x07C4ACDDU) >> 27];
}

and replaces with 31 - llvm.ctlz(v).

Similar for i64 log2. Other sizes can be supported with correct
multiply constant and table values, but I have not found examples yet.

This code is based on the existing tryToRecognizeTableBasedCttz.
It handles the same pattern as #177110, but does not match the
outer subtract from that patch. It is assumed that InstCombine or
other optimizations can combine (sub 31 (sub 31, ctlz V)) later.

I have limited this to targets that have a fast ctlz. The backend
does not yet have a table based lowering for ctlz so this reduces
the chance of regressions.
---
 .../AggressiveInstCombine.cpp                 | 200 +++++++++++++
 .../X86/lower-table-based-log2-basics.ll      | 141 ++++++++++
 .../X86/lower-table-based-log2-negative.ll    | 264 ++++++++++++++++++
 3 files changed, 605 insertions(+)
 create mode 100644 llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-basics.ll
 create mode 100644 llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-negative.ll

diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 797fabde3f25c..aea8fc607ea6f 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -626,6 +626,205 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I, const DataLayout &DL) {
   return true;
 }
 
+// Check if this array of constants represents a log2 table.
+// Iterate over the elements from \p Table by trying to find/match all
+// the numbers from 0 to \p InputBits - 1 that should represent log2 results.
+static bool isLog2Table(Constant *Table, const APInt &Mul, const APInt &Shift,
+                        const APInt &AndMask, Type *AccessTy,
+                        unsigned InputBits, const APInt &GEPIdxFactor,
+                        const DataLayout &DL) {
+  for (unsigned Idx = 0; Idx < InputBits; Idx++) {
+    APInt Index =
+        (APInt::getLowBitsSet(InputBits, Idx + 1) * Mul).lshr(Shift) & AndMask;
+    ConstantInt *C = dyn_cast_or_null<ConstantInt>(
+        ConstantFoldLoadFromConst(Table, AccessTy, Index * GEPIdxFactor, DL));
+    if (!C || C->getValue() != Idx)
+      return false;
+  }
+
+  // Verify that an input of zero will select table index 0.
+  APInt ZeroIndex = Mul.lshr(Shift) & AndMask;
+  if (!ZeroIndex.isZero())
+    return false;
+
+  return true;
+}
+
+// Try to recognize table-based log2 implementation.
+// E.g., an example in C (for more cases please see the llvm/tests):
+// int f(unsigned v) {
+//    static const char table[32] =
+//    {0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
+//     8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31};
+//
+//    v |= v >> 1; // first round down to one less than a power of 2
+//    v |= v >> 2;
+//    v |= v >> 4;
+//    v |= v >> 8;
+//    v |= v >> 16;
+//
+//    return table[(unsigned)(v * 0x07C4ACDDU) >> 27];
+// }
+// this can be lowered to the `ctlz` instruction.
+// There is also a special case when the element is 0.
+//
+// The >> and |= sequence sets all bits below the most significant set bit. The
+// multiply is a de-bruijn sequence that contains each pattern of bits in it.
+// The shift extracts the top bits after the multiply, and that index into the
+// table should represent the floor log base 2 of the original number.
+//
+// Here are some examples of LLVM IR for a 64-bit target.
+//
+// CASE 1:
+// %shr = lshr i32 %v, 1
+// %or = or i32 %shr, %v
+// %shr1 = lshr i32 %or, 2
+// %or2 = or i32 %shr1, %or
+// %shr3 = lshr i32 %or2, 4
+// %or4 = or i32 %shr3, %or2
+// %shr5 = lshr i32 %or4, 8
+// %or6 = or i32 %shr5, %or4
+// %shr7 = lshr i32 %or6, 16
+// %or8 = or i32 %shr7, %or6
+// %mul = mul i32 %or8, 130329821
+// %shr9 = lshr i32 %mul, 27
+// %idxprom = zext nneg i32 %shr9 to i64
+// %arrayidx = getelementptr inbounds i8, ptr @table, i64 %idxprom
+// %0 = load i8, ptr %arrayidx, align 1
+//
+// CASE 2:
+// %shr = lshr i64 %v, 1
+// %or = or i64 %shr, %v
+// %shr1 = lshr i64 %or, 2
+// %or2 = or i64 %shr1, %or
+// %shr3 = lshr i64 %or2, 4
+// %or4 = or i64 %shr3, %or2
+// %shr5 = lshr i64 %or4, 8
+// %or6 = or i64 %shr5, %or4
+// %shr7 = lshr i64 %or6, 16
+// %or8 = or i64 %shr7, %or6
+// %shr9 = lshr i64 %or8, 32
+// %or10 = or i64 %shr9, %or8
+// %mul = mul i64 %or10, 285870213051386505
+// %shr11 = lshr i64 %mul, 58
+// %arrayidx = getelementptr inbounds i8, ptr @table, i64 %shr11
+// %0 = load i8, ptr %arrayidx, align 1
+//
+// All these can be lowered to @llvm.ctlz.i32/64 intrinsics and a subtract.
+static bool tryToRecognizeTableBasedLog2(Instruction &I, const DataLayout &DL,
+                                         TargetTransformInfo &TTI) {
+  LoadInst *LI = dyn_cast<LoadInst>(&I);
+  if (!LI)
+    return false;
+
+  Type *AccessType = LI->getType();
+  if (!AccessType->isIntegerTy())
+    return false;
+
+  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getPointerOperand());
+  if (!GEP || !GEP->hasNoUnsignedSignedWrap())
+    return false;
+
+  GlobalVariable *GVTable = dyn_cast<GlobalVariable>(GEP->getPointerOperand());
+  if (!GVTable || !GVTable->hasInitializer() || !GVTable->isConstant())
+    return false;
+
+  unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
+  APInt ModOffset(BW, 0);
+  SmallMapVector<Value *, APInt, 4> VarOffsets;
+  if (!GEP->collectOffset(DL, BW, VarOffsets, ModOffset) ||
+      VarOffsets.size() != 1 || ModOffset != 0)
+    return false;
+  auto [GepIdx, GEPScale] = VarOffsets.front();
+
+  Value *X;
+  const APInt *MulConst, *ShiftConst, *AndCst = nullptr;
+  // Check that the gep variable index is (x * MulConst) >> ShiftConst.
+  auto MatchInner =
+      m_LShr(m_Mul(m_Value(X), m_APInt(MulConst)), m_APInt(ShiftConst));
+  if (!match(GepIdx, m_CastOrSelf(MatchInner)))
+    return false;
+
+  unsigned InputBits = X->getType()->getScalarSizeInBits();
+  // TODO: Support more sizes.
+  if (InputBits != 32 && InputBits != 64)
+    return false;
+
+  // Verify shift amount.
+  // TODO: Allow other shift amounts when we have proper test coverage.
+  if (*ShiftConst != InputBits - Log2_32(InputBits))
+    return false;
+
+  if (InputBits >= 64) {
+    Value *Y;
+    if (!match(X, m_c_Or(m_LShr(m_Value(Y), m_SpecificInt(32)), m_Deferred(Y))))
+      return false;
+    X = Y;
+  }
+
+  Value *Y1, *Y2, *Y3, *Y4, *Y5;
+  if (!match(X,
+             m_c_Or(m_LShr(m_Value(Y1), m_SpecificInt(16)), m_Deferred(Y1))) ||
+      !match(Y1,
+             m_c_Or(m_LShr(m_Value(Y2), m_SpecificInt(8)), m_Deferred(Y2))) ||
+      !match(Y2,
+             m_c_Or(m_LShr(m_Value(Y3), m_SpecificInt(4)), m_Deferred(Y3))) ||
+      !match(Y3,
+             m_c_Or(m_LShr(m_Value(Y4), m_SpecificInt(2)), m_Deferred(Y4))) ||
+      !match(Y4, m_c_Or(m_LShr(m_Value(Y5), m_SpecificInt(1)), m_Deferred(Y5))))
+    return false;
+
+  X = Y5;
+
+  if (!GEPScale.isIntN(InputBits) ||
+      !isLog2Table(GVTable->getInitializer(), *MulConst, *ShiftConst,
+                   AndCst ? *AndCst : APInt::getAllOnes(InputBits), AccessType,
+                   InputBits, GEPScale.zextOrTrunc(InputBits), DL))
+    return false;
+
+  ConstantInt *ZeroTableElem = cast<ConstantInt>(
+      ConstantFoldLoadFromConst(GVTable->getInitializer(), AccessType, DL));
+
+  // Use InputBits - 1 - ctlz(X) to compute log2(X).
+  IRBuilder<> B(LI);
+  ConstantInt *BoolConst = B.getTrue();
+  Type *XType = X->getType();
+
+  // Check that the backend has an efficient ctlz instruction.
+  // FIXME: Teach the backend to emit the original code when ctlz isn't
+  // supported like we do for cttz.
+  IntrinsicCostAttributes Attrs(
+      Intrinsic::ctlz, XType,
+      {PoisonValue::get(XType), /*is_zero_poison=*/BoolConst});
+  InstructionCost Cost =
+      TTI.getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
+  if (Cost > TargetTransformInfo::TCC_Basic)
+    return false;
+
+  Value *Ctlz = B.CreateIntrinsic(Intrinsic::ctlz, {XType}, {X, BoolConst});
+
+  Constant *InputBitsM1 = ConstantInt::get(XType, InputBits - 1);
+  Value *Sub = B.CreateSub(InputBitsM1, Ctlz);
+
+  // The table won't produce a sensible result for 0.
+  Value *Cmp = B.CreateICmpEQ(X, ConstantInt::get(XType, 0));
+  Value *Select = B.CreateSelect(Cmp, B.CreateZExt(ZeroTableElem, XType), Sub);
+
+  // The true branch of select handles the log2(0) case, which is rare.
+  if (!ProfcheckDisableMetadataFixes) {
+    if (Instruction *SelectI = dyn_cast<Instruction>(Select))
+      SelectI->setMetadata(
+          LLVMContext::MD_prof,
+          MDBuilder(SelectI->getContext()).createUnlikelyBranchWeights());
+  }
+
+  Value *ZExtOrTrunc = B.CreateZExtOrTrunc(Select, AccessType);
+
+  LI->replaceAllUsesWith(ZExtOrTrunc);
+
+  return true;
+}
+
 /// This is used by foldLoadsRecursive() to capture a Root Load node which is
 /// of type or(load, load) and recursively build the wide load. Also capture the
 /// shift amount, zero extend type and loadSize.
@@ -1828,6 +2027,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
       MadeChange |= tryToRecognizePopCount(I);
       MadeChange |= tryToFPToSat(I, TTI);
       MadeChange |= tryToRecognizeTableBasedCttz(I, DL);
+      MadeChange |= tryToRecognizeTableBasedLog2(I, DL, TTI);
       MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
       MadeChange |= foldPatternedLoads(I, DL);
       MadeChange |= foldICmpOrChain(I, DL, TTI, AA, DT);
diff --git a/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-basics.ll b/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-basics.ll
new file mode 100644
index 0000000000000..c163b9768408b
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-basics.ll
@@ -0,0 +1,141 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=aggressive-instcombine -mtriple=x86_64 -mattr=+lzcnt -S < %s | FileCheck %s
+
+;; These cases test lowering of various implementations of table-based log2
+;; algorithms to the llvm.ctlz instruction.
+
+;; C reproducers:
+;; int log2(unsigned v) {
+;;   static const unsigned char table[] = {
+;;     0,  9,  1, 10, 13, 21,  2, 29, 11, 14, 16, 18, 22, 25,  3, 30,
+;;     8, 12, 20, 28, 15, 17, 24,  7, 19, 27, 23,  6, 26,  5,  4, 31
+;;   };
+;;
+;;   v |= v >> 1;
+;;   v |= v >> 2;
+;;   v |= v >> 4;
+;;   v |= v >> 8;
+;;   v |= v >> 16;
+;;
+;;   return table[(unsigned)(v * 0x07C4ACDDU) >> 27];
+;; }
+;; int log2_64(unsigned long long v) {
+;;   static const unsigned char table[] = {
+;;      0, 47,  1, 56, 48, 27,  2, 60, 57, 49, 41, 37, 28, 16,  3, 61,
+;;     54, 58, 35, 52, 50, 42, 21, 44, 38, 32, 29, 23, 17, 11,  4, 62,
+;;     46, 55, 26, 59, 40, 36, 15, 53, 34, 51, 20, 43, 31, 22, 10, 45,
+;;     25, 39, 14, 33, 19, 30,  9, 24, 13, 18,  8, 12,  7,  6,  5, 63
+;;   };
+;;
+;;   v |= v >> 1;
+;;   v |= v >> 2;
+;;   v |= v >> 4;
+;;   v |= v >> 8;
+;;   v |= v >> 16;
+;;   v |= v >> 32;
+;;
+;;   return table[(v * 0x03F79D71B4CB0A89ULL) >> 58];
+;; }
+
+ at log2.table = internal unnamed_addr constant [32 x i8] c"\00\09\01\0A\0D\15\02\1D\0B\0E\10\12\16\19\03\1E\08\0C\14\1C\0F\11\18\07\13\1B\17\06\1A\05\04\1F", align 1
+
+define i32 @log2_32(i32 %v) !prof !0 {
+; CHECK-LABEL: @log2_32(
+; CHECK: !prof [[PROF_0:![0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[V:%.*]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 31, [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[V]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]], !prof [[PROF_1:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP4]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %shr = lshr i32 %v, 1
+  %or = or i32 %shr, %v
+  %shr1 = lshr i32 %or, 2
+  %or2 = or i32 %shr1, %or
+  %shr3 = lshr i32 %or2, 4
+  %or4 = or i32 %shr3, %or2
+  %shr5 = lshr i32 %or4, 8
+  %or6 = or i32 %shr5, %or4
+  %shr7 = lshr i32 %or6, 16
+  %or8 = or i32 %shr7, %or6
+  %mul = mul i32 %or8, 130329821
+  %shr9 = lshr i32 %mul, 27
+  %idxprom = zext nneg i32 %shr9 to i64
+  %arrayidx = getelementptr inbounds [32 x i8], ptr @log2.table, i64 0, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
+
+define i32 @log2_32_nusw(i32 %v) {
+; CHECK-LABEL: @log2_32_nusw(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[V:%.*]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 31, [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[V]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP4]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %shr = lshr i32 %v, 1
+  %or = or i32 %shr, %v
+  %shr1 = lshr i32 %or, 2
+  %or2 = or i32 %shr1, %or
+  %shr3 = lshr i32 %or2, 4
+  %or4 = or i32 %shr3, %or2
+  %shr5 = lshr i32 %or4, 8
+  %or6 = or i32 %shr5, %or4
+  %shr7 = lshr i32 %or6, 16
+  %or8 = or i32 %shr7, %or6
+  %mul = mul i32 %or8, 130329821
+  %shr9 = lshr i32 %mul, 27
+  %idxprom = zext nneg i32 %shr9 to i64
+  %arrayidx = getelementptr nusw [32 x i8], ptr @log2.table, i64 0, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
+
+ at log2_64.table = internal unnamed_addr constant [64 x i8] c"\00/\0180\1B\02<91)%\1C\10\03=6:#42*\15,& \1D\17\11\0B\04>.7\1A;($\0F5\223\14+\1F\16\0A-\19'\0E!\13\1E\09\18\0D\12\08\0C\07\06\05?", align 1
+
+define i32 @log2_64(i64 noundef %v) {
+; CHECK-LABEL: @log2_64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[V:%.*]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 63, [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[V]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = select i1 [[TMP2]], i64 0, i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i8
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP4]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %shr = lshr i64 %v, 1
+  %or = or i64 %shr, %v
+  %shr1 = lshr i64 %or, 2
+  %or2 = or i64 %shr1, %or
+  %shr3 = lshr i64 %or2, 4
+  %or4 = or i64 %shr3, %or2
+  %shr5 = lshr i64 %or4, 8
+  %or6 = or i64 %shr5, %or4
+  %shr7 = lshr i64 %or6, 16
+  %or8 = or i64 %shr7, %or6
+  %shr9 = lshr i64 %or8, 32
+  %or10 = or i64 %shr9, %or8
+  %mul = mul i64 %or10, 285870213051386505
+  %shr11 = lshr i64 %mul, 58
+  %arrayidx = getelementptr inbounds i8, ptr @log2_64.table, i64 %shr11
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
+
+!0 = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF_0]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF_1]] = !{!"branch_weights", i32 1, i32 1048575}
diff --git a/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-negative.ll b/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-negative.ll
new file mode 100644
index 0000000000000..4968b01eceee1
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-negative.ll
@@ -0,0 +1,264 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=aggressive-instcombine -mtriple=x86_64 -mattr=+lzcnt -S < %s | FileCheck %s
+
+ at log2.table = internal unnamed_addr constant [32 x i8] c"\05\09\01\0A\0D\15\02\1D\0B\0E\10\12\16\19\03\1E\08\0C\14\1C\0F\11\18\07\13\1B\17\06\1A\05\04\1F", align 1
+
+;; This is a negative test with a wrong table constant.
+
+define i32 @log2_32(i32 %v) {
+; CHECK-LABEL: @log2_32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[V:%.*]], 1
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHR]], [[V]]
+; CHECK-NEXT:    [[SHR1:%.*]] = lshr i32 [[OR]], 2
+; CHECK-NEXT:    [[OR2:%.*]] = or i32 [[SHR1]], [[OR]]
+; CHECK-NEXT:    [[SHR3:%.*]] = lshr i32 [[OR2]], 4
+; CHECK-NEXT:    [[OR4:%.*]] = or i32 [[SHR3]], [[OR2]]
+; CHECK-NEXT:    [[SHR5:%.*]] = lshr i32 [[OR4]], 8
+; CHECK-NEXT:    [[OR6:%.*]] = or i32 [[SHR5]], [[OR4]]
+; CHECK-NEXT:    [[SHR7:%.*]] = lshr i32 [[OR6]], 16
+; CHECK-NEXT:    [[OR8:%.*]] = or i32 [[SHR7]], [[OR6]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[OR8]], 130329821
+; CHECK-NEXT:    [[SHR9:%.*]] = lshr i32 [[MUL]], 27
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext nneg i32 [[SHR9]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @log2.table, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP4]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %shr = lshr i32 %v, 1
+  %or = or i32 %shr, %v
+  %shr1 = lshr i32 %or, 2
+  %or2 = or i32 %shr1, %or
+  %shr3 = lshr i32 %or2, 4
+  %or4 = or i32 %shr3, %or2
+  %shr5 = lshr i32 %or4, 8
+  %or6 = or i32 %shr5, %or4
+  %shr7 = lshr i32 %or6, 16
+  %or8 = or i32 %shr7, %or6
+  %mul = mul i32 %or8, 130329821
+  %shr9 = lshr i32 %mul, 27
+  %idxprom = zext nneg i32 %shr9 to i64
+  %arrayidx = getelementptr inbounds [32 x i8], ptr @log2.table, i64 0, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
+
+ at log2_2.table = internal unnamed_addr constant [32 x i8] c"\00\09\01\0A\0D\15\02\1D\0B\0E\10\12\16\19\03\1E\08\0C\14\1C\0F\11\18\07\13\1B\17\06\1A\05\04\1F", align 1
+
+;; These are some negative tests with a wrong instruction sequences.
+
+define i32 @log2_32_2(i32 %v) {
+; CHECK-LABEL: @log2_32_2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[V:%.*]], 1
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHR]], [[V]]
+; CHECK-NEXT:    [[SHR1:%.*]] = lshr i32 [[OR]], 2
+; CHECK-NEXT:    [[OR2:%.*]] = or i32 [[SHR1]], [[OR]]
+; CHECK-NEXT:    [[SHR3:%.*]] = lshr i32 [[OR2]], 4
+; CHECK-NEXT:    [[OR4:%.*]] = or i32 [[SHR3]], [[OR2]]
+; CHECK-NEXT:    [[SHR5:%.*]] = lshr i32 [[OR4]], 8
+; CHECK-NEXT:    [[OR6:%.*]] = or i32 [[SHR5]], [[OR4]]
+; CHECK-NEXT:    [[SHR7:%.*]] = lshr i32 [[OR6]], 15
+; CHECK-NEXT:    [[OR8:%.*]] = or i32 [[SHR7]], [[OR6]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[OR8]], 130329821
+; CHECK-NEXT:    [[SHR9:%.*]] = lshr i32 [[MUL]], 27
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext nneg i32 [[SHR9]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @log2_2.table, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %shr = lshr i32 %v, 1
+  %or = or i32 %shr, %v
+  %shr1 = lshr i32 %or, 2
+  %or2 = or i32 %shr1, %or
+  %shr3 = lshr i32 %or2, 4
+  %or4 = or i32 %shr3, %or2
+  %shr5 = lshr i32 %or4, 8
+  %or6 = or i32 %shr5, %or4
+  %shr7 = lshr i32 %or6, 15
+  %or8 = or i32 %shr7, %or6
+  %mul = mul i32 %or8, 130329821
+  %shr9 = lshr i32 %mul, 27
+  %idxprom = zext nneg i32 %shr9 to i64
+  %arrayidx = getelementptr inbounds [32 x i8], ptr @log2_2.table, i64 0, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
+
+define i32 @log2_32_3(i32 %v) {
+; CHECK-LABEL: @log2_32_3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[V:%.*]], 1
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHR]], [[V]]
+; CHECK-NEXT:    [[SHR1:%.*]] = lshr i32 [[OR]], 2
+; CHECK-NEXT:    [[OR2:%.*]] = or i32 [[SHR1]], [[OR]]
+; CHECK-NEXT:    [[SHR3:%.*]] = lshr i32 [[OR2]], 4
+; CHECK-NEXT:    [[OR4:%.*]] = or i32 [[SHR3]], [[OR2]]
+; CHECK-NEXT:    [[SHR5:%.*]] = lshr i32 [[OR4]], 8
+; CHECK-NEXT:    [[OR6:%.*]] = or i32 [[SHR5]], [[OR4]]
+; CHECK-NEXT:    [[SHR7:%.*]] = lshr i32 [[OR6]], 16
+; CHECK-NEXT:    [[OR8:%.*]] = or i32 [[SHR7]], [[OR6]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[OR8]], 130329822
+; CHECK-NEXT:    [[SHR9:%.*]] = lshr i32 [[MUL]], 27
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext nneg i32 [[SHR9]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @log2_2.table, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %shr = lshr i32 %v, 1
+  %or = or i32 %shr, %v
+  %shr1 = lshr i32 %or, 2
+  %or2 = or i32 %shr1, %or
+  %shr3 = lshr i32 %or2, 4
+  %or4 = or i32 %shr3, %or2
+  %shr5 = lshr i32 %or4, 8
+  %or6 = or i32 %shr5, %or4
+  %shr7 = lshr i32 %or6, 16
+  %or8 = or i32 %shr7, %or6
+  %mul = mul i32 %or8, 130329822
+  %shr9 = lshr i32 %mul, 27
+  %idxprom = zext nneg i32 %shr9 to i64
+  %arrayidx = getelementptr inbounds [32 x i8], ptr @log2_2.table, i64 0, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
+
+define i32 @log2_32_4(i32 %v) {
+; CHECK-LABEL: @log2_32_4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[V:%.*]], 1
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHR]], [[V]]
+; CHECK-NEXT:    [[SHR1:%.*]] = lshr i32 [[OR]], 2
+; CHECK-NEXT:    [[OR2:%.*]] = or i32 [[SHR1]], [[OR]]
+; CHECK-NEXT:    [[SHR3:%.*]] = lshr i32 [[OR2]], 4
+; CHECK-NEXT:    [[OR4:%.*]] = or i32 [[SHR3]], [[OR2]]
+; CHECK-NEXT:    [[SHR5:%.*]] = lshr i32 [[OR4]], 8
+; CHECK-NEXT:    [[OR6:%.*]] = or i32 [[SHR5]], [[OR4]]
+; CHECK-NEXT:    [[SHR7:%.*]] = lshr i32 [[OR6]], 15
+; CHECK-NEXT:    [[OR8:%.*]] = or i32 [[SHR7]], [[OR6]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[OR8]], 130329821
+; CHECK-NEXT:    [[SHR9:%.*]] = lshr i32 [[MUL]], 26
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext nneg i32 [[SHR9]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @log2_2.table, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %shr = lshr i32 %v, 1
+  %or = or i32 %shr, %v
+  %shr1 = lshr i32 %or, 2
+  %or2 = or i32 %shr1, %or
+  %shr3 = lshr i32 %or2, 4
+  %or4 = or i32 %shr3, %or2
+  %shr5 = lshr i32 %or4, 8
+  %or6 = or i32 %shr5, %or4
+  %shr7 = lshr i32 %or6, 15
+  %or8 = or i32 %shr7, %or6
+  %mul = mul i32 %or8, 130329821
+  %shr9 = lshr i32 %mul, 26
+  %idxprom = zext nneg i32 %shr9 to i64
+  %arrayidx = getelementptr inbounds [32 x i8], ptr @log2_2.table, i64 0, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
+
+;; A test with an extern global variable representing the table.
+
+ at table = external global [32 x i8], align 1
+define i32 @log2_32_5(i32 %v) {
+; CHECK-LABEL: @log2_32_5(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[V:%.*]], 1
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHR]], [[V]]
+; CHECK-NEXT:    [[SHR1:%.*]] = lshr i32 [[OR]], 2
+; CHECK-NEXT:    [[OR2:%.*]] = or i32 [[SHR1]], [[OR]]
+; CHECK-NEXT:    [[SHR3:%.*]] = lshr i32 [[OR2]], 4
+; CHECK-NEXT:    [[OR4:%.*]] = or i32 [[SHR3]], [[OR2]]
+; CHECK-NEXT:    [[SHR5:%.*]] = lshr i32 [[OR4]], 8
+; CHECK-NEXT:    [[OR6:%.*]] = or i32 [[SHR5]], [[OR4]]
+; CHECK-NEXT:    [[SHR7:%.*]] = lshr i32 [[OR6]], 15
+; CHECK-NEXT:    [[OR8:%.*]] = or i32 [[SHR7]], [[OR6]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[OR8]], 130329821
+; CHECK-NEXT:    [[SHR9:%.*]] = lshr i32 [[MUL]], 26
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext nneg i32 [[SHR9]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @table, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %shr = lshr i32 %v, 1
+  %or = or i32 %shr, %v
+  %shr1 = lshr i32 %or, 2
+  %or2 = or i32 %shr1, %or
+  %shr3 = lshr i32 %or2, 4
+  %or4 = or i32 %shr3, %or2
+  %shr5 = lshr i32 %or4, 8
+  %or6 = or i32 %shr5, %or4
+  %shr7 = lshr i32 %or6, 15
+  %or8 = or i32 %shr7, %or6
+  %mul = mul i32 %or8, 130329821
+  %shr9 = lshr i32 %mul, 26
+  %idxprom = zext nneg i32 %shr9 to i64
+  %arrayidx = getelementptr inbounds [32 x i8], ptr @table, i64 0, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
+
+;; We want only constant tables to be considered as LOG2 ones.
+ at log2_3.table = global [32 x i8] c"\00\09\01\0A\0D\15\02\1D\0B\0E\10\12\16\19\03\1E\08\0C\14\1C\0F\11\18\07\13\1B\17\06\1A\05\04\1F", align 1
+
+;; These are some negative tests with a wrong instruction sequences.
+
+define i32 @log2_32_6(i32 %v) {
+; CHECK-LABEL: @log2_32_6(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[V:%.*]], 1
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHR]], [[V]]
+; CHECK-NEXT:    [[SHR1:%.*]] = lshr i32 [[OR]], 2
+; CHECK-NEXT:    [[OR2:%.*]] = or i32 [[SHR1]], [[OR]]
+; CHECK-NEXT:    [[SHR3:%.*]] = lshr i32 [[OR2]], 4
+; CHECK-NEXT:    [[OR4:%.*]] = or i32 [[SHR3]], [[OR2]]
+; CHECK-NEXT:    [[SHR5:%.*]] = lshr i32 [[OR4]], 8
+; CHECK-NEXT:    [[OR6:%.*]] = or i32 [[SHR5]], [[OR4]]
+; CHECK-NEXT:    [[SHR7:%.*]] = lshr i32 [[OR6]], 15
+; CHECK-NEXT:    [[OR8:%.*]] = or i32 [[SHR7]], [[OR6]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[OR8]], 130329821
+; CHECK-NEXT:    [[SHR9:%.*]] = lshr i32 [[MUL]], 27
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext nneg i32 [[SHR9]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @log2_3.table, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %shr = lshr i32 %v, 1
+  %or = or i32 %shr, %v
+  %shr1 = lshr i32 %or, 2
+  %or2 = or i32 %shr1, %or
+  %shr3 = lshr i32 %or2, 4
+  %or4 = or i32 %shr3, %or2
+  %shr5 = lshr i32 %or4, 8
+  %or6 = or i32 %shr5, %or4
+  %shr7 = lshr i32 %or6, 15
+  %or8 = or i32 %shr7, %or6
+  %mul = mul i32 %or8, 130329821
+  %shr9 = lshr i32 %mul, 27
+  %idxprom = zext nneg i32 %shr9 to i64
+  %arrayidx = getelementptr inbounds [32 x i8], ptr @log2_3.table, i64 0, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}



More information about the llvm-commits mailing list