[llvm] [AggressiveInstCombine] Recognize table based log2 and replace with ctlz+sub. (PR #185160)
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 12 08:26:29 PDT 2026
https://github.com/topperc updated https://github.com/llvm/llvm-project/pull/185160
>From 67a2c5d87fb4cfbbf3e616a72556452307c8085e Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Fri, 6 Mar 2026 21:13:42 -0800
Subject: [PATCH 1/4] [AggressiveInstCombine] Recognize table based log2 and
replace with ctlz+sub.
Recognize table based log2 implementations like
unsigned log2(unsigned v) {
static const unsigned char table[] = {
0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
};
v |= v >> 1;
v |= v >> 2;
v |= v >> 4;
v |= v >> 8;
v |= v >> 16;
return table[(unsigned)(v * 0x07C4ACDDU) >> 27];
}
and replaces with 31 - llvm.ctlz(v).
Similar for i64 log2. Other sizes can be supported with correct
multiply constant and table values, but I have not found examples yet.
This code is based on the existing tryToRecognizeTableBasedCttz.
It handles the same pattern as #177110, but does not match the
outer subtract from that patch. It is assumed that InstCombine or
other optimizations can combine (sub 31 (sub 31, ctlz V)) later.
I have limited this to targets that have a fast ctlz. The backend
does not yet have a table based lowering for ctlz so this reduces
the chance of regressions.
---
.../AggressiveInstCombine.cpp | 200 +++++++++++++
.../X86/lower-table-based-log2-basics.ll | 141 ++++++++++
.../X86/lower-table-based-log2-negative.ll | 264 ++++++++++++++++++
3 files changed, 605 insertions(+)
create mode 100644 llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-basics.ll
create mode 100644 llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-negative.ll
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 797fabde3f25c..aea8fc607ea6f 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -626,6 +626,205 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I, const DataLayout &DL) {
return true;
}
+// Check if this array of constants represents a log2 table.
+// Iterate over the elements from \p Table by trying to find/match all
+// the numbers from 0 to \p InputBits that should represent log2 results.
+static bool isLog2Table(Constant *Table, const APInt &Mul, const APInt &Shift,
+ const APInt &AndMask, Type *AccessTy,
+ unsigned InputBits, const APInt &GEPIdxFactor,
+ const DataLayout &DL) {
+ for (unsigned Idx = 0; Idx < InputBits; Idx++) {
+ APInt Index =
+ (APInt::getLowBitsSet(InputBits, Idx + 1) * Mul).lshr(Shift) & AndMask;
+ ConstantInt *C = dyn_cast_or_null<ConstantInt>(
+ ConstantFoldLoadFromConst(Table, AccessTy, Index * GEPIdxFactor, DL));
+ if (!C || C->getValue() != Idx)
+ return false;
+ }
+
+ // Verify that an input of zero will select table index 0.
+ APInt ZeroIndex = Mul.lshr(Shift) & AndMask;
+ if (!ZeroIndex.isZero())
+ return false;
+
+ return true;
+}
+
+// Try to recognize table-based log2 implementation.
+// E.g., an exmapel in C (for more cases please the llvm/tests):
+// int f(unsigned x) {
+// static const char table[32] =
+// {0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
+// 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31};
+//
+// v |= v >> 1; // first round down to one less than a power of 2
+// v |= v >> 2;
+// v |= v >> 4;
+// v |= v >> 8;
+// v |= v >> 16;
+//
+// return table[(unsigned)(v * 0x07C4ACDDU) >> 27];
+// }
+// this can be lowered to `ctlz` instruction.
+// There is also a special case when the element is 0.
+//
+// The >> and |= sequence sets all bits below the most significant set bit. The
+// multiply is a de-bruijn sequence that contains each pattern of bits in it.
+// The shift extracts the top bits after the multiply, and that index into the
+// table should represent the floor log base 2 of the original number.
+//
+// Here are some examples of LLVM IR for a 64-bit target.
+//
+// CASE 1:
+// %shr = lshr i32 %v, 1
+// %or = or i32 %shr, %v
+// %shr1 = lshr i32 %or, 2
+// %or2 = or i32 %shr1, %or
+// %shr3 = lshr i32 %or2, 4
+// %or4 = or i32 %shr3, %or2
+// %shr5 = lshr i32 %or4, 8
+// %or6 = or i32 %shr5, %or4
+// %shr7 = lshr i32 %or6, 16
+// %or8 = or i32 %shr7, %or6
+// %mul = mul i32 %or8, 130329821
+// %shr9 = lshr i32 %mul, 27
+// %idxprom = zext nneg i32 %shr9 to i64
+// %arrayidx = getelementptr inbounds i8, ptr @table, i64 %idxprom
+// %0 = load i8, ptr %arrayidx, align 1
+//
+// CASE 2:
+// %shr = lshr i64 %v, 1
+// %or = or i64 %shr, %v
+// %shr1 = lshr i64 %or, 2
+// %or2 = or i64 %shr1, %or
+// %shr3 = lshr i64 %or2, 4
+// %or4 = or i64 %shr3, %or2
+// %shr5 = lshr i64 %or4, 8
+// %or6 = or i64 %shr5, %or4
+// %shr7 = lshr i64 %or6, 16
+// %or8 = or i64 %shr7, %or6
+// %shr9 = lshr i64 %or8, 32
+// %or10 = or i64 %shr9, %or8
+// %mul = mul i64 %or10, 285870213051386505
+// %shr11 = lshr i64 %mul, 58
+// %arrayidx = getelementptr inbounds i8, ptr @table, i64 %shr11
+// %0 = load i8, ptr %arrayidx, align 1/
+//
+// All these can be lowered to @llvm.cttz.i32/64 intrinsics and a subtract.
+static bool tryToRecognizeTableBasedLog2(Instruction &I, const DataLayout &DL,
+ TargetTransformInfo &TTI) {
+ LoadInst *LI = dyn_cast<LoadInst>(&I);
+ if (!LI)
+ return false;
+
+ Type *AccessType = LI->getType();
+ if (!AccessType->isIntegerTy())
+ return false;
+
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getPointerOperand());
+ if (!GEP || !GEP->hasNoUnsignedSignedWrap())
+ return false;
+
+ GlobalVariable *GVTable = dyn_cast<GlobalVariable>(GEP->getPointerOperand());
+ if (!GVTable || !GVTable->hasInitializer() || !GVTable->isConstant())
+ return false;
+
+ unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
+ APInt ModOffset(BW, 0);
+ SmallMapVector<Value *, APInt, 4> VarOffsets;
+ if (!GEP->collectOffset(DL, BW, VarOffsets, ModOffset) ||
+ VarOffsets.size() != 1 || ModOffset != 0)
+ return false;
+ auto [GepIdx, GEPScale] = VarOffsets.front();
+
+ Value *X;
+ const APInt *MulConst, *ShiftConst, *AndCst = nullptr;
+ // Check that the gep variable index is (x * MulConst) >> ShiftConst.
+ auto MatchInner =
+ m_LShr(m_Mul(m_Value(X), m_APInt(MulConst)), m_APInt(ShiftConst));
+ if (!match(GepIdx, m_CastOrSelf(MatchInner)))
+ return false;
+
+ unsigned InputBits = X->getType()->getScalarSizeInBits();
+ // TODO: Support more sizes.
+ if (InputBits != 32 && InputBits != 64)
+ return false;
+
+ // Verify shift amount.
+ // TODO: Allow other shift amounts when we have proper test coverage.
+ if (*ShiftConst != InputBits - Log2_32(InputBits))
+ return false;
+
+ if (InputBits >= 64) {
+ Value *Y;
+ if (!match(X, m_c_Or(m_LShr(m_Value(Y), m_SpecificInt(32)), m_Deferred(Y))))
+ return false;
+ X = Y;
+ }
+
+ Value *Y1, *Y2, *Y3, *Y4, *Y5;
+ if (!match(X,
+ m_c_Or(m_LShr(m_Value(Y1), m_SpecificInt(16)), m_Deferred(Y1))) ||
+ !match(Y1,
+ m_c_Or(m_LShr(m_Value(Y2), m_SpecificInt(8)), m_Deferred(Y2))) ||
+ !match(Y2,
+ m_c_Or(m_LShr(m_Value(Y3), m_SpecificInt(4)), m_Deferred(Y3))) ||
+ !match(Y3,
+ m_c_Or(m_LShr(m_Value(Y4), m_SpecificInt(2)), m_Deferred(Y4))) ||
+ !match(Y4, m_c_Or(m_LShr(m_Value(Y5), m_SpecificInt(1)), m_Deferred(Y5))))
+ return false;
+
+ X = Y5;
+
+ if (!GEPScale.isIntN(InputBits) ||
+ !isLog2Table(GVTable->getInitializer(), *MulConst, *ShiftConst,
+ AndCst ? *AndCst : APInt::getAllOnes(InputBits), AccessType,
+ InputBits, GEPScale.zextOrTrunc(InputBits), DL))
+ return false;
+
+ ConstantInt *ZeroTableElem = cast<ConstantInt>(
+ ConstantFoldLoadFromConst(GVTable->getInitializer(), AccessType, DL));
+
+ // Use InputBits - 1 - ctlz(X) to compute log2(X).
+ IRBuilder<> B(LI);
+ ConstantInt *BoolConst = B.getTrue();
+ Type *XType = X->getType();
+
+  // Check that the backend has an efficient ctlz instruction.
+ // FIXME: Teach the backend to emit the original code when ctlz isn't
+ // supported like we do for cttz.
+ IntrinsicCostAttributes Attrs(
+ Intrinsic::ctlz, XType,
+ {PoisonValue::get(XType), /*is_zero_poison=*/BoolConst});
+ InstructionCost Cost =
+ TTI.getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
+ if (Cost > TargetTransformInfo::TCC_Basic)
+ return false;
+
+ Value *Ctlz = B.CreateIntrinsic(Intrinsic::ctlz, {XType}, {X, BoolConst});
+
+ Constant *InputBitsM1 = ConstantInt::get(XType, InputBits - 1);
+ Value *Sub = B.CreateSub(InputBitsM1, Ctlz);
+
+ // The table won't produce a sensible result for 0.
+ Value *Cmp = B.CreateICmpEQ(X, ConstantInt::get(XType, 0));
+ Value *Select = B.CreateSelect(Cmp, B.CreateZExt(ZeroTableElem, XType), Sub);
+
+ // The true branch of select handles the log2(0) case, which is rare.
+ if (!ProfcheckDisableMetadataFixes) {
+ if (Instruction *SelectI = dyn_cast<Instruction>(Select))
+ SelectI->setMetadata(
+ LLVMContext::MD_prof,
+ MDBuilder(SelectI->getContext()).createUnlikelyBranchWeights());
+ }
+
+ Value *ZExtOrTrunc = B.CreateZExtOrTrunc(Select, AccessType);
+
+ LI->replaceAllUsesWith(ZExtOrTrunc);
+
+ return true;
+}
+
/// This is used by foldLoadsRecursive() to capture a Root Load node which is
/// of type or(load, load) and recursively build the wide load. Also capture the
/// shift amount, zero extend type and loadSize.
@@ -1828,6 +2027,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
MadeChange |= tryToRecognizePopCount(I);
MadeChange |= tryToFPToSat(I, TTI);
MadeChange |= tryToRecognizeTableBasedCttz(I, DL);
+ MadeChange |= tryToRecognizeTableBasedLog2(I, DL, TTI);
MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
MadeChange |= foldPatternedLoads(I, DL);
MadeChange |= foldICmpOrChain(I, DL, TTI, AA, DT);
diff --git a/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-basics.ll b/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-basics.ll
new file mode 100644
index 0000000000000..c163b9768408b
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-basics.ll
@@ -0,0 +1,141 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=aggressive-instcombine -mtriple=x86_64 -mattr=+lzcnt -S < %s | FileCheck %s
+
+;; These cases test lowering of various implementations of table-based log2
+;; algorithms to the llvm.ctlz instruction.
+
+;; C reproducers:
+;; int log2(unsigned v) {
+;; static const unsigned char table[] = {
+;; 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
+;; 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
+;; };
+;;
+;; v |= v >> 1;
+;; v |= v >> 2;
+;; v |= v >> 4;
+;; v |= v >> 8;
+;; v |= v >> 16;
+;;
+;; return table[(unsigned)(v * 0x07C4ACDDU) >> 27];
+;; }
+;; int log2_64(unsigned long long v) {
+;; static const unsigned char table[] = {
+;; 0, 47, 1, 56, 48, 27, 2, 60, 57, 49, 41, 37, 28, 16, 3, 61,
+;; 54, 58, 35, 52, 50, 42, 21, 44, 38, 32, 29, 23, 17, 11, 4, 62,
+;; 46, 55, 26, 59, 40, 36, 15, 53, 34, 51, 20, 43, 31, 22, 10, 45,
+;; 25, 39, 14, 33, 19, 30, 9, 24, 13, 18, 8, 12, 7, 6, 5, 63
+;; };
+;;
+;; v |= v >> 1;
+;; v |= v >> 2;
+;; v |= v >> 4;
+;; v |= v >> 8;
+;; v |= v >> 16;
+;; v |= v >> 32;
+;;
+;; return table[(v * 0x03F79D71B4CB0A89ULL) >> 58];
+;; }
+
+ at log2.table = internal unnamed_addr constant [32 x i8] c"\00\09\01\0A\0D\15\02\1D\0B\0E\10\12\16\19\03\1E\08\0C\14\1C\0F\11\18\07\13\1B\17\06\1A\05\04\1F", align 1
+
+define i32 @log2_32(i32 %v) !prof !0 {
+; CHECK-LABEL: @log2_32(
+; CHECK: !prof [[PROF_0:![0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[V:%.*]], i1 true)
+; CHECK-NEXT: [[TMP1:%.*]] = sub i32 31, [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[V]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]], !prof [[PROF_1:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP4]] to i32
+; CHECK-NEXT: ret i32 [[CONV]]
+;
+entry:
+ %shr = lshr i32 %v, 1
+ %or = or i32 %shr, %v
+ %shr1 = lshr i32 %or, 2
+ %or2 = or i32 %shr1, %or
+ %shr3 = lshr i32 %or2, 4
+ %or4 = or i32 %shr3, %or2
+ %shr5 = lshr i32 %or4, 8
+ %or6 = or i32 %shr5, %or4
+ %shr7 = lshr i32 %or6, 16
+ %or8 = or i32 %shr7, %or6
+ %mul = mul i32 %or8, 130329821
+ %shr9 = lshr i32 %mul, 27
+ %idxprom = zext nneg i32 %shr9 to i64
+ %arrayidx = getelementptr inbounds [32 x i8], ptr @log2.table, i64 0, i64 %idxprom
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ ret i32 %conv
+}
+
+define i32 @log2_32_nusw(i32 %v) {
+; CHECK-LABEL: @log2_32_nusw(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[V:%.*]], i1 true)
+; CHECK-NEXT: [[TMP1:%.*]] = sub i32 31, [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[V]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP4]] to i32
+; CHECK-NEXT: ret i32 [[CONV]]
+;
+entry:
+ %shr = lshr i32 %v, 1
+ %or = or i32 %shr, %v
+ %shr1 = lshr i32 %or, 2
+ %or2 = or i32 %shr1, %or
+ %shr3 = lshr i32 %or2, 4
+ %or4 = or i32 %shr3, %or2
+ %shr5 = lshr i32 %or4, 8
+ %or6 = or i32 %shr5, %or4
+ %shr7 = lshr i32 %or6, 16
+ %or8 = or i32 %shr7, %or6
+ %mul = mul i32 %or8, 130329821
+ %shr9 = lshr i32 %mul, 27
+ %idxprom = zext nneg i32 %shr9 to i64
+ %arrayidx = getelementptr nusw [32 x i8], ptr @log2.table, i64 0, i64 %idxprom
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ ret i32 %conv
+}
+
+ at log2_64.table = internal unnamed_addr constant [64 x i8] c"\00/\0180\1B\02<91)%\1C\10\03=6:#42*\15,& \1D\17\11\0B\04>.7\1A;($\0F5\223\14+\1F\16\0A-\19'\0E!\13\1E\09\18\0D\12\08\0C\07\06\05?", align 1
+
+define i32 @log2_64(i64 noundef %v) {
+; CHECK-LABEL: @log2_64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[V:%.*]], i1 true)
+; CHECK-NEXT: [[TMP1:%.*]] = sub i64 63, [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[V]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 0, i64 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i8
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP4]] to i32
+; CHECK-NEXT: ret i32 [[CONV]]
+;
+entry:
+ %shr = lshr i64 %v, 1
+ %or = or i64 %shr, %v
+ %shr1 = lshr i64 %or, 2
+ %or2 = or i64 %shr1, %or
+ %shr3 = lshr i64 %or2, 4
+ %or4 = or i64 %shr3, %or2
+ %shr5 = lshr i64 %or4, 8
+ %or6 = or i64 %shr5, %or4
+ %shr7 = lshr i64 %or6, 16
+ %or8 = or i64 %shr7, %or6
+ %shr9 = lshr i64 %or8, 32
+ %or10 = or i64 %shr9, %or8
+ %mul = mul i64 %or10, 285870213051386505
+ %shr11 = lshr i64 %mul, 58
+ %arrayidx = getelementptr inbounds i8, ptr @log2_64.table, i64 %shr11
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ ret i32 %conv
+}
+
+!0 = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF_0]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF_1]] = !{!"branch_weights", i32 1, i32 1048575}
diff --git a/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-negative.ll b/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-negative.ll
new file mode 100644
index 0000000000000..4968b01eceee1
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-negative.ll
@@ -0,0 +1,264 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=aggressive-instcombine -mtriple=x86_64 -mattr=+lzcnt -S < %s | FileCheck %s
+
+ at log2.table = internal unnamed_addr constant [32 x i8] c"\05\09\01\0A\0D\15\02\1D\0B\0E\10\12\16\19\03\1E\08\0C\14\1C\0F\11\18\07\13\1B\17\06\1A\05\04\1F", align 1
+
+;; This is a negative test with a wrong table constant.
+
+define i32 @log2_32(i32 %v) {
+; CHECK-LABEL: @log2_32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[V:%.*]], 1
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHR]], [[V]]
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[OR]], 2
+; CHECK-NEXT: [[OR2:%.*]] = or i32 [[SHR1]], [[OR]]
+; CHECK-NEXT: [[SHR3:%.*]] = lshr i32 [[OR2]], 4
+; CHECK-NEXT: [[OR4:%.*]] = or i32 [[SHR3]], [[OR2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[OR4]], 8
+; CHECK-NEXT: [[OR6:%.*]] = or i32 [[SHR5]], [[OR4]]
+; CHECK-NEXT: [[SHR7:%.*]] = lshr i32 [[OR6]], 16
+; CHECK-NEXT: [[OR8:%.*]] = or i32 [[SHR7]], [[OR6]]
+; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[OR8]], 130329821
+; CHECK-NEXT: [[SHR9:%.*]] = lshr i32 [[MUL]], 27
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[SHR9]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @log2.table, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP4]] to i32
+; CHECK-NEXT: ret i32 [[CONV]]
+;
+entry:
+ %shr = lshr i32 %v, 1
+ %or = or i32 %shr, %v
+ %shr1 = lshr i32 %or, 2
+ %or2 = or i32 %shr1, %or
+ %shr3 = lshr i32 %or2, 4
+ %or4 = or i32 %shr3, %or2
+ %shr5 = lshr i32 %or4, 8
+ %or6 = or i32 %shr5, %or4
+ %shr7 = lshr i32 %or6, 16
+ %or8 = or i32 %shr7, %or6
+ %mul = mul i32 %or8, 130329821
+ %shr9 = lshr i32 %mul, 27
+ %idxprom = zext nneg i32 %shr9 to i64
+ %arrayidx = getelementptr inbounds [32 x i8], ptr @log2.table, i64 0, i64 %idxprom
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ ret i32 %conv
+}
+
+ at log2_2.table = internal unnamed_addr constant [32 x i8] c"\00\09\01\0A\0D\15\02\1D\0B\0E\10\12\16\19\03\1E\08\0C\14\1C\0F\11\18\07\13\1B\17\06\1A\05\04\1F", align 1
+
+;; These are some negative tests with wrong instruction sequences.
+
+define i32 @log2_32_2(i32 %v) {
+; CHECK-LABEL: @log2_32_2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[V:%.*]], 1
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHR]], [[V]]
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[OR]], 2
+; CHECK-NEXT: [[OR2:%.*]] = or i32 [[SHR1]], [[OR]]
+; CHECK-NEXT: [[SHR3:%.*]] = lshr i32 [[OR2]], 4
+; CHECK-NEXT: [[OR4:%.*]] = or i32 [[SHR3]], [[OR2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[OR4]], 8
+; CHECK-NEXT: [[OR6:%.*]] = or i32 [[SHR5]], [[OR4]]
+; CHECK-NEXT: [[SHR7:%.*]] = lshr i32 [[OR6]], 15
+; CHECK-NEXT: [[OR8:%.*]] = or i32 [[SHR7]], [[OR6]]
+; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[OR8]], 130329821
+; CHECK-NEXT: [[SHR9:%.*]] = lshr i32 [[MUL]], 27
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[SHR9]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @log2_2.table, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT: ret i32 [[CONV]]
+;
+entry:
+ %shr = lshr i32 %v, 1
+ %or = or i32 %shr, %v
+ %shr1 = lshr i32 %or, 2
+ %or2 = or i32 %shr1, %or
+ %shr3 = lshr i32 %or2, 4
+ %or4 = or i32 %shr3, %or2
+ %shr5 = lshr i32 %or4, 8
+ %or6 = or i32 %shr5, %or4
+ %shr7 = lshr i32 %or6, 15
+ %or8 = or i32 %shr7, %or6
+ %mul = mul i32 %or8, 130329821
+ %shr9 = lshr i32 %mul, 27
+ %idxprom = zext nneg i32 %shr9 to i64
+ %arrayidx = getelementptr inbounds [32 x i8], ptr @log2_2.table, i64 0, i64 %idxprom
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ ret i32 %conv
+}
+
+define i32 @log2_32_3(i32 %v) {
+; CHECK-LABEL: @log2_32_3(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[V:%.*]], 1
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHR]], [[V]]
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[OR]], 2
+; CHECK-NEXT: [[OR2:%.*]] = or i32 [[SHR1]], [[OR]]
+; CHECK-NEXT: [[SHR3:%.*]] = lshr i32 [[OR2]], 4
+; CHECK-NEXT: [[OR4:%.*]] = or i32 [[SHR3]], [[OR2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[OR4]], 8
+; CHECK-NEXT: [[OR6:%.*]] = or i32 [[SHR5]], [[OR4]]
+; CHECK-NEXT: [[SHR7:%.*]] = lshr i32 [[OR6]], 16
+; CHECK-NEXT: [[OR8:%.*]] = or i32 [[SHR7]], [[OR6]]
+; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[OR8]], 130329822
+; CHECK-NEXT: [[SHR9:%.*]] = lshr i32 [[MUL]], 27
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[SHR9]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @log2_2.table, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT: ret i32 [[CONV]]
+;
+entry:
+ %shr = lshr i32 %v, 1
+ %or = or i32 %shr, %v
+ %shr1 = lshr i32 %or, 2
+ %or2 = or i32 %shr1, %or
+ %shr3 = lshr i32 %or2, 4
+ %or4 = or i32 %shr3, %or2
+ %shr5 = lshr i32 %or4, 8
+ %or6 = or i32 %shr5, %or4
+ %shr7 = lshr i32 %or6, 16
+ %or8 = or i32 %shr7, %or6
+ %mul = mul i32 %or8, 130329822
+ %shr9 = lshr i32 %mul, 27
+ %idxprom = zext nneg i32 %shr9 to i64
+ %arrayidx = getelementptr inbounds [32 x i8], ptr @log2_2.table, i64 0, i64 %idxprom
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ ret i32 %conv
+}
+
+define i32 @log2_32_4(i32 %v) {
+; CHECK-LABEL: @log2_32_4(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[V:%.*]], 1
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHR]], [[V]]
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[OR]], 2
+; CHECK-NEXT: [[OR2:%.*]] = or i32 [[SHR1]], [[OR]]
+; CHECK-NEXT: [[SHR3:%.*]] = lshr i32 [[OR2]], 4
+; CHECK-NEXT: [[OR4:%.*]] = or i32 [[SHR3]], [[OR2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[OR4]], 8
+; CHECK-NEXT: [[OR6:%.*]] = or i32 [[SHR5]], [[OR4]]
+; CHECK-NEXT: [[SHR7:%.*]] = lshr i32 [[OR6]], 15
+; CHECK-NEXT: [[OR8:%.*]] = or i32 [[SHR7]], [[OR6]]
+; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[OR8]], 130329821
+; CHECK-NEXT: [[SHR9:%.*]] = lshr i32 [[MUL]], 26
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[SHR9]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @log2_2.table, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT: ret i32 [[CONV]]
+;
+entry:
+ %shr = lshr i32 %v, 1
+ %or = or i32 %shr, %v
+ %shr1 = lshr i32 %or, 2
+ %or2 = or i32 %shr1, %or
+ %shr3 = lshr i32 %or2, 4
+ %or4 = or i32 %shr3, %or2
+ %shr5 = lshr i32 %or4, 8
+ %or6 = or i32 %shr5, %or4
+ %shr7 = lshr i32 %or6, 15
+ %or8 = or i32 %shr7, %or6
+ %mul = mul i32 %or8, 130329821
+ %shr9 = lshr i32 %mul, 26
+ %idxprom = zext nneg i32 %shr9 to i64
+ %arrayidx = getelementptr inbounds [32 x i8], ptr @log2_2.table, i64 0, i64 %idxprom
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ ret i32 %conv
+}
+
+;; A test with an extern global variable representing the table.
+
+ at table = external global [32 x i8], align 1
+define i32 @log2_32_5(i32 %v) {
+; CHECK-LABEL: @log2_32_5(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[V:%.*]], 1
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHR]], [[V]]
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[OR]], 2
+; CHECK-NEXT: [[OR2:%.*]] = or i32 [[SHR1]], [[OR]]
+; CHECK-NEXT: [[SHR3:%.*]] = lshr i32 [[OR2]], 4
+; CHECK-NEXT: [[OR4:%.*]] = or i32 [[SHR3]], [[OR2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[OR4]], 8
+; CHECK-NEXT: [[OR6:%.*]] = or i32 [[SHR5]], [[OR4]]
+; CHECK-NEXT: [[SHR7:%.*]] = lshr i32 [[OR6]], 15
+; CHECK-NEXT: [[OR8:%.*]] = or i32 [[SHR7]], [[OR6]]
+; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[OR8]], 130329821
+; CHECK-NEXT: [[SHR9:%.*]] = lshr i32 [[MUL]], 26
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[SHR9]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @table, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT: ret i32 [[CONV]]
+;
+entry:
+ %shr = lshr i32 %v, 1
+ %or = or i32 %shr, %v
+ %shr1 = lshr i32 %or, 2
+ %or2 = or i32 %shr1, %or
+ %shr3 = lshr i32 %or2, 4
+ %or4 = or i32 %shr3, %or2
+ %shr5 = lshr i32 %or4, 8
+ %or6 = or i32 %shr5, %or4
+ %shr7 = lshr i32 %or6, 15
+ %or8 = or i32 %shr7, %or6
+ %mul = mul i32 %or8, 130329821
+ %shr9 = lshr i32 %mul, 26
+ %idxprom = zext nneg i32 %shr9 to i64
+ %arrayidx = getelementptr inbounds [32 x i8], ptr @table, i64 0, i64 %idxprom
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ ret i32 %conv
+}
+
+;; We want only constant tables to be considered as LOG2 ones.
+ at log2_3.table = global [32 x i8] c"\00\09\01\0A\0D\15\02\1D\0B\0E\10\12\16\19\03\1E\08\0C\14\1C\0F\11\18\07\13\1B\17\06\1A\05\04\1F", align 1
+
+;; These are some negative tests with wrong instruction sequences.
+
+define i32 @log2_32_6(i32 %v) {
+; CHECK-LABEL: @log2_32_6(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[V:%.*]], 1
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHR]], [[V]]
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[OR]], 2
+; CHECK-NEXT: [[OR2:%.*]] = or i32 [[SHR1]], [[OR]]
+; CHECK-NEXT: [[SHR3:%.*]] = lshr i32 [[OR2]], 4
+; CHECK-NEXT: [[OR4:%.*]] = or i32 [[SHR3]], [[OR2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[OR4]], 8
+; CHECK-NEXT: [[OR6:%.*]] = or i32 [[SHR5]], [[OR4]]
+; CHECK-NEXT: [[SHR7:%.*]] = lshr i32 [[OR6]], 15
+; CHECK-NEXT: [[OR8:%.*]] = or i32 [[SHR7]], [[OR6]]
+; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[OR8]], 130329821
+; CHECK-NEXT: [[SHR9:%.*]] = lshr i32 [[MUL]], 27
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[SHR9]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @log2_3.table, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT: ret i32 [[CONV]]
+;
+entry:
+ %shr = lshr i32 %v, 1
+ %or = or i32 %shr, %v
+ %shr1 = lshr i32 %or, 2
+ %or2 = or i32 %shr1, %or
+ %shr3 = lshr i32 %or2, 4
+ %or4 = or i32 %shr3, %or2
+ %shr5 = lshr i32 %or4, 8
+ %or6 = or i32 %shr5, %or4
+ %shr7 = lshr i32 %or6, 15
+ %or8 = or i32 %shr7, %or6
+ %mul = mul i32 %or8, 130329821
+ %shr9 = lshr i32 %mul, 27
+ %idxprom = zext nneg i32 %shr9 to i64
+ %arrayidx = getelementptr inbounds [32 x i8], ptr @log2_3.table, i64 0, i64 %idxprom
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ ret i32 %conv
+}
>From 493ffcd870ece5c40d2356026c74dd70ea27b980 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Wed, 11 Mar 2026 14:14:38 -0700
Subject: [PATCH 2/4] fixup! Address review comments
---
.../AggressiveInstCombine.cpp | 28 ++++++-------------
1 file changed, 8 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index aea8fc607ea6f..7b877a27d8848 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -651,8 +651,8 @@ static bool isLog2Table(Constant *Table, const APInt &Mul, const APInt &Shift,
}
// Try to recognize table-based log2 implementation.
-// E.g., an exmapel in C (for more cases please the llvm/tests):
-// int f(unsigned x) {
+// E.g., an example in C (for more cases please see the llvm/tests):
+// int f(unsigned v) {
// static const char table[32] =
// {0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
// 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31};
@@ -708,9 +708,9 @@ static bool isLog2Table(Constant *Table, const APInt &Mul, const APInt &Shift,
// %mul = mul i64 %or10, 285870213051386505
// %shr11 = lshr i64 %mul, 58
// %arrayidx = getelementptr inbounds i8, ptr @table, i64 %shr11
-// %0 = load i8, ptr %arrayidx, align 1/
+// %0 = load i8, ptr %arrayidx, align 1
//
-// All these can be lowered to @llvm.cttz.i32/64 intrinsics and a subtract.
+// All these can be lowered to @llvm.ctlz.i32/64 intrinsics and a subtract.
static bool tryToRecognizeTableBasedLog2(Instruction &I, const DataLayout &DL,
TargetTransformInfo &TTI) {
LoadInst *LI = dyn_cast<LoadInst>(&I);
@@ -755,27 +755,15 @@ static bool tryToRecognizeTableBasedLog2(Instruction &I, const DataLayout &DL,
if (*ShiftConst != InputBits - Log2_32(InputBits))
return false;
- if (InputBits >= 64) {
+ // Match the sequence of OR operations with right shifts by powers of 2.
+ for (unsigned ShiftAmt = InputBits / 2; ShiftAmt != 0; ShiftAmt /= 2) {
Value *Y;
- if (!match(X, m_c_Or(m_LShr(m_Value(Y), m_SpecificInt(32)), m_Deferred(Y))))
+ if (!match(X, m_c_Or(m_LShr(m_Value(Y), m_SpecificInt(ShiftAmt)),
+ m_Deferred(Y))))
return false;
X = Y;
}
- Value *Y1, *Y2, *Y3, *Y4, *Y5;
- if (!match(X,
- m_c_Or(m_LShr(m_Value(Y1), m_SpecificInt(16)), m_Deferred(Y1))) ||
- !match(Y1,
- m_c_Or(m_LShr(m_Value(Y2), m_SpecificInt(8)), m_Deferred(Y2))) ||
- !match(Y2,
- m_c_Or(m_LShr(m_Value(Y3), m_SpecificInt(4)), m_Deferred(Y3))) ||
- !match(Y3,
- m_c_Or(m_LShr(m_Value(Y4), m_SpecificInt(2)), m_Deferred(Y4))) ||
- !match(Y4, m_c_Or(m_LShr(m_Value(Y5), m_SpecificInt(1)), m_Deferred(Y5))))
- return false;
-
- X = Y5;
-
if (!GEPScale.isIntN(InputBits) ||
!isLog2Table(GVTable->getInitializer(), *MulConst, *ShiftConst,
AndCst ? *AndCst : APInt::getAllOnes(InputBits), AccessType,
>From 77d80f1b34c12e841c5d38f8ee1569fdc4b460d2 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Wed, 11 Mar 2026 14:37:38 -0700
Subject: [PATCH 3/4] fixup! Remove unused AndCst.
---
.../AggressiveInstCombine.cpp | 15 ++++++---------
1 file changed, 6 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 7b877a27d8848..03f56f1d4b6eb 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -630,12 +630,10 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I, const DataLayout &DL) {
// Iterate over the elements from \p Table by trying to find/match all
// the numbers from 0 to \p InputBits that should represent log2 results.
static bool isLog2Table(Constant *Table, const APInt &Mul, const APInt &Shift,
- const APInt &AndMask, Type *AccessTy,
- unsigned InputBits, const APInt &GEPIdxFactor,
- const DataLayout &DL) {
+ Type *AccessTy, unsigned InputBits,
+ const APInt &GEPIdxFactor, const DataLayout &DL) {
for (unsigned Idx = 0; Idx < InputBits; Idx++) {
- APInt Index =
- (APInt::getLowBitsSet(InputBits, Idx + 1) * Mul).lshr(Shift) & AndMask;
+ APInt Index = (APInt::getLowBitsSet(InputBits, Idx + 1) * Mul).lshr(Shift);
ConstantInt *C = dyn_cast_or_null<ConstantInt>(
ConstantFoldLoadFromConst(Table, AccessTy, Index * GEPIdxFactor, DL));
if (!C || C->getValue() != Idx)
@@ -643,7 +641,7 @@ static bool isLog2Table(Constant *Table, const APInt &Mul, const APInt &Shift,
}
// Verify that an input of zero will select table index 0.
- APInt ZeroIndex = Mul.lshr(Shift) & AndMask;
+ APInt ZeroIndex = Mul.lshr(Shift);
if (!ZeroIndex.isZero())
return false;
@@ -738,7 +736,7 @@ static bool tryToRecognizeTableBasedLog2(Instruction &I, const DataLayout &DL,
auto [GepIdx, GEPScale] = VarOffsets.front();
Value *X;
- const APInt *MulConst, *ShiftConst, *AndCst = nullptr;
+ const APInt *MulConst, *ShiftConst;
// Check that the gep variable index is (x * MulConst) >> ShiftConst.
auto MatchInner =
m_LShr(m_Mul(m_Value(X), m_APInt(MulConst)), m_APInt(ShiftConst));
@@ -766,8 +764,7 @@ static bool tryToRecognizeTableBasedLog2(Instruction &I, const DataLayout &DL,
if (!GEPScale.isIntN(InputBits) ||
!isLog2Table(GVTable->getInitializer(), *MulConst, *ShiftConst,
- AndCst ? *AndCst : APInt::getAllOnes(InputBits), AccessType,
- InputBits, GEPScale.zextOrTrunc(InputBits), DL))
+ AccessType, InputBits, GEPScale.zextOrTrunc(InputBits), DL))
return false;
ConstantInt *ZeroTableElem = cast<ConstantInt>(
>From 4b94b5087e37bd5dc726cfa0961f32b807278fc3 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Thu, 12 Mar 2026 08:26:03 -0700
Subject: [PATCH 4/4] fixup! Add i16 support
---
.../AggressiveInstCombine.cpp | 2 +-
.../X86/lower-table-based-log2-basics.ll | 33 +++++++++++++++++--
2 files changed, 32 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 03f56f1d4b6eb..43ec326e1d0b4 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -745,7 +745,7 @@ static bool tryToRecognizeTableBasedLog2(Instruction &I, const DataLayout &DL,
unsigned InputBits = X->getType()->getScalarSizeInBits();
// TODO: Support more sizes.
- if (InputBits != 32 && InputBits != 64)
+ if (InputBits != 16 && InputBits != 32 && InputBits != 64)
return false;
// Verify shift amount.
diff --git a/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-basics.ll b/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-basics.ll
index c163b9768408b..de6225451b3bb 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-basics.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-basics.ll
@@ -77,7 +77,7 @@ define i32 @log2_32_nusw(i32 %v) {
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[V:%.*]], i1 true)
; CHECK-NEXT: [[TMP1:%.*]] = sub i32 31, [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[V]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]], !prof [[PROF1]]
; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP4]] to i32
; CHECK-NEXT: ret i32 [[CONV]]
@@ -110,7 +110,7 @@ define i32 @log2_64(i64 noundef %v) {
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[V:%.*]], i1 true)
; CHECK-NEXT: [[TMP1:%.*]] = sub i64 63, [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[V]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 0, i64 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 0, i64 [[TMP1]], !prof [[PROF1]]
; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i8
; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP4]] to i32
; CHECK-NEXT: ret i32 [[CONV]]
@@ -136,6 +136,35 @@ entry:
ret i32 %conv
}
+ at log2_16.table = internal unnamed_addr constant [16 x i8] c"\00\07\01\0D\08\0A\02\0E\06\0C\09\05\0B\04\03\0F", align 1
+
+define i32 @log2_16(i16 noundef %0) {
+; CHECK-LABEL: @log2_16(
+; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.ctlz.i16(i16 [[TMP0:%.*]], i1 true)
+; CHECK-NEXT: [[TMP3:%.*]] = sub i16 15, [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i16 [[TMP0]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i16 0, i16 [[TMP3]], !prof [[PROF1]]
+; CHECK-NEXT: [[TMP14:%.*]] = trunc i16 [[TMP5]] to i8
+; CHECK-NEXT: [[TMP15:%.*]] = zext i8 [[TMP14]] to i32
+; CHECK-NEXT: ret i32 [[TMP15]]
+;
+ %2 = lshr i16 %0, 1
+ %3 = or i16 %2, %0
+ %4 = lshr i16 %3, 2
+ %5 = or i16 %4, %3
+ %6 = lshr i16 %5, 4
+ %7 = or i16 %6, %5
+ %8 = lshr i16 %7, 8
+ %9 = or i16 %8, %7
+ %10 = mul i16 %9, 3885
+ %11 = lshr i16 %10, 12
+ %12 = zext nneg i16 %11 to i64
+ %13 = getelementptr inbounds nuw i8, ptr @log2_16.table, i64 %12
+ %14 = load i8, ptr %13, align 1
+ %15 = zext i8 %14 to i32
+ ret i32 %15
+}
+
!0 = !{!"function_entry_count", i64 1000}
; CHECK: [[PROF_0]] = !{!"function_entry_count", i64 1000}
; CHECK: [[PROF_1]] = !{!"branch_weights", i32 1, i32 1048575}
More information about the llvm-commits
mailing list