[llvm] [AggressiveInstCombine] Make cttz fold more resilient to non-array geps (PR #150896)

David Green via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 31 02:05:13 PDT 2025


https://github.com/davemgreen updated https://github.com/llvm/llvm-project/pull/150896

>From 762250f91441ef19b02426c12e36dad58586bdc5 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Mon, 28 Jul 2025 08:29:52 +0100
Subject: [PATCH 1/5] [AggressiveInstCombine] Make cttz fold more resilient to
 non-array geps

---
 .../AggressiveInstCombine.cpp                 | 32 +++++++------
 .../lower-table-based-cttz-basics.ll          | 47 +++++++++++++++++++
 .../PhaseOrdering/lower-table-based-cttz.ll   | 42 +++++++++++++++--
 3 files changed, 101 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 7af5ba4e0e103..975590214ffa6 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -547,14 +547,20 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I) {
     return false;
 
   GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getPointerOperand());
-  if (!GEP || !GEP->hasNoUnsignedSignedWrap() || GEP->getNumIndices() != 2)
+  if (!GEP || !GEP->hasNoUnsignedSignedWrap())
     return false;
 
-  if (!GEP->getSourceElementType()->isArrayTy())
-    return false;
-
-  uint64_t ArraySize = GEP->getSourceElementType()->getArrayNumElements();
-  if (ArraySize != 32 && ArraySize != 64)
+  Type *GEPSrcEltTy = GEP->getSourceElementType();
+  Value *GepIdx;
+  if (GEP->getNumIndices() == 2) {
+    if (!GEPSrcEltTy->isArrayTy() ||
+        !match(GEP->idx_begin()->get(), m_ZeroInt()))
+      return false;
+    GEPSrcEltTy = GEPSrcEltTy->getArrayElementType();
+    GepIdx = std::next(GEP->idx_begin())->get();
+  } else if (GEP->getNumIndices() == 1)
+    GepIdx = GEP->idx_begin()->get();
+  else
     return false;
 
   GlobalVariable *GVTable = dyn_cast<GlobalVariable>(GEP->getPointerOperand());
@@ -563,21 +569,17 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I) {
 
   ConstantDataArray *ConstData =
       dyn_cast<ConstantDataArray>(GVTable->getInitializer());
-  if (!ConstData)
-    return false;
-
-  if (!match(GEP->idx_begin()->get(), m_ZeroInt()))
+  if (!ConstData || ConstData->getElementType() != GEPSrcEltTy)
     return false;
 
-  Value *Idx2 = std::next(GEP->idx_begin())->get();
   Value *X1;
   uint64_t MulConst, ShiftConst;
   // FIXME: 64-bit targets have `i64` type for the GEP index, so this match will
   // probably fail for other (e.g. 32-bit) targets.
-  if (!match(Idx2, m_ZExtOrSelf(
-                       m_LShr(m_Mul(m_c_And(m_Neg(m_Value(X1)), m_Deferred(X1)),
-                                    m_ConstantInt(MulConst)),
-                              m_ConstantInt(ShiftConst)))))
+  if (!match(GepIdx, m_ZExtOrSelf(m_LShr(
+                         m_Mul(m_c_And(m_Neg(m_Value(X1)), m_Deferred(X1)),
+                               m_ConstantInt(MulConst)),
+                         m_ConstantInt(ShiftConst)))))
     return false;
 
   unsigned InputBits = X1->getType()->getScalarSizeInBits();
diff --git a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
index 4d571999df372..0bfa891a7887c 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
@@ -276,3 +276,50 @@ entry:
   %0 = load i32, ptr %arrayidx, align 4
   ret i32 %0
 }
+
+define i32 @ctz1_with_i8_gep(i32 %x) {
+; CHECK-LABEL: @ctz1_with_i8_gep(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP3]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %sub = sub i32 0, %x
+  %and = and i32 %sub, %x
+  %mul = mul i32 %and, 125613361
+  %shr = lshr i32 %mul, 27
+  %idxprom = zext i32 %shr to i64
+  %arrayidx = getelementptr inbounds i8, ptr @ctz7.table, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
+
+define i32 @ctz2_with_i8_gep(i32 %x) {
+; CHECK-LABEL: @ctz2_with_i8_gep(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 0, [[X:%.*]]
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[SUB]], [[X]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[AND]], 72416175
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[MUL]], 26
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[SHR]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [64 x i8], ptr @ctz2.table, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %sub = sub i32 0, %x
+  %and = and i32 %sub, %x
+  %mul = mul i32 %and, 72416175
+  %shr = lshr i32 %mul, 26
+  %idxprom = zext i32 %shr to i64
+  %arrayidx = getelementptr inbounds [64 x i8], ptr @ctz2.table, i64 0, i64 %idxprom
+  %0 = load i16, ptr %arrayidx, align 2
+  %conv = sext i16 %0 to i32
+  ret i32 %conv
+}
diff --git a/llvm/test/Transforms/PhaseOrdering/lower-table-based-cttz.ll b/llvm/test/Transforms/PhaseOrdering/lower-table-based-cttz.ll
index 19fbc1f1ae64e..4455016c3e4a4 100644
--- a/llvm/test/Transforms/PhaseOrdering/lower-table-based-cttz.ll
+++ b/llvm/test/Transforms/PhaseOrdering/lower-table-based-cttz.ll
@@ -1,3 +1,6 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -O3 -S < %s | FileCheck %s
+
 ;; This tests lowering of the implementations of table-based ctz
 ;; algorithm to the llvm.cttz instruction in the -O3 case.
 
@@ -13,13 +16,17 @@
 ;; }
 ;; Compiled as: clang -O3 test.c -S -emit-llvm -Xclang -disable-llvm-optzns
 
-; RUN: opt -O3 -S < %s | FileCheck %s
-
-; CHECK: call range(i32 0, 33) i32 @llvm.cttz.i32
-
 @ctz1.table = internal constant [32 x i8] c"\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09", align 16
 
-define i32 @ctz1(i32 noundef %x) {
+define i32 @ctz(i32 noundef %x) {
+; CHECK-LABEL: define range(i32 0, 32) i32 @ctz(
+; CHECK-SAME: i32 noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 [[X]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    [[CONV:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]]
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
 entry:
   %x.addr = alloca i32, align 4
   store i32 %x, ptr %x.addr, align 4
@@ -35,3 +42,28 @@ entry:
   %conv = sext i8 %2 to i32
   ret i32 %conv
 }
+
+define i32 @ctz_nonarraygep(i32 noundef %x) {
+; CHECK-LABEL: define range(i32 0, 32) i32 @ctz_nonarraygep(
+; CHECK-SAME: i32 noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 [[X]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    [[CONV:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]]
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %x.addr = alloca i32, align 4
+  store i32 %x, ptr %x.addr, align 4
+  %0 = load i32, ptr %x.addr, align 4
+  %1 = load i32, ptr %x.addr, align 4
+  %sub = sub i32 0, %1
+  %and = and i32 %0, %sub
+  %mul = mul i32 %and, 125613361
+  %shr = lshr i32 %mul, 27
+  %idxprom = zext i32 %shr to i64
+  %arrayidx = getelementptr inbounds i8, ptr @ctz1.table, i64 %idxprom
+  %2 = load i8, ptr %arrayidx, align 1
+  %conv = sext i8 %2 to i32
+  ret i32 %conv
+}
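
For reference, the source-level idiom this fold recognizes looks roughly like the sketch below: a minimal, illustrative version built from the 32-entry de Bruijn table and multiplier used in the tests above (the function name and the small main() are made up). Patch 1 itself only relaxes the GEP-shape check, so that both the two-index array form and the canonical single-index i8 form of this table access are accepted.

#include <cassert>
#include <cstdint>

// 32-entry de Bruijn cttz table, same contents as @ctz1.table / @ctz7.table.
static const uint8_t CtzTable[32] = {
    0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
    31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9};

// (x & -x) isolates the lowest set bit; multiplying by the de Bruijn constant
// places a unique 5-bit pattern in the top bits, which the shift extracts as
// the table index. For x == 0 this returns CtzTable[0] == 0, which is why the
// fold emits a select on x == 0.
unsigned tableCttz32(uint32_t x) {
  return CtzTable[((x & -x) * 0x077CB531u) >> 27]; // 0x077CB531 == 125613361
}

int main() {
  assert(tableCttz32(8) == 3);
  assert(tableCttz32(0x80000000u) == 31);
  return 0;
}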

>From 12de7e7cf4b1c0cf39389edda132794bb68d041d Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Mon, 28 Jul 2025 21:33:16 +0100
Subject: [PATCH 2/5] Use ConstantFoldLoadFromConst, and get i16 tables working
 as a result

---
 .../AggressiveInstCombine.cpp                 | 58 +++++--------
 .../lower-table-based-cttz-basics.ll          | 82 ++++++++++++++++++-
 2 files changed, 102 insertions(+), 38 deletions(-)

diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 975590214ffa6..f2986d9734528 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -457,30 +457,20 @@ static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI,
 
 // Check if this array of constants represents a cttz table.
 // Iterate over the elements from \p Table by trying to find/match all
-// the numbers from 0 to \p InputBits that should represent cttz results.
-static bool isCTTZTable(const ConstantDataArray &Table, uint64_t Mul,
-                        uint64_t Shift, uint64_t InputBits) {
-  unsigned Length = Table.getNumElements();
-  if (Length < InputBits || Length > InputBits * 2)
-    return false;
-
-  APInt Mask = APInt::getBitsSetFrom(InputBits, Shift);
-  unsigned Matched = 0;
-
-  for (unsigned i = 0; i < Length; i++) {
-    uint64_t Element = Table.getElementAsInteger(i);
-    if (Element >= InputBits)
-      continue;
-
-    // Check if \p Element matches a concrete answer. It could fail for some
-    // elements that are never accessed, so we keep iterating over each element
-    // from the table. The number of matched elements should be equal to the
-    // number of potential right answers which is \p InputBits actually.
-    if ((((Mul << Element) & Mask.getZExtValue()) >> Shift) == i)
-      Matched++;
+// the numbers from 0 to \p InputTy->getSizeInBits() that should represent cttz
+// results.
+static bool isCTTZTable(Constant *Table, uint64_t Mul, uint64_t Shift,
+                        Type *AccessTy, unsigned InputBits,
+                        unsigned GEPIdxFactor, const DataLayout &DL) {
+  for (unsigned Idx = 0; Idx < InputBits; Idx++) {
+    APInt Index = (APInt(InputBits, 1ull << Idx) * Mul).lshr(Shift);
+    ConstantInt *C = dyn_cast_or_null<ConstantInt>(
+        ConstantFoldLoadFromConst(Table, AccessTy, Index * GEPIdxFactor, DL));
+    if (!C || C->getZExtValue() != Idx)
+      return false;
   }
 
-  return Matched == InputBits;
+  return true;
 }
 
 // Try to recognize table-based ctz implementation.
@@ -537,7 +527,7 @@ static bool isCTTZTable(const ConstantDataArray &Table, uint64_t Mul,
 // %0 = load i8, i8* %arrayidx, align 1, !tbaa !8
 //
 // All this can be lowered to @llvm.cttz.i32/64 intrinsic.
-static bool tryToRecognizeTableBasedCttz(Instruction &I) {
+static bool tryToRecognizeTableBasedCttz(Instruction &I, const DataLayout &DL) {
   LoadInst *LI = dyn_cast<LoadInst>(&I);
   if (!LI)
     return false;
@@ -567,11 +557,6 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I) {
   if (!GVTable || !GVTable->hasInitializer() || !GVTable->isConstant())
     return false;
 
-  ConstantDataArray *ConstData =
-      dyn_cast<ConstantDataArray>(GVTable->getInitializer());
-  if (!ConstData || ConstData->getElementType() != GEPSrcEltTy)
-    return false;
-
   Value *X1;
   uint64_t MulConst, ShiftConst;
   // FIXME: 64-bit targets have `i64` type for the GEP index, so this match will
@@ -583,19 +568,21 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I) {
     return false;
 
   unsigned InputBits = X1->getType()->getScalarSizeInBits();
-  if (InputBits != 32 && InputBits != 64)
+  if (InputBits != 16 && InputBits != 32 && InputBits != 64)
     return false;
 
-  // Shift should extract top 5..7 bits.
+  // Shift should extract top 4..7 bits.
   if (InputBits - Log2_32(InputBits) != ShiftConst &&
       InputBits - Log2_32(InputBits) - 1 != ShiftConst)
     return false;
 
-  if (!isCTTZTable(*ConstData, MulConst, ShiftConst, InputBits))
+  if (!isCTTZTable(GVTable->getInitializer(), MulConst, ShiftConst, AccessType,
+                   InputBits, GEPSrcEltTy->getScalarSizeInBits() / 8, DL))
     return false;
 
-  auto ZeroTableElem = ConstData->getElementAsInteger(0);
-  bool DefinedForZero = ZeroTableElem == InputBits;
+  ConstantInt *ZeroTableElem = cast<ConstantInt>(
+      ConstantFoldLoadFromConst(GVTable->getInitializer(), AccessType, DL));
+  bool DefinedForZero = ZeroTableElem->getZExtValue() == InputBits;
 
   IRBuilder<> B(LI);
   ConstantInt *BoolConst = B.getInt1(!DefinedForZero);
@@ -609,8 +596,7 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I) {
     // If the value in elem 0 isn't the same as InputBits, we still want to
     // produce the value from the table.
     auto Cmp = B.CreateICmpEQ(X1, ConstantInt::get(XType, 0));
-    auto Select =
-        B.CreateSelect(Cmp, ConstantInt::get(XType, ZeroTableElem), Cttz);
+    auto Select = B.CreateSelect(Cmp, B.CreateZExt(ZeroTableElem, XType), Cttz);
 
     // NOTE: If the table[0] is 0, but the cttz(0) is defined by the Target
     // it should be handled as: `cttz(x) & (typeSize - 1)`.
@@ -1479,7 +1465,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
       MadeChange |= foldGuardedFunnelShift(I, DT);
       MadeChange |= tryToRecognizePopCount(I);
       MadeChange |= tryToFPToSat(I, TTI);
-      MadeChange |= tryToRecognizeTableBasedCttz(I);
+      MadeChange |= tryToRecognizeTableBasedCttz(I, DL);
       MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
       MadeChange |= foldPatternedLoads(I, DL);
       MadeChange |= foldICmpOrChain(I, DL, TTI, AA, DT);
diff --git a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
index 0bfa891a7887c..5732e146cd443 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
@@ -299,6 +299,7 @@ entry:
   ret i32 %conv
 }
 
+; This is the same as ctz2 (an i16 table) but with an i8 gep, making the indices invalid.
 define i32 @ctz2_with_i8_gep(i32 %x) {
 ; CHECK-LABEL: @ctz2_with_i8_gep(
 ; CHECK-NEXT:  entry:
@@ -308,7 +309,7 @@ define i32 @ctz2_with_i8_gep(i32 %x) {
 ; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[MUL]], 26
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[SHR]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [64 x i8], ptr @ctz2.table, i64 0, i64 [[IDXPROM]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
 ; CHECK-NEXT:    ret i32 [[CONV]]
 ;
@@ -319,7 +320,84 @@ entry:
   %shr = lshr i32 %mul, 26
   %idxprom = zext i32 %shr to i64
   %arrayidx = getelementptr inbounds [64 x i8], ptr @ctz2.table, i64 0, i64 %idxprom
-  %0 = load i16, ptr %arrayidx, align 2
+  %0 = load i16, ptr %arrayidx, align 1
   %conv = sext i16 %0 to i32
   ret i32 %conv
 }
+
+; This is the same as ctz2_with_i8_gep but with the gep index multiplied by 2.
+define i32 @ctz2_with_i8_gep_fixed(i32 %x) {
+; CHECK-LABEL: @ctz2_with_i8_gep_fixed(
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 0, [[X:%.*]]
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], [[SUB]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[AND]], 72416175
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[MUL]], 25
+; CHECK-NEXT:    [[SHR2:%.*]] = and i32 [[SHR]], 126
+; CHECK-NEXT:    [[TMP1:%.*]] = zext nneg i32 [[SHR2]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr @ctz2.table, i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP2]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+  %sub = sub i32 0, %x
+  %and = and i32 %x, %sub
+  %mul = mul i32 %and, 72416175
+  %shr = lshr i32 %mul, 25
+  %shr2 = and i32 %shr, 126
+  %1 = zext nneg i32 %shr2 to i64
+  %arrayidx = getelementptr inbounds nuw i8, ptr @ctz2.table, i64 %1
+  %2 = load i16, ptr %arrayidx, align 2
+  %conv = sext i16 %2 to i32
+  ret i32 %conv
+}
+
+; This is a i16 input with the debruijn table stored in a single i128.
+ at tablei128 = internal unnamed_addr constant i128 16018378897745984667142067713738932480, align 16
+define i32 @cttz_i16_via_i128(i16 noundef %x) {
+; CHECK-LABEL: @cttz_i16_via_i128(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i16 @llvm.cttz.i16(i16 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i16 [[X]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP3]], i16 0, i16 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i16 [[TMP2]] to i8
+; CHECK-NEXT:    [[CONV6:%.*]] = zext i8 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[CONV6]]
+;
+entry:
+  %sub = sub i16 0, %x
+  %and = and i16 %x, %sub
+  %mul = mul i16 %and, 2479
+  %0 = lshr i16 %mul, 12
+  %idxprom = zext nneg i16 %0 to i64
+  %arrayidx = getelementptr inbounds nuw i8, ptr @tablei128, i64 %idxprom
+  %1 = load i8, ptr %arrayidx, align 1
+  %conv6 = zext i8 %1 to i32
+  ret i32 %conv6
+}
+
+; Same as above but the table is a little off
+ at tablei128b = internal unnamed_addr constant i128 16018378897745984667142068813250560256, align 16
+define i32 @cttz_i16_via_i128_incorrecttable(i16 noundef %x) {
+; CHECK-LABEL: @cttz_i16_via_i128_incorrecttable(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SUB:%.*]] = sub i16 0, [[X:%.*]]
+; CHECK-NEXT:    [[AND:%.*]] = and i16 [[X]], [[SUB]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i16 [[AND]], 2479
+; CHECK-NEXT:    [[TMP0:%.*]] = lshr i16 [[MUL]], 12
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext nneg i16 [[TMP0]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr @tablei128b, i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV6:%.*]] = zext i8 [[TMP3]] to i32
+; CHECK-NEXT:    ret i32 [[CONV6]]
+;
+entry:
+  %sub = sub i16 0, %x
+  %and = and i16 %x, %sub
+  %mul = mul i16 %and, 2479
+  %0 = lshr i16 %mul, 12
+  %idxprom = zext nneg i16 %0 to i64
+  %arrayidx = getelementptr inbounds nuw i8, ptr @tablei128b, i64 %idxprom
+  %1 = load i8, ptr %arrayidx, align 1
+  %conv6 = zext i8 %1 to i32
+  ret i32 %conv6
+}
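
The isCTTZTable rewrite above validates the table by re-simulating the load for every power-of-two input through ConstantFoldLoadFromConst, which also handles initializers that are not a ConstantDataArray (such as the single i128 tables in the new tests). A rough standalone model of that loop, assuming a 32-bit (or smaller) input and a byte-sized table element; the function below is illustrative, not part of the patch:

#include <cstddef>
#include <cstdint>

// For each power of two 1 << Idx, recompute the index the IR would produce and
// check that the table entry at that (scaled) offset equals Idx. GEPIdxFactor
// is the byte scale of the GEP index (1 for an i8 gep into an i8 table).
// Assumes InputBits <= 32 so the uint32_t shift below is well defined.
bool looksLikeCttzTable(const uint8_t *Table, size_t TableBytes, uint32_t Mul,
                        uint32_t Shift, unsigned InputBits,
                        unsigned GEPIdxFactor) {
  for (unsigned Idx = 0; Idx < InputBits; ++Idx) {
    uint32_t Index = ((uint32_t{1} << Idx) * Mul) >> Shift;
    size_t Offset = size_t(Index) * GEPIdxFactor; // byte offset of the load
    if (Offset >= TableBytes || Table[Offset] != Idx)
      return false;
  }
  return true;
}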

>From 984319e03877a2acf380892e291792b9720b6a67 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Tue, 29 Jul 2025 08:08:14 +0100
Subject: [PATCH 3/5] Attempt to get i8 gep offsets working

---
 .../AggressiveInstCombine.cpp                 | 26 +++----
 .../lower-table-based-cttz-basics.ll          | 67 ++++++++++++++++---
 .../negative-lower-table-based-cttz.ll        |  2 +-
 3 files changed, 73 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index f2986d9734528..12cb379558b1b 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -457,13 +457,12 @@ static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI,
 
 // Check if this array of constants represents a cttz table.
 // Iterate over the elements from \p Table by trying to find/match all
-// the numbers from 0 to \p InputTy->getSizeInBits() that should represent cttz
-// results.
+// the numbers from 0 to \p InputBits that should represent cttz results.
 static bool isCTTZTable(Constant *Table, uint64_t Mul, uint64_t Shift,
-                        Type *AccessTy, unsigned InputBits,
+                        uint64_t AndMask, Type *AccessTy, unsigned InputBits,
                         unsigned GEPIdxFactor, const DataLayout &DL) {
   for (unsigned Idx = 0; Idx < InputBits; Idx++) {
-    APInt Index = (APInt(InputBits, 1ull << Idx) * Mul).lshr(Shift);
+    APInt Index = (APInt(InputBits, 1ull << Idx) * Mul).lshr(Shift) & AndMask;
     ConstantInt *C = dyn_cast_or_null<ConstantInt>(
         ConstantFoldLoadFromConst(Table, AccessTy, Index * GEPIdxFactor, DL));
     if (!C || C->getZExtValue() != Idx)
@@ -558,26 +557,27 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I, const DataLayout &DL) {
     return false;
 
   Value *X1;
-  uint64_t MulConst, ShiftConst;
+  uint64_t MulConst, ShiftConst, AndCst = ~0ull;
   // FIXME: 64-bit targets have `i64` type for the GEP index, so this match will
   // probably fail for other (e.g. 32-bit) targets.
   if (!match(GepIdx, m_ZExtOrSelf(m_LShr(
                          m_Mul(m_c_And(m_Neg(m_Value(X1)), m_Deferred(X1)),
                                m_ConstantInt(MulConst)),
-                         m_ConstantInt(ShiftConst)))))
+                         m_ConstantInt(ShiftConst)))) &&
+      !match(GepIdx, m_ZExtOrSelf(m_And(m_LShr(m_Mul(m_c_And(m_Neg(m_Value(X1)),
+                                                             m_Deferred(X1)),
+                                                     m_ConstantInt(MulConst)),
+                                               m_ConstantInt(ShiftConst)),
+                                        m_ConstantInt(AndCst)))))
     return false;
 
   unsigned InputBits = X1->getType()->getScalarSizeInBits();
   if (InputBits != 16 && InputBits != 32 && InputBits != 64)
     return false;
 
-  // Shift should extract top 4..7 bits.
-  if (InputBits - Log2_32(InputBits) != ShiftConst &&
-      InputBits - Log2_32(InputBits) - 1 != ShiftConst)
-    return false;
-
-  if (!isCTTZTable(GVTable->getInitializer(), MulConst, ShiftConst, AccessType,
-                   InputBits, GEPSrcEltTy->getScalarSizeInBits() / 8, DL))
+  if (!isCTTZTable(GVTable->getInitializer(), MulConst, ShiftConst, AndCst,
+                   AccessType, InputBits,
+                   GEPSrcEltTy->getScalarSizeInBits() / 8, DL))
     return false;
 
   ConstantInt *ZeroTableElem = cast<ConstantInt>(
diff --git a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
index 5732e146cd443..672d4f84265d4 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
@@ -190,6 +190,39 @@ return:                                           ; preds = %entry, %if.end
   ret i32 %retval.0
 }
 
+define i32 @ctz3_with_i8gep(i32 %x) {
+; CHECK-LABEL: @ctz3_with_i8gep(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.cttz.i32(i32 [[X]], i1 true)
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi i32 [ [[TMP2]], [[IF_END]] ], [ 32, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret i32 [[RETVAL_0]]
+;
+entry:
+  %cmp = icmp eq i32 %x, 0
+  br i1 %cmp, label %return, label %if.end
+
+if.end:                                           ; preds = %entry
+  %sub = sub i32 0, %x
+  %and = and i32 %x, %sub
+  %mul = mul i32 %and, 81224991
+  %0 = lshr i32 %mul, 25
+  %1 = and i32 %0, 124
+  %arrayidx.idx = zext nneg i32 %1 to i64
+  %arrayidx = getelementptr inbounds nuw i8, ptr @ctz3.table, i64 %arrayidx.idx
+  %2 = load i32, ptr %arrayidx, align 4
+  br label %return
+
+return:                                           ; preds = %if.end, %entry
+  %retval.0 = phi i32 [ %2, %if.end ], [ 32, %entry ]
+  ret i32 %retval.0
+}
+
+
 @table = internal unnamed_addr constant [64 x i32] [i32 0, i32 1, i32 12, i32 2, i32 13, i32 22, i32 17, i32 3, i32 14, i32 33, i32 23, i32 36, i32 18, i32 58, i32 28, i32 4, i32 62, i32 15, i32 34, i32 26, i32 24, i32 48, i32 50, i32 37, i32 19, i32 55, i32 59, i32 52, i32 29, i32 44, i32 39, i32 5, i32 63, i32 11, i32 21, i32 16, i32 32, i32 35, i32 57, i32 27, i32 61, i32 25, i32 47, i32 49, i32 54, i32 51, i32 43, i32 38, i32 10, i32 20, i32 31, i32 56, i32 60, i32 46, i32 53, i32 42, i32 9, i32 30, i32 45, i32 41, i32 8, i32 40, i32 7, i32 6], align 4
 
 define i32 @ctz4(i64 %b) {
@@ -277,6 +310,30 @@ entry:
   ret i32 %0
 }
 
+;; This has a wrong table size but is otherwise fine.
+ at ctz9.table = internal unnamed_addr constant [128 x i8] c"\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09", align 1
+define i32 @ctz9(i32 %x) {
+; CHECK-LABEL: @ctz9(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP3]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %sub = sub i32 0, %x
+  %and = and i32 %sub, %x
+  %mul = mul i32 %and, 125613361
+  %shr = lshr i32 %mul, 27
+  %idxprom = zext i32 %shr to i64
+  %arrayidx = getelementptr inbounds [128 x i8], ptr @ctz9.table, i64 0, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
+
 define i32 @ctz1_with_i8_gep(i32 %x) {
 ; CHECK-LABEL: @ctz1_with_i8_gep(
 ; CHECK-NEXT:  entry:
@@ -328,14 +385,8 @@ entry:
 ; This is the same as ctz2_with_i8_gep but with the gep index multiplied by 2.
 define i32 @ctz2_with_i8_gep_fixed(i32 %x) {
 ; CHECK-LABEL: @ctz2_with_i8_gep_fixed(
-; CHECK-NEXT:    [[SUB:%.*]] = sub i32 0, [[X:%.*]]
-; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], [[SUB]]
-; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[AND]], 72416175
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[MUL]], 25
-; CHECK-NEXT:    [[SHR2:%.*]] = and i32 [[SHR]], 126
-; CHECK-NEXT:    [[TMP1:%.*]] = zext nneg i32 [[SHR2]] to i64
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr @ctz2.table, i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 false)
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 ; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP2]] to i32
 ; CHECK-NEXT:    ret i32 [[CONV]]
 ;
diff --git a/llvm/test/Transforms/AggressiveInstCombine/negative-lower-table-based-cttz.ll b/llvm/test/Transforms/AggressiveInstCombine/negative-lower-table-based-cttz.ll
index 714acd7f16097..90836db80603b 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/negative-lower-table-based-cttz.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/negative-lower-table-based-cttz.ll
@@ -66,7 +66,7 @@ entry:
 
 ;; This is a negative test with a wrong table size and constants.
 
- at ctz3.table = internal unnamed_addr constant [128 x i8] c"\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09", align 1
+ at ctz3.table = internal unnamed_addr constant [128 x i8] c"\01\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09", align 1
 
 define i32 @ctz5(i32 %x) {
 entry:
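
The extra And-mask matching in this patch exists because instcombine canonicalizes an i32-element GEP into an i8 GEP, folding the element scale into the shift and the mask: the element index ((x & -x) * Mul) >> 27 reappears as the byte offset (((x & -x) * Mul) >> 25) & 124. A small self-contained check of that identity, using the multiplier from the ctz3_with_i8gep test above (the program is illustrative, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Mul = 81224991u; // multiplier from ctz3_with_i8gep
  for (unsigned Bit = 0; Bit < 32; ++Bit) {
    uint32_t V = (uint32_t{1} << Bit) * Mul;
    uint32_t ElemIdx = V >> 27;          // index into a [32 x i32] table
    uint32_t ByteOff = (V >> 25) & 124u; // byte offset into the same bytes
    assert(ByteOff == ElemIdx * 4);      // i.e. GEPScale == 4, AndMask == 124
  }
  return 0;
}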

>From f6b9b8dfb7d3da753b46e575ecb73a8f428cde23 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Tue, 29 Jul 2025 14:11:42 +0100
Subject: [PATCH 4/5] Use collectOffset, add some explanation

---
 .../AggressiveInstCombine.cpp                 | 54 +++++++++----------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 12cb379558b1b..12ff3c57394e5 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -460,7 +460,7 @@ static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI,
 // the numbers from 0 to \p InputBits that should represent cttz results.
 static bool isCTTZTable(Constant *Table, uint64_t Mul, uint64_t Shift,
                         uint64_t AndMask, Type *AccessTy, unsigned InputBits,
-                        unsigned GEPIdxFactor, const DataLayout &DL) {
+                        APInt GEPIdxFactor, const DataLayout &DL) {
   for (unsigned Idx = 0; Idx < InputBits; Idx++) {
     APInt Index = (APInt(InputBits, 1ull << Idx) * Mul).lshr(Shift) & AndMask;
     ConstantInt *C = dyn_cast_or_null<ConstantInt>(
@@ -484,6 +484,11 @@ static bool isCTTZTable(Constant *Table, uint64_t Mul, uint64_t Shift,
 // this can be lowered to `cttz` instruction.
 // There is also a special case when the element is 0.
 //
+// The (x & -x) sets the lowest non-zero bit to 1. The multiply is a de-bruijn
+// sequence that contains each patterns of bits in it. The shift extracts
+// the top bits after the multiply, and that index into the table should
+// represent the number of trailing zeros in the original number.
+//
 // Here are some examples or LLVM IR for a 64-bit target:
 //
 // CASE 1:
@@ -525,7 +530,7 @@ static bool isCTTZTable(Constant *Table, uint64_t Mul, uint64_t Shift,
 //     i64 %shr
 // %0 = load i8, i8* %arrayidx, align 1, !tbaa !8
 //
-// All this can be lowered to @llvm.cttz.i32/64 intrinsic.
+// All these can be lowered to @llvm.cttz.i32/64 intrinsics.
 static bool tryToRecognizeTableBasedCttz(Instruction &I, const DataLayout &DL) {
   LoadInst *LI = dyn_cast<LoadInst>(&I);
   if (!LI)
@@ -539,45 +544,40 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I, const DataLayout &DL) {
   if (!GEP || !GEP->hasNoUnsignedSignedWrap())
     return false;
 
-  Type *GEPSrcEltTy = GEP->getSourceElementType();
-  Value *GepIdx;
-  if (GEP->getNumIndices() == 2) {
-    if (!GEPSrcEltTy->isArrayTy() ||
-        !match(GEP->idx_begin()->get(), m_ZeroInt()))
-      return false;
-    GEPSrcEltTy = GEPSrcEltTy->getArrayElementType();
-    GepIdx = std::next(GEP->idx_begin())->get();
-  } else if (GEP->getNumIndices() == 1)
-    GepIdx = GEP->idx_begin()->get();
-  else
-    return false;
-
   GlobalVariable *GVTable = dyn_cast<GlobalVariable>(GEP->getPointerOperand());
   if (!GVTable || !GVTable->hasInitializer() || !GVTable->isConstant())
     return false;
 
+  unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
+  APInt ModOffset(BW, 0);
+  SmallMapVector<Value *, APInt, 4> VarOffsets;
+  if (!GEP->collectOffset(DL, BW, VarOffsets, ModOffset) ||
+      VarOffsets.size() != 1 || ModOffset != 0)
+    return false;
+  auto [GepIdx, GEPScale] = VarOffsets.front();
+
   Value *X1;
   uint64_t MulConst, ShiftConst, AndCst = ~0ull;
+  // Check that the gep variable index is ((x & -x) * MulConst) >> ShiftConst.
+  // This might be extended to the pointer index type, and if the gep index type
+  // has been replaced with an i8 then a new And (and different ShiftConst) will
+  // be present.
   // FIXME: 64-bit targets have `i64` type for the GEP index, so this match will
   // probably fail for other (e.g. 32-bit) targets.
-  if (!match(GepIdx, m_ZExtOrSelf(m_LShr(
-                         m_Mul(m_c_And(m_Neg(m_Value(X1)), m_Deferred(X1)),
-                               m_ConstantInt(MulConst)),
-                         m_ConstantInt(ShiftConst)))) &&
-      !match(GepIdx, m_ZExtOrSelf(m_And(m_LShr(m_Mul(m_c_And(m_Neg(m_Value(X1)),
-                                                             m_Deferred(X1)),
-                                                     m_ConstantInt(MulConst)),
-                                               m_ConstantInt(ShiftConst)),
-                                        m_ConstantInt(AndCst)))))
+  auto MatchInner = m_LShr(m_Mul(m_c_And(m_Neg(m_Value(X1)), m_Deferred(X1)),
+                                 m_ConstantInt(MulConst)),
+                           m_ConstantInt(ShiftConst));
+  if (!match(GepIdx, m_ZExtOrSelf(MatchInner)) &&
+      !match(GepIdx, m_ZExtOrSelf(m_And(MatchInner, m_ConstantInt(AndCst)))))
     return false;
 
   unsigned InputBits = X1->getType()->getScalarSizeInBits();
   if (InputBits != 16 && InputBits != 32 && InputBits != 64)
     return false;
 
-  if (!isCTTZTable(GVTable->getInitializer(), MulConst, ShiftConst, AndCst,
-                   AccessType, InputBits,
-                   GEPSrcEltTy->getScalarSizeInBits() / 8, DL))
+  if (!GEPScale.isIntN(InputBits) ||
+      !isCTTZTable(GVTable->getInitializer(), MulConst, ShiftConst, AndCst,
+                   AccessType, InputBits, GEPScale.trunc(InputBits), DL))
     return false;
 
   ConstantInt *ZeroTableElem = cast<ConstantInt>(
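
collectOffset collapses either GEP shape into a single variable index plus a byte scale, which is what replaces the hand-written index handling from the earlier revisions: for a gep like `[64 x i16], ptr @t, i64 0, i64 %i` it yields {%i -> 2} with constant offset 0, and for `i8, ptr @t, i64 %i` it yields {%i -> 1}. Restated as a standalone helper (an illustrative sketch using the same LLVM APIs as the patch, not code from it):

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Returns true and fills Index/Scale if GEP computes `base + Index * Scale`
// with no constant byte offset, mirroring the collectOffset call in the patch.
static bool getSingleVariableIndex(GetElementPtrInst *GEP,
                                   const DataLayout &DL, Value *&Index,
                                   APInt &Scale) {
  unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
  APInt ConstOff(BW, 0);
  SmallMapVector<Value *, APInt, 4> VarOffsets;
  if (!GEP->collectOffset(DL, BW, VarOffsets, ConstOff) ||
      VarOffsets.size() != 1 || !ConstOff.isZero())
    return false;
  Index = VarOffsets.front().first;
  Scale = VarOffsets.front().second;
  return true;
}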

>From cfa67d889f52703ad1c4d8bf747f0434880b81ba Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Thu, 31 Jul 2025 10:04:57 +0100
Subject: [PATCH 5/5] Get i128 working too

---
 .../AggressiveInstCombine.cpp                 | 34 +++++++-------
 .../lower-table-based-cttz-basics.ll          | 46 +++++++++++++++++++
 2 files changed, 63 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 12ff3c57394e5..40a7f8043034e 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -458,14 +458,15 @@ static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI,
 // Check if this array of constants represents a cttz table.
 // Iterate over the elements from \p Table by trying to find/match all
 // the numbers from 0 to \p InputBits that should represent cttz results.
-static bool isCTTZTable(Constant *Table, uint64_t Mul, uint64_t Shift,
-                        uint64_t AndMask, Type *AccessTy, unsigned InputBits,
-                        APInt GEPIdxFactor, const DataLayout &DL) {
+static bool isCTTZTable(Constant *Table, const APInt &Mul, const APInt &Shift,
+                        const APInt &AndMask, Type *AccessTy,
+                        unsigned InputBits, const APInt &GEPIdxFactor,
+                        const DataLayout &DL) {
   for (unsigned Idx = 0; Idx < InputBits; Idx++) {
-    APInt Index = (APInt(InputBits, 1ull << Idx) * Mul).lshr(Shift) & AndMask;
+    APInt Index = (APInt(InputBits, 1).shl(Idx) * Mul).lshr(Shift) & AndMask;
     ConstantInt *C = dyn_cast_or_null<ConstantInt>(
         ConstantFoldLoadFromConst(Table, AccessTy, Index * GEPIdxFactor, DL));
-    if (!C || C->getZExtValue() != Idx)
+    if (!C || C->getValue() != Idx)
       return false;
   }
 
@@ -485,7 +486,7 @@ static bool isCTTZTable(Constant *Table, uint64_t Mul, uint64_t Shift,
 // There is also a special case when the element is 0.
 //
 // The (x & -x) sets the lowest non-zero bit to 1. The multiply is a de-bruijn
-// sequence that contains each patterns of bits in it. The shift extracts
+// sequence that contains each pattern of bits in it. The shift extracts
 // the top bits after the multiply, and that index into the table should
 // represent the number of trailing zeros in the original number.
 //
@@ -557,27 +558,26 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I, const DataLayout &DL) {
   auto [GepIdx, GEPScale] = VarOffsets.front();
 
   Value *X1;
-  uint64_t MulConst, ShiftConst, AndCst = ~0ull;
+  const APInt *MulConst, *ShiftConst, *AndCst = nullptr;
   // Check that the gep variable index is ((x & -x) * MulConst) >> ShiftConst.
   // This might be extended to the pointer index type, and if the gep index type
   // has been replaced with an i8 then a new And (and different ShiftConst) will
   // be present.
-  // FIXME: 64-bit targets have `i64` type for the GEP index, so this match will
-  // probably fail for other (e.g. 32-bit) targets.
-  auto MatchInner = m_LShr(m_Mul(m_c_And(m_Neg(m_Value(X1)), m_Deferred(X1)),
-                                 m_ConstantInt(MulConst)),
-                           m_ConstantInt(ShiftConst));
-  if (!match(GepIdx, m_ZExtOrSelf(MatchInner)) &&
-      !match(GepIdx, m_ZExtOrSelf(m_And(MatchInner, m_ConstantInt(AndCst)))))
+  auto MatchInner = m_LShr(
+      m_Mul(m_c_And(m_Neg(m_Value(X1)), m_Deferred(X1)), m_APInt(MulConst)),
+      m_APInt(ShiftConst));
+  if (!match(GepIdx, m_CastOrSelf(MatchInner)) &&
+      !match(GepIdx, m_CastOrSelf(m_And(MatchInner, m_APInt(AndCst)))))
     return false;
 
   unsigned InputBits = X1->getType()->getScalarSizeInBits();
-  if (InputBits != 16 && InputBits != 32 && InputBits != 64)
+  if (InputBits != 16 && InputBits != 32 && InputBits != 64 && InputBits != 128)
     return false;
 
   if (!GEPScale.isIntN(InputBits) ||
-      !isCTTZTable(GVTable->getInitializer(), MulConst, ShiftConst, AndCst,
-                   AccessType, InputBits, GEPScale.trunc(InputBits), DL))
+      !isCTTZTable(GVTable->getInitializer(), *MulConst, *ShiftConst,
+                   AndCst ? *AndCst : APInt::getAllOnes(InputBits), AccessType,
+                   InputBits, GEPScale.zextOrTrunc(InputBits), DL))
     return false;
 
   ConstantInt *ZeroTableElem = cast<ConstantInt>(
diff --git a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
index 672d4f84265d4..bb3001e909b8c 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
@@ -452,3 +452,49 @@ entry:
   %conv6 = zext i8 %1 to i32
   ret i32 %conv6
 }
+
+; Same as ctz1 but the table and load are very large (i128).
+ at ctz7i128.table = internal unnamed_addr constant [32 x i128] [i128 0, i128 1, i128 28, i128 2, i128 29, i128 14, i128 24, i128 3, i128 30, i128 22, i128 20, i128 15, i128 25, i128 17, i128 4, i128 8, i128 31, i128 27, i128 13, i128 23, i128 21, i128 19, i128 16, i128 7, i128 26, i128 12, i128 18, i128 6, i128 11, i128 5, i128 10, i128 9], align 16
+define i128 @ctz1_i128(i32 %x) {
+; CHECK-LABEL: @ctz1_i128(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[TMP2]] to i128
+; CHECK-NEXT:    ret i128 [[TMP3]]
+;
+entry:
+  %sub = sub i32 0, %x
+  %and = and i32 %sub, %x
+  %mul = mul i32 %and, 125613361
+  %shr = lshr i32 %mul, 27
+  %idxprom = zext i32 %shr to i64
+  %arrayidx = getelementptr inbounds [32 x i128], ptr @ctz7i128.table, i64 0, i64 %idxprom
+  %l = load i128, ptr %arrayidx, align 1
+  ret i128 %l
+}
+
+; This is roughly the same as ctz1 but using i128.
+ at table.i128 = internal unnamed_addr constant [128 x i8] c"\00\01e\02tf<\03|ug^R=!\04}yvWoh_5ZSE>0\22\14\05~rzPwmX.pkiI`K6\1Ab[TBMF?'81*#\1C\15\0E\06\7Fds;{]Q xVn4YD/\13qOl-jHJ\19aAL&7)\1B\0Dc:\\\1FU3C\12N,G\18@%(\0C9\1E2\11+\17$\0B\1D\10\16\0A\0F\09\08\07", align 1
+define i32 @src(i128 noundef %x) {
+; CHECK-LABEL: @src(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP3:%.*]] = call i128 @llvm.cttz.i128(i128 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i128 [[X]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i128 0, i128 [[TMP3]]
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i128 [[TMP2]] to i8
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %sub = sub i128 0, %x
+  %and = and i128 %x, %sub
+  %mul = mul i128 %and, 2647824804797170443043024478319300753
+  %shr = lshr i128 %mul, 121
+  %idxprom = trunc i128 %shr to i64
+  %arrayidx = getelementptr inbounds nuw i8, ptr @table.i128, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
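
Switching the matched constants over to APInt is what lets the i128 cases above work: with a 128-bit input the probe value 1 << Idx no longer fits in a uint64_t, so isCTTZTable now builds it with APInt::shl. A minimal illustration of that detail (assumed, not taken from the patch):

#include "llvm/ADT/APInt.h"
#include <cassert>

int main() {
  unsigned InputBits = 128;
  for (unsigned Idx = 0; Idx < InputBits; ++Idx) {
    // APInt(InputBits, 1ull << Idx) would be wrong (or UB) once Idx >= 64;
    // shifting inside APInt keeps all 128 bits of the probe value.
    llvm::APInt Probe = llvm::APInt(InputBits, 1).shl(Idx);
    assert(Probe.countr_zero() == Idx);
  }
  return 0;
}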



More information about the llvm-commits mailing list