[llvm] [AggressiveInstCombine] Make cttz fold more resiliant to non-array geps (PR #150896)

Tue Jul 29 00:48:10 PDT 2025

https://github.com/davemgreen updated https://github.com/llvm/llvm-project/pull/150896

>From d05b6407c7b5e29549944826f3266d7985fab343 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Mon, 28 Jul 2025 08:29:52 +0100
Subject: [PATCH 1/3] [AggressiveInstCombine] Make cttz fold more resiliant to
 non-array geps

---
 .../AggressiveInstCombine.cpp                 | 32 +++++++------
 .../lower-table-based-cttz-basics.ll          | 47 +++++++++++++++++++
 .../PhaseOrdering/lower-table-based-cttz.ll   | 42 +++++++++++++++--
 3 files changed, 101 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 7af5ba4e0e103..975590214ffa6 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -547,14 +547,20 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I) {
     return false;
 
   GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getPointerOperand());
-  if (!GEP || !GEP->hasNoUnsignedSignedWrap() || GEP->getNumIndices() != 2)
+  if (!GEP || !GEP->hasNoUnsignedSignedWrap())
     return false;
 
-  if (!GEP->getSourceElementType()->isArrayTy())
-    return false;
-
-  uint64_t ArraySize = GEP->getSourceElementType()->getArrayNumElements();
-  if (ArraySize != 32 && ArraySize != 64)
+  Type *GEPSrcEltTy = GEP->getSourceElementType();
+  Value *GepIdx;
+  if (GEP->getNumIndices() == 2) {
+    if (!GEPSrcEltTy->isArrayTy() ||
+        !match(GEP->idx_begin()->get(), m_ZeroInt()))
+      return false;
+    GEPSrcEltTy = GEPSrcEltTy->getArrayElementType();
+    GepIdx = std::next(GEP->idx_begin())->get();
+  } else if (GEP->getNumIndices() == 1)
+    GepIdx = GEP->idx_begin()->get();
+  else
     return false;
 
   GlobalVariable *GVTable = dyn_cast<GlobalVariable>(GEP->getPointerOperand());
@@ -563,21 +569,17 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I) {
 
   ConstantDataArray *ConstData =
       dyn_cast<ConstantDataArray>(GVTable->getInitializer());
-  if (!ConstData)
-    return false;
-
-  if (!match(GEP->idx_begin()->get(), m_ZeroInt()))
+  if (!ConstData || ConstData->getElementType() != GEPSrcEltTy)
     return false;
 
-  Value *Idx2 = std::next(GEP->idx_begin())->get();
   Value *X1;
   uint64_t MulConst, ShiftConst;
   // FIXME: 64-bit targets have `i64` type for the GEP index, so this match will
   // probably fail for other (e.g. 32-bit) targets.
-  if (!match(Idx2, m_ZExtOrSelf(
-                       m_LShr(m_Mul(m_c_And(m_Neg(m_Value(X1)), m_Deferred(X1)),
-                                    m_ConstantInt(MulConst)),
-                              m_ConstantInt(ShiftConst)))))
+  if (!match(GepIdx, m_ZExtOrSelf(m_LShr(
+                         m_Mul(m_c_And(m_Neg(m_Value(X1)), m_Deferred(X1)),
+                               m_ConstantInt(MulConst)),
+                         m_ConstantInt(ShiftConst)))))
     return false;
 
   unsigned InputBits = X1->getType()->getScalarSizeInBits();
diff --git a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
index 4d571999df372..0bfa891a7887c 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
@@ -276,3 +276,50 @@ entry:
   %0 = load i32, ptr %arrayidx, align 4
   ret i32 %0
 }
+
+define i32 @ctz1_with_i8_gep(i32 %x) {
+; CHECK-LABEL: @ctz1_with_i8_gep(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP3]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %sub = sub i32 0, %x
+  %and = and i32 %sub, %x
+  %mul = mul i32 %and, 125613361
+  %shr = lshr i32 %mul, 27
+  %idxprom = zext i32 %shr to i64
+  %arrayidx = getelementptr inbounds i8, ptr @ctz7.table, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
+
+define i32 @ctz2_with_i8_gep(i32 %x) {
+; CHECK-LABEL: @ctz2_with_i8_gep(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 0, [[X:%.*]]
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[SUB]], [[X]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[AND]], 72416175
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[MUL]], 26
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[SHR]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [64 x i8], ptr @ctz2.table, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %sub = sub i32 0, %x
+  %and = and i32 %sub, %x
+  %mul = mul i32 %and, 72416175
+  %shr = lshr i32 %mul, 26
+  %idxprom = zext i32 %shr to i64
+  %arrayidx = getelementptr inbounds [64 x i8], ptr @ctz2.table, i64 0, i64 %idxprom
+  %0 = load i16, ptr %arrayidx, align 2
+  %conv = sext i16 %0 to i32
+  ret i32 %conv
+}
diff --git a/llvm/test/Transforms/PhaseOrdering/lower-table-based-cttz.ll b/llvm/test/Transforms/PhaseOrdering/lower-table-based-cttz.ll
index 19fbc1f1ae64e..4455016c3e4a4 100644
--- a/llvm/test/Transforms/PhaseOrdering/lower-table-based-cttz.ll
+++ b/llvm/test/Transforms/PhaseOrdering/lower-table-based-cttz.ll
@@ -1,3 +1,6 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -O3 -S < %s | FileCheck %s
+
 ;; This tests lowering of the implementations of table-based ctz
 ;; algorithm to the llvm.cttz instruction in the -O3 case.
 
@@ -13,13 +16,17 @@
 ;; }
 ;; Compiled as: clang -O3 test.c -S -emit-llvm -Xclang -disable-llvm-optzns
 
-; RUN: opt -O3 -S < %s | FileCheck %s
-
-; CHECK: call range(i32 0, 33) i32 @llvm.cttz.i32
-
 @ctz1.table = internal constant [32 x i8] c"\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09", align 16
 
-define i32 @ctz1(i32 noundef %x) {
+define i32 @ctz(i32 noundef %x) {
+; CHECK-LABEL: define range(i32 0, 32) i32 @ctz(
+; CHECK-SAME: i32 noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 [[X]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    [[CONV:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]]
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
 entry:
   %x.addr = alloca i32, align 4
   store i32 %x, ptr %x.addr, align 4
@@ -35,3 +42,28 @@ entry:
   %conv = sext i8 %2 to i32
   ret i32 %conv
 }
+
+define i32 @ctz_nonarraygep(i32 noundef %x) {
+; CHECK-LABEL: define range(i32 0, 32) i32 @ctz_nonarraygep(
+; CHECK-SAME: i32 noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 [[X]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    [[CONV:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]]
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %x.addr = alloca i32, align 4
+  store i32 %x, ptr %x.addr, align 4
+  %0 = load i32, ptr %x.addr, align 4
+  %1 = load i32, ptr %x.addr, align 4
+  %sub = sub i32 0, %1
+  %and = and i32 %0, %sub
+  %mul = mul i32 %and, 125613361
+  %shr = lshr i32 %mul, 27
+  %idxprom = zext i32 %shr to i64
+  %arrayidx = getelementptr inbounds i8, ptr @ctz1.table, i64 %idxprom
+  %2 = load i8, ptr %arrayidx, align 1
+  %conv = sext i8 %2 to i32
+  ret i32 %conv
+}

>From 96fa795c3d4e61a2e62be2e5fe52bcc011a3c530 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Mon, 28 Jul 2025 21:33:16 +0100
Subject: [PATCH 2/3] Use ConstantFoldLoadFromConst, and get i16 tables working
 as a result

---
 .../AggressiveInstCombine.cpp                 | 58 +++++--------
 .../lower-table-based-cttz-basics.ll          | 82 ++++++++++++++++++-
 2 files changed, 102 insertions(+), 38 deletions(-)

diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 975590214ffa6..f2986d9734528 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -457,30 +457,20 @@ static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI,
 
 // Check if this array of constants represents a cttz table.
 // Iterate over the elements from \p Table by trying to find/match all
-// the numbers from 0 to \p InputBits that should represent cttz results.
-static bool isCTTZTable(const ConstantDataArray &Table, uint64_t Mul,
-                        uint64_t Shift, uint64_t InputBits) {
-  unsigned Length = Table.getNumElements();
-  if (Length < InputBits || Length > InputBits * 2)
-    return false;
-
-  APInt Mask = APInt::getBitsSetFrom(InputBits, Shift);
-  unsigned Matched = 0;
-
-  for (unsigned i = 0; i < Length; i++) {
-    uint64_t Element = Table.getElementAsInteger(i);
-    if (Element >= InputBits)
-      continue;
-
-    // Check if \p Element matches a concrete answer. It could fail for some
-    // elements that are never accessed, so we keep iterating over each element
-    // from the table. The number of matched elements should be equal to the
-    // number of potential right answers which is \p InputBits actually.
-    if ((((Mul << Element) & Mask.getZExtValue()) >> Shift) == i)
-      Matched++;
+// the numbers from 0 to \p InputTy->getSizeInBits() that should represent cttz
+// results.
+static bool isCTTZTable(Constant *Table, uint64_t Mul, uint64_t Shift,
+                        Type *AccessTy, unsigned InputBits,
+                        unsigned GEPIdxFactor, const DataLayout &DL) {
+  for (unsigned Idx = 0; Idx < InputBits; Idx++) {
+    APInt Index = (APInt(InputBits, 1ull << Idx) * Mul).lshr(Shift);
+    ConstantInt *C = dyn_cast_or_null<ConstantInt>(
+        ConstantFoldLoadFromConst(Table, AccessTy, Index * GEPIdxFactor, DL));
+    if (!C || C->getZExtValue() != Idx)
+      return false;
   }
 
-  return Matched == InputBits;
+  return true;
 }
 
 // Try to recognize table-based ctz implementation.
@@ -537,7 +527,7 @@ static bool isCTTZTable(const ConstantDataArray &Table, uint64_t Mul,
 // %0 = load i8, i8* %arrayidx, align 1, !tbaa !8
 //
 // All this can be lowered to @llvm.cttz.i32/64 intrinsic.
-static bool tryToRecognizeTableBasedCttz(Instruction &I) {
+static bool tryToRecognizeTableBasedCttz(Instruction &I, const DataLayout &DL) {
   LoadInst *LI = dyn_cast<LoadInst>(&I);
   if (!LI)
     return false;
@@ -567,11 +557,6 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I) {
   if (!GVTable || !GVTable->hasInitializer() || !GVTable->isConstant())
     return false;
 
-  ConstantDataArray *ConstData =
-      dyn_cast<ConstantDataArray>(GVTable->getInitializer());
-  if (!ConstData || ConstData->getElementType() != GEPSrcEltTy)
-    return false;
-
   Value *X1;
   uint64_t MulConst, ShiftConst;
   // FIXME: 64-bit targets have `i64` type for the GEP index, so this match will
@@ -583,19 +568,21 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I) {
     return false;
 
   unsigned InputBits = X1->getType()->getScalarSizeInBits();
-  if (InputBits != 32 && InputBits != 64)
+  if (InputBits != 16 && InputBits != 32 && InputBits != 64)
     return false;
 
-  // Shift should extract top 5..7 bits.
+  // Shift should extract top 4..7 bits.
   if (InputBits - Log2_32(InputBits) != ShiftConst &&
       InputBits - Log2_32(InputBits) - 1 != ShiftConst)
     return false;
 
-  if (!isCTTZTable(*ConstData, MulConst, ShiftConst, InputBits))
+  if (!isCTTZTable(GVTable->getInitializer(), MulConst, ShiftConst, AccessType,
+                   InputBits, GEPSrcEltTy->getScalarSizeInBits() / 8, DL))
     return false;
 
-  auto ZeroTableElem = ConstData->getElementAsInteger(0);
-  bool DefinedForZero = ZeroTableElem == InputBits;
+  ConstantInt *ZeroTableElem = cast<ConstantInt>(
+      ConstantFoldLoadFromConst(GVTable->getInitializer(), AccessType, DL));
+  bool DefinedForZero = ZeroTableElem->getZExtValue() == InputBits;
 
   IRBuilder<> B(LI);
   ConstantInt *BoolConst = B.getInt1(!DefinedForZero);
@@ -609,8 +596,7 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I) {
     // If the value in elem 0 isn't the same as InputBits, we still want to
     // produce the value from the table.
     auto Cmp = B.CreateICmpEQ(X1, ConstantInt::get(XType, 0));
-    auto Select =
-        B.CreateSelect(Cmp, ConstantInt::get(XType, ZeroTableElem), Cttz);
+    auto Select = B.CreateSelect(Cmp, B.CreateZExt(ZeroTableElem, XType), Cttz);
 
     // NOTE: If the table[0] is 0, but the cttz(0) is defined by the Target
     // it should be handled as: `cttz(x) & (typeSize - 1)`.
@@ -1479,7 +1465,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
       MadeChange |= foldGuardedFunnelShift(I, DT);
       MadeChange |= tryToRecognizePopCount(I);
       MadeChange |= tryToFPToSat(I, TTI);
-      MadeChange |= tryToRecognizeTableBasedCttz(I);
+      MadeChange |= tryToRecognizeTableBasedCttz(I, DL);
       MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
       MadeChange |= foldPatternedLoads(I, DL);
       MadeChange |= foldICmpOrChain(I, DL, TTI, AA, DT);
diff --git a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
index 0bfa891a7887c..5732e146cd443 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
@@ -299,6 +299,7 @@ entry:
   ret i32 %conv
 }
 
+; This is the same a ctz2 (i16 table) with an i8 gep making the indices invalid
 define i32 @ctz2_with_i8_gep(i32 %x) {
 ; CHECK-LABEL: @ctz2_with_i8_gep(
 ; CHECK-NEXT:  entry:
@@ -308,7 +309,7 @@ define i32 @ctz2_with_i8_gep(i32 %x) {
 ; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[MUL]], 26
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[SHR]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [64 x i8], ptr @ctz2.table, i64 0, i64 [[IDXPROM]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
 ; CHECK-NEXT:    ret i32 [[CONV]]
 ;
@@ -319,7 +320,84 @@ entry:
   %shr = lshr i32 %mul, 26
   %idxprom = zext i32 %shr to i64
   %arrayidx = getelementptr inbounds [64 x i8], ptr @ctz2.table, i64 0, i64 %idxprom
-  %0 = load i16, ptr %arrayidx, align 2
+  %0 = load i16, ptr %arrayidx, align 1
   %conv = sext i16 %0 to i32
   ret i32 %conv
 }
+
+; This is the same a ctz2_with_i8_gep but with the gep index multiplied by 2.
+define i32 @ctz2_with_i8_gep_fixed(i32 %x) {
+; CHECK-LABEL: @ctz2_with_i8_gep_fixed(
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 0, [[X:%.*]]
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], [[SUB]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[AND]], 72416175
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[MUL]], 25
+; CHECK-NEXT:    [[SHR2:%.*]] = and i32 [[SHR]], 126
+; CHECK-NEXT:    [[TMP1:%.*]] = zext nneg i32 [[SHR2]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr @ctz2.table, i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP2]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+  %sub = sub i32 0, %x
+  %and = and i32 %x, %sub
+  %mul = mul i32 %and, 72416175
+  %shr = lshr i32 %mul, 25
+  %shr2 = and i32 %shr, 126
+  %1 = zext nneg i32 %shr2 to i64
+  %arrayidx = getelementptr inbounds nuw i8, ptr @ctz2.table, i64 %1
+  %2 = load i16, ptr %arrayidx, align 2
+  %conv = sext i16 %2 to i32
+  ret i32 %conv
+}
+
+; This is a i16 input with the debruijn table stored in a single i128.
+ at tablei128 = internal unnamed_addr constant i128 16018378897745984667142067713738932480, align 16
+define i32 @cttz_i16_via_i128(i16 noundef %x) {
+; CHECK-LABEL: @cttz_i16_via_i128(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i16 @llvm.cttz.i16(i16 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i16 [[X]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP3]], i16 0, i16 [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i16 [[TMP2]] to i8
+; CHECK-NEXT:    [[CONV6:%.*]] = zext i8 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[CONV6]]
+;
+entry:
+  %sub = sub i16 0, %x
+  %and = and i16 %x, %sub
+  %mul = mul i16 %and, 2479
+  %0 = lshr i16 %mul, 12
+  %idxprom = zext nneg i16 %0 to i64
+  %arrayidx = getelementptr inbounds nuw i8, ptr @tablei128, i64 %idxprom
+  %1 = load i8, ptr %arrayidx, align 1
+  %conv6 = zext i8 %1 to i32
+  ret i32 %conv6
+}
+
+; Same as above but the table is a little off
+ at tablei128b = internal unnamed_addr constant i128 16018378897745984667142068813250560256, align 16
+define i32 @cttz_i16_via_i128_incorrecttable(i16 noundef %x) {
+; CHECK-LABEL: @cttz_i16_via_i128_incorrecttable(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SUB:%.*]] = sub i16 0, [[X:%.*]]
+; CHECK-NEXT:    [[AND:%.*]] = and i16 [[X]], [[SUB]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i16 [[AND]], 2479
+; CHECK-NEXT:    [[TMP0:%.*]] = lshr i16 [[MUL]], 12
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext nneg i16 [[TMP0]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr @tablei128b, i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV6:%.*]] = zext i8 [[TMP3]] to i32
+; CHECK-NEXT:    ret i32 [[CONV6]]
+;
+entry:
+  %sub = sub i16 0, %x
+  %and = and i16 %x, %sub
+  %mul = mul i16 %and, 2479
+  %0 = lshr i16 %mul, 12
+  %idxprom = zext nneg i16 %0 to i64
+  %arrayidx = getelementptr inbounds nuw i8, ptr @tablei128b, i64 %idxprom
+  %1 = load i8, ptr %arrayidx, align 1
+  %conv6 = zext i8 %1 to i32
+  ret i32 %conv6
+}

>From 67cf7079d377cc950723f7c345ecebe0ca675f23 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Tue, 29 Jul 2025 08:08:14 +0100
Subject: [PATCH 3/3] Attempt to get i8 gep offsets working

---
 .../AggressiveInstCombine.cpp                 | 26 +++----
 .../lower-table-based-cttz-basics.ll          | 67 ++++++++++++++++---
 .../negative-lower-table-based-cttz.ll        |  2 +-
 3 files changed, 73 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index f2986d9734528..12cb379558b1b 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -457,13 +457,12 @@ static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI,
 
 // Check if this array of constants represents a cttz table.
 // Iterate over the elements from \p Table by trying to find/match all
-// the numbers from 0 to \p InputTy->getSizeInBits() that should represent cttz
-// results.
+// the numbers from 0 to \p InputBits that should represent cttz results.
 static bool isCTTZTable(Constant *Table, uint64_t Mul, uint64_t Shift,
-                        Type *AccessTy, unsigned InputBits,
+                        uint64_t AndMask, Type *AccessTy, unsigned InputBits,
                         unsigned GEPIdxFactor, const DataLayout &DL) {
   for (unsigned Idx = 0; Idx < InputBits; Idx++) {
-    APInt Index = (APInt(InputBits, 1ull << Idx) * Mul).lshr(Shift);
+    APInt Index = (APInt(InputBits, 1ull << Idx) * Mul).lshr(Shift) & AndMask;
     ConstantInt *C = dyn_cast_or_null<ConstantInt>(
         ConstantFoldLoadFromConst(Table, AccessTy, Index * GEPIdxFactor, DL));
     if (!C || C->getZExtValue() != Idx)
@@ -558,26 +557,27 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I, const DataLayout &DL) {
     return false;
 
   Value *X1;
-  uint64_t MulConst, ShiftConst;
+  uint64_t MulConst, ShiftConst, AndCst = ~0ull;
   // FIXME: 64-bit targets have `i64` type for the GEP index, so this match will
   // probably fail for other (e.g. 32-bit) targets.
   if (!match(GepIdx, m_ZExtOrSelf(m_LShr(
                          m_Mul(m_c_And(m_Neg(m_Value(X1)), m_Deferred(X1)),
                                m_ConstantInt(MulConst)),
-                         m_ConstantInt(ShiftConst)))))
+                         m_ConstantInt(ShiftConst)))) &&
+      !match(GepIdx, m_ZExtOrSelf(m_And(m_LShr(m_Mul(m_c_And(m_Neg(m_Value(X1)),
+                                                             m_Deferred(X1)),
+                                                     m_ConstantInt(MulConst)),
+                                               m_ConstantInt(ShiftConst)),
+                                        m_ConstantInt(AndCst)))))
     return false;
 
   unsigned InputBits = X1->getType()->getScalarSizeInBits();
   if (InputBits != 16 && InputBits != 32 && InputBits != 64)
     return false;
 
-  // Shift should extract top 4..7 bits.
-  if (InputBits - Log2_32(InputBits) != ShiftConst &&
-      InputBits - Log2_32(InputBits) - 1 != ShiftConst)
-    return false;
-
-  if (!isCTTZTable(GVTable->getInitializer(), MulConst, ShiftConst, AccessType,
-                   InputBits, GEPSrcEltTy->getScalarSizeInBits() / 8, DL))
+  if (!isCTTZTable(GVTable->getInitializer(), MulConst, ShiftConst, AndCst,
+                   AccessType, InputBits,
+                   GEPSrcEltTy->getScalarSizeInBits() / 8, DL))
     return false;
 
   ConstantInt *ZeroTableElem = cast<ConstantInt>(
diff --git a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
index 5732e146cd443..672d4f84265d4 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/lower-table-based-cttz-basics.ll
@@ -190,6 +190,39 @@ return:                                           ; preds = %entry, %if.end
   ret i32 %retval.0
 }
 
+define i32 @ctz3_with_i8gep(i32 %x) {
+; CHECK-LABEL: @ctz3_with_i8gep(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.cttz.i32(i32 [[X]], i1 true)
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi i32 [ [[TMP2]], [[IF_END]] ], [ 32, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret i32 [[RETVAL_0]]
+;
+entry:
+  %cmp = icmp eq i32 %x, 0
+  br i1 %cmp, label %return, label %if.end
+
+if.end:                                           ; preds = %entry
+  %sub = sub i32 0, %x
+  %and = and i32 %x, %sub
+  %mul = mul i32 %and, 81224991
+  %0 = lshr i32 %mul, 25
+  %1 = and i32 %0, 124
+  %arrayidx.idx = zext nneg i32 %1 to i64
+  %arrayidx = getelementptr inbounds nuw i8, ptr @ctz3.table, i64 %arrayidx.idx
+  %2 = load i32, ptr %arrayidx, align 4
+  br label %return
+
+return:                                           ; preds = %if.end, %entry
+  %retval.0 = phi i32 [ %2, %if.end ], [ 32, %entry ]
+  ret i32 %retval.0
+}
+
+
 @table = internal unnamed_addr constant [64 x i32] [i32 0, i32 1, i32 12, i32 2, i32 13, i32 22, i32 17, i32 3, i32 14, i32 33, i32 23, i32 36, i32 18, i32 58, i32 28, i32 4, i32 62, i32 15, i32 34, i32 26, i32 24, i32 48, i32 50, i32 37, i32 19, i32 55, i32 59, i32 52, i32 29, i32 44, i32 39, i32 5, i32 63, i32 11, i32 21, i32 16, i32 32, i32 35, i32 57, i32 27, i32 61, i32 25, i32 47, i32 49, i32 54, i32 51, i32 43, i32 38, i32 10, i32 20, i32 31, i32 56, i32 60, i32 46, i32 53, i32 42, i32 9, i32 30, i32 45, i32 41, i32 8, i32 40, i32 7, i32 6], align 4
 
 define i32 @ctz4(i64 %b) {
@@ -277,6 +310,30 @@ entry:
   ret i32 %0
 }
 
+;; This has a wrong table size but is otherwise fine.
+ at ctz9.table = internal unnamed_addr constant [128 x i8] c"\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09", align 1
+define i32 @ctz9(i32 %x) {
+; CHECK-LABEL: @ctz9(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i32 0, i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP3]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
+entry:
+  %sub = sub i32 0, %x
+  %and = and i32 %sub, %x
+  %mul = mul i32 %and, 125613361
+  %shr = lshr i32 %mul, 27
+  %idxprom = zext i32 %shr to i64
+  %arrayidx = getelementptr inbounds [128 x i8], ptr @ctz9.table, i64 0, i64 %idxprom
+  %0 = load i8, ptr %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  ret i32 %conv
+}
+
 define i32 @ctz1_with_i8_gep(i32 %x) {
 ; CHECK-LABEL: @ctz1_with_i8_gep(
 ; CHECK-NEXT:  entry:
@@ -328,14 +385,8 @@ entry:
 ; This is the same a ctz2_with_i8_gep but with the gep index multiplied by 2.
 define i32 @ctz2_with_i8_gep_fixed(i32 %x) {
 ; CHECK-LABEL: @ctz2_with_i8_gep_fixed(
-; CHECK-NEXT:    [[SUB:%.*]] = sub i32 0, [[X:%.*]]
-; CHECK-NEXT:    [[AND:%.*]] = and i32 [[X]], [[SUB]]
-; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[AND]], 72416175
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[MUL]], 25
-; CHECK-NEXT:    [[SHR2:%.*]] = and i32 [[SHR]], 126
-; CHECK-NEXT:    [[TMP1:%.*]] = zext nneg i32 [[SHR2]] to i64
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr @ctz2.table, i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 false)
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
 ; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP2]] to i32
 ; CHECK-NEXT:    ret i32 [[CONV]]
 ;
diff --git a/llvm/test/Transforms/AggressiveInstCombine/negative-lower-table-based-cttz.ll b/llvm/test/Transforms/AggressiveInstCombine/negative-lower-table-based-cttz.ll
index 714acd7f16097..90836db80603b 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/negative-lower-table-based-cttz.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/negative-lower-table-based-cttz.ll
@@ -66,7 +66,7 @@ entry:
 
 ;; This is a negative test with a wrong table size and constants.
 
- at ctz3.table = internal unnamed_addr constant [128 x i8] c"\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09", align 1
+ at ctz3.table = internal unnamed_addr constant [128 x i8] c"\01\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09\00\01\1C\02\1D\0E\18\03\1E\16\14\0F\19\11\04\08\1F\1B\0D\17\15\13\10\07\1A\0C\12\06\0B\05\0A\09", align 1
 
 define i32 @ctz5(i32 %x) {
 entry: