[llvm] [Transforms] LoopIdiomRecognize recognize strlen and wcslen (PR #108985)

Henry Jiang via llvm-commits llvm-commits at lists.llvm.org
Sun Feb 2 16:02:15 PST 2025


https://github.com/mustartt updated https://github.com/llvm/llvm-project/pull/108985

>From c37e74fa74e02c6693c93abdefcb225cd9d03e9d Mon Sep 17 00:00:00 2001
From: Henry Jiang <henry.jiang1 at ibm.com>
Date: Sat, 7 Sep 2024 22:27:50 -0400
Subject: [PATCH 01/13] Initial upstreaming of strlen8 LIR 1 out of 3

---
 .../Transforms/Scalar/LoopIdiomRecognize.h    |   3 +
 .../Transforms/Scalar/LoopIdiomRecognize.cpp  | 296 +++++++++++++++++-
 llvm/test/Transforms/LoopIdiom/strlen.ll      | 149 +++++++++
 3 files changed, 445 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopIdiom/strlen.ll

diff --git a/llvm/include/llvm/Transforms/Scalar/LoopIdiomRecognize.h b/llvm/include/llvm/Transforms/Scalar/LoopIdiomRecognize.h
index 0c6406d8618518..3a9f016ce9bd60 100644
--- a/llvm/include/llvm/Transforms/Scalar/LoopIdiomRecognize.h
+++ b/llvm/include/llvm/Transforms/Scalar/LoopIdiomRecognize.h
@@ -34,6 +34,9 @@ struct DisableLIRP {
 
   /// When true, Memcpy is disabled.
   static bool Memcpy;
+
+  /// When true, Strlen is disabled.
+  static bool Strlen;
 };
 
 /// Performs Loop Idiom Recognize Pass.
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 05cf638d3f09df..1bcf7025cc1259 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -97,6 +97,7 @@ using namespace llvm;
 STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
 STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
 STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
+STATISTIC(NumStrLen, "Number of strlen's formed from loop loads");
 STATISTIC(
     NumShiftUntilBitTest,
     "Number of uncountable loops recognized as 'shift until bitttest' idiom");
@@ -126,6 +127,14 @@ static cl::opt<bool, true>
                       cl::location(DisableLIRP::Memcpy), cl::init(false),
                       cl::ReallyHidden);
 
+bool DisableLIRP::Strlen;
+static cl::opt<bool, true>
+    DisableLIRPStrlen("disable-" DEBUG_TYPE "-strlen",
+                      cl::desc("Proceed with loop idiom recognize pass, but do "
+                               "not convert loop(s) to strlen."),
+                      cl::location(DisableLIRP::Strlen), cl::init(false),
+                      cl::ReallyHidden);
+
 static cl::opt<bool> UseLIRCodeSizeHeurs(
     "use-lir-code-size-heurs",
     cl::desc("Use loop idiom recognition code size heuristics when compiling"
@@ -246,6 +255,7 @@ class LoopIdiomRecognize {
 
   bool recognizeShiftUntilBitTest();
   bool recognizeShiftUntilZero();
+  bool recognizeAndInsertStrLen();
 
   /// @}
 };
@@ -1507,9 +1517,11 @@ static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
   if (!Cond)
     return nullptr;
 
-  ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
-  if (!CmpZero || !CmpZero->isZero())
-    return nullptr;
+  if (!isa<ConstantPointerNull>(Cond->getOperand(1))) {
+    ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
+    if (!CmpZero || !CmpZero->isZero())
+      return nullptr;
+  }
 
   BasicBlock *TrueSucc = BI->getSuccessor(0);
   BasicBlock *FalseSucc = BI->getSuccessor(1);
@@ -1524,6 +1536,284 @@ static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
   return nullptr;
 }
 
+/// getCandidateResInstr - If there is strlen calculated, return the Result
+/// instruction based on the \p OpWidth passed, else return nullptr
+static Instruction *getCandidateResInstr(Instruction *EndAddress,
+                                         Value *StartAddress,
+                                         unsigned OpWidth) {
+  using namespace llvm::PatternMatch;
+
+  assert(StartAddress && "Valid start address required.");
+
+  // lambda expression to check that the instruction has a single user
+  auto GetSingleUser = [](Instruction *I) -> User * {
+    if (I->hasOneUse())
+      return *I->user_begin();
+    return nullptr;
+  };
+
+  // The pointer to the end address should only have one use which is a pointer
+  // to int instruction.
+  auto *TmpUser = GetSingleUser(EndAddress);
+  if (!TmpUser)
+    return nullptr;
+
+  if (PtrToIntInst *PToI = dyn_cast<PtrToIntInst>(TmpUser)) {
+    // The only user of the PtrToIntInst should be the sub instruction that
+    // calculates the difference b/w the two pointer operands.
+    TmpUser = GetSingleUser(PToI);
+    if (!TmpUser)
+      return nullptr;
+    Instruction *Inst = dyn_cast<Instruction>(TmpUser);
+
+    if (!Inst || Inst->getOpcode() != Instruction::Sub ||
+        Inst->getOperand(0) != PToI)
+      return nullptr;
+    Value *MatchAddr;
+    if (match(Inst->getOperand(1), m_PtrToInt(m_Value(MatchAddr)))) {
+      if (MatchAddr != StartAddress)
+        return nullptr;
+
+      // We found the candidate sub instruction
+      switch (OpWidth) {
+      case 8:
+        return Inst;
+      default:
+        return nullptr;
+      }
+    }
+  }
+
+  return nullptr;
+}
+
+/// Recognizes a strlen idiom by checking for loops that increment
+/// a char pointer and then subtract with the base pointer.
+///
+/// If detected, transforms the relevant code to a strlen function
+/// call, and returns true; otherwise, returns false.
+///
+/// The core idiom we are trying to detect is:
+/// \code
+///     if (str == NULL)
+///       goto loop-exit // the precondition of the loop
+///     start = str;
+///     do {
+///       str++;
+///     } while(*str!='\0');
+///     return (str - start);
+/// loop-exit:
+/// \endcode
+///
+/// The transformed output is similar to below c-code:
+/// \code
+///     if (str == NULL)
+///       goto loop-exit // the precondition of the loop
+///     return strlen(str);
+/// \endcode
+bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
+  if (DisableLIRPStrlen)
+    return false;
+
+  // Give up if the loop has multiple blocks or multiple backedges.
+  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+    return false;
+
+  // It should have a preheader containing nothing but an unconditional branch.
+  auto *Pre = CurLoop->getLoopPreheader();
+  if (!Pre || &Pre->front() != Pre->getTerminator())
+    return false;
+
+  auto *EntryBI = dyn_cast<BranchInst>(Pre->getTerminator());
+  if (!EntryBI || EntryBI->isConditional())
+    return false;
+
+  // It should have a precondition block
+  auto *PreCondBB = Pre->getSinglePredecessor();
+  if (!PreCondBB)
+    return false;
+
+  // The precondition terminator instruction should skip the loop body based on
+  // an icmp with zero/null.
+  if (!matchCondition(dyn_cast<BranchInst>(PreCondBB->getTerminator()), Pre))
+    return false;
+
+  // The loop exit must be conditioned on an icmp with 0.
+  // The icmp operand has to be a load on some SSA reg that increments
+  // by 1 in the loop.
+  auto *LoopBody = *(CurLoop->block_begin());
+  auto *LoopTerm = dyn_cast<BranchInst>(LoopBody->getTerminator());
+  auto *LoopCond = matchCondition(LoopTerm, LoopBody);
+
+  if (!LoopCond)
+    return false;
+
+  auto *LoopLoad = dyn_cast<LoadInst>(LoopCond);
+  if (!LoopLoad || LoopLoad->getPointerAddressSpace() != 0)
+    return false;
+
+  Type *OperandType = LoopLoad->getType();
+  if (!OperandType || !OperandType->isIntegerTy())
+    return false;
+
+  // See if the pointer expression is an AddRec with step 1 ({n,+,1}) on
+  // the loop, indicating strlen calculation.
+  auto *IncPtr = LoopLoad->getPointerOperand();
+  const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IncPtr));
+  if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
+    return false;
+
+  const SCEVConstant *Step =
+      dyn_cast<SCEVConstant>(LoadEv->getStepRecurrence(*SE));
+  if (!Step)
+    return false;
+
+  unsigned int ConstIntValue = 0;
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(Step->getValue()))
+    ConstIntValue = CI->getZExtValue();
+
+  unsigned OpWidth = OperandType->getIntegerBitWidth();
+  if (OpWidth != ConstIntValue * 8)
+    return false;
+  if (OpWidth != 8)
+    return false;
+
+  // Scan every instruction in the loop to ensure there are no side effects.
+  for (auto &I : *LoopBody)
+    if (I.mayHaveSideEffects())
+      return false;
+
+  auto *LoopExitBB = CurLoop->getExitBlock();
+  if (!LoopExitBB)
+    return false;
+
+  // Check that the loop exit block is valid:
+  // It needs to have exactly one LCSSA Phi which is an AddRec.
+  PHINode *LCSSAPhi = nullptr;
+  for (PHINode &PN : LoopExitBB->phis()) {
+    if (!LCSSAPhi && PN.getNumIncomingValues() == 1)
+      LCSSAPhi = &PN;
+    else
+      return false;
+  }
+
+  if (!LCSSAPhi || !SE->isSCEVable(LCSSAPhi->getType()))
+    return false;
+
+  if (LCSSAPhi->getIncomingValueForBlock(LoopBody) !=
+      LoopLoad->getPointerOperand())
+    return false;
+
+  const SCEVAddRecExpr *LCSSAEv =
+      dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LCSSAPhi->getIncomingValue(0)));
+
+  if (!LCSSAEv || !dyn_cast<SCEVUnknown>(SE->getPointerBase(LCSSAEv)) ||
+      !LCSSAEv->isAffine())
+    return false;
+
+  // We can now expand the base of the str
+  IRBuilder<> Builder(Pre->getTerminator());
+
+  PHINode *LoopPhi = &*LoopBody->phis().begin();
+  if (!LoopPhi || ++LoopBody->phis().begin() != LoopBody->phis().end())
+    return false;
+  Value *PreVal = LoopBody->phis().begin()->getIncomingValueForBlock(Pre);
+  if (!PreVal)
+    return false;
+
+  Value *Expanded = nullptr;
+  if (auto *GEP = dyn_cast<GetElementPtrInst>(LoopLoad->getPointerOperand())) {
+    if (GEP->getPointerOperand() != LoopPhi)
+      return false;
+    GetElementPtrInst *NewGEP =
+        GetElementPtrInst::Create(GEP->getSourceElementType(), PreVal,
+                                  SmallVector<Value *, 4>(GEP->indices()),
+                                  "newgep", Pre->getTerminator());
+    Expanded = NewGEP;
+  } else if (LoopLoad->getPointerOperand() == LoopPhi)
+    Expanded = PreVal;
+  if (!Expanded)
+    return false;
+
+  // Check that the LoopExitBB is calculating the string length and identify
+  // the instruction that has the string length calculation
+  Instruction *ResInst = getCandidateResInstr(LCSSAPhi, PreVal, OpWidth);
+  if (!ResInst)
+    return false;
+
+  // Ensure that the GEP has the correct index if the pointer was modified.
+  // This can happen when the pointer in the user code, outside the loop,
+  // walks past a certain pre-checked index of the string.
+  if (auto *GEP = dyn_cast<GEPOperator>(Expanded)) {
+    if (GEP->getNumOperands() != 2)
+      return false;
+
+    ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
+    if (!I0)
+      return false;
+
+    int64_t Index = I0->getSExtValue(); // GEP index
+    auto *SAdd = dyn_cast<SCEVAddExpr>(LoadEv->getStart());
+    if (!SAdd || SAdd->getNumOperands() != 2)
+      return false;
+
+    auto *SAdd0 = dyn_cast<SCEVConstant>(SAdd->getOperand(0));
+    if (!SAdd0)
+      return false;
+
+    ConstantInt *CInt = SAdd0->getValue(); // SCEV index
+    assert(CInt && "Expecting CInt to be valid.");
+    int64_t Offset = CInt->getSExtValue();
+
+    // Update the index based on the Offset
+    assert((Offset * 8) % GEP->getSourceElementType()->getIntegerBitWidth() ==
+               0 &&
+           "Invalid offset");
+    int64_t NewIndex =
+        (Offset * 8) / GEP->getSourceElementType()->getIntegerBitWidth() -
+        Index;
+    Value *NewIndexVal =
+        ConstantInt::get(GEP->getOperand(1)->getType(), NewIndex);
+    GEP->setOperand(1, NewIndexVal);
+  }
+
+  Value *StrLenFunc = nullptr;
+  switch (OpWidth) {
+  case 8:
+    StrLenFunc = emitStrLen(Expanded, Builder, *DL, TLI);
+    break;
+  }
+
+  assert(StrLenFunc && "Failed to emit strlen function.");
+
+  // Replace the subtraction instruction by the result of strlen
+  ResInst->replaceAllUsesWith(StrLenFunc);
+
+  // Remove the loop-exit branch and delete dead instructions
+  RecursivelyDeleteTriviallyDeadInstructions(ResInst, TLI);
+
+  ConstantInt *NewLoopCond = LoopTerm->getSuccessor(0) == LoopBody
+                                 ? Builder.getFalse()
+                                 : Builder.getTrue();
+  LoopTerm->setCondition(NewLoopCond);
+
+  deleteDeadInstruction(cast<Instruction>(LoopCond));
+  deleteDeadInstruction(cast<Instruction>(IncPtr));
+  SE->forgetLoop(CurLoop);
+
+  LLVM_DEBUG(dbgs() << "  Formed strlen: " << *StrLenFunc << "\n");
+
+  ORE.emit([&]() {
+    return OptimizationRemark(DEBUG_TYPE, "recognizeAndInsertStrLen",
+                              CurLoop->getStartLoc(), Pre)
+           << "Transformed pointer difference into a call to strlen() function";
+  });
+
+  ++NumStrLen;
+
+  return true;
+}
+
 /// Check if the given conditional branch is based on an unsigned less-than
 /// comparison between a variable and a constant, and if the comparison is false
 /// the control yields to the loop entry. If the branch matches the behaviour,
diff --git a/llvm/test/Transforms/LoopIdiom/strlen.ll b/llvm/test/Transforms/LoopIdiom/strlen.ll
new file mode 100644
index 00000000000000..641fce0da8b785
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/strlen.ll
@@ -0,0 +1,149 @@
+; RUN: opt -passes='loop-idiom' < %s -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+define i64 @valid_strlen_i8_test1(ptr %Str) {
+; CHECK-LABEL: @valid_strlen_i8_test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq ptr [[STR:%.*]], null
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]]
+; CHECK:       lor.lhs.false:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[STR]], align 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[CLEANUP]], label [[FOR_INC_PREHEADER:%.*]]
+; CHECK:       for.inc.preheader:
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[STR]], i64 0
+; CHECK-NEXT:    [[STRLEN:%.*]] = call i64 @strlen(ptr [[SCEVGEP]])
+; CHECK-NEXT:    br label [[FOR_INC:%.*]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[SRC_09:%.*]] = phi ptr [ poison, [[FOR_INC]] ], [ [[STR]], [[FOR_INC_PREHEADER]] ]
+; CHECK-NEXT:    [[TOBOOL2:%.*]] = icmp eq i8 poison, 0
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[FOR_INC]]
+; CHECK:       for.end:
+; CHECK-NEXT:    br label [[CLEANUP]]
+; CHECK:       cleanup:
+; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi i64 [ [[STRLEN]], [[FOR_END]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[LOR_LHS_FALSE]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL_0]]
+;
+entry:
+  %tobool = icmp eq ptr %Str, null
+  br i1 %tobool, label %cleanup, label %lor.lhs.false
+
+lor.lhs.false:                                    ; preds = %entry
+  %0 = load i8, ptr %Str, align 1
+  %cmp = icmp eq i8 %0, 0
+  br i1 %cmp, label %cleanup, label %for.inc
+
+for.inc:                                          ; preds = %lor.lhs.false, %for.inc
+  %Src.09 = phi ptr [ %incdec.ptr, %for.inc ], [ %Str, %lor.lhs.false ]
+  %incdec.ptr = getelementptr inbounds i8, ptr %Src.09, i64 1
+  %.pr = load i8, ptr %incdec.ptr, align 1
+  %tobool2 = icmp eq i8 %.pr, 0
+  br i1 %tobool2, label %for.end, label %for.inc
+
+for.end:                                          ; preds = %for.inc
+  %sub.ptr.lhs.cast = ptrtoint ptr %incdec.ptr to i64
+  %sub.ptr.rhs.cast = ptrtoint ptr %Str to i64
+  %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
+  br label %cleanup
+
+cleanup:                                          ; preds = %lor.lhs.false, %entry, %for.end
+  %retval.0 = phi i64 [ %sub.ptr.sub, %for.end ], [ 0, %entry ], [ 0, %lor.lhs.false ]
+  ret i64 %retval.0
+}
+
+define i64 @valid_strlen_i8_test2(ptr %Str) {
+; CHECK-LABEL: @valid_strlen_i8_test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq ptr [[STR:%.*]], null
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[CLEANUP:%.*]], label [[FOR_COND_PREHEADER:%.*]]
+; CHECK:       for.cond.preheader:
+; CHECK-NEXT:    [[STRLEN:%.*]] = call i64 @strlen(ptr [[STR]])
+; CHECK-NEXT:    br label [[FOR_COND:%.*]]
+; CHECK:       for.cond:
+; CHECK-NEXT:    [[TOBOOL1:%.*]] = icmp eq i8 poison, 0
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i8, ptr poison, i64 1
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[FOR_COND]]
+; CHECK:       for.end:
+; CHECK-NEXT:    br label [[CLEANUP]]
+; CHECK:       cleanup:
+; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi i64 [ [[STRLEN]], [[FOR_END]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL_0]]
+;
+entry:
+  %tobool = icmp eq ptr %Str, null
+  br i1 %tobool, label %cleanup, label %for.cond
+
+for.cond:                                         ; preds = %entry, %for.cond
+  %Src.0 = phi ptr [ %incdec.ptr, %for.cond ], [ %Str, %entry ]
+  %0 = load i8, ptr %Src.0, align 1
+  %tobool1 = icmp eq i8 %0, 0
+  %incdec.ptr = getelementptr inbounds i8, ptr %Src.0, i64 1
+  br i1 %tobool1, label %for.end, label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %sub.ptr.lhs.cast = ptrtoint ptr %Src.0 to i64
+  %sub.ptr.rhs.cast = ptrtoint ptr %Str to i64
+  %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
+  br label %cleanup
+
+  cleanup:                                          ; preds = %entry, %for.end
+  %retval.0 = phi i64 [ %sub.ptr.sub, %for.end ], [ 0, %entry ]
+  ret i64 %retval.0
+}
+
+define void @invalid_strlen_i8_test3(ptr %s, i32 zeroext %i) {
+; CHECK-LABEL: @invalid_strlen_i8_test3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[WHILE_COND:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[S_ADDR_0:%.*]] = phi ptr [ [[S:%.*]], [[ENTRY:%.*]] ], [ [[INCDEC_PTR1:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[S_ADDR_0]], align 1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; CHECK-NEXT:    [[INCDEC_PTR1]] = getelementptr inbounds i8, ptr [[S_ADDR_0]], i64 1
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[WHILE_END:%.*]], label [[WHILE_COND]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[S_ADDR_0_LCSSA:%.*]] = phi ptr [ [[S_ADDR_0]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[INCDEC_PTR1_LCSSA:%.*]] = phi ptr [ [[INCDEC_PTR1]], [[WHILE_COND]] ]
+; CHECK-NEXT:    store i8 45, ptr [[S_ADDR_0_LCSSA]], align 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[I:%.*]], 10
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i8 65, ptr [[INCDEC_PTR1_LCSSA]], align 1
+; CHECK-NEXT:    br label [[IF_END9:%.*]]
+; CHECK:       if.end:
+; CHECK-NEXT:    store i8 66, ptr [[INCDEC_PTR1_LCSSA]], align 1
+; CHECK-NEXT:    br label [[IF_END9]]
+; CHECK:       if.end9:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %s.addr.0 = phi ptr [ %s, %entry ], [ %incdec.ptr1, %while.cond ]
+  %0 = load i8, ptr %s.addr.0, align 1
+  %tobool.not = icmp eq i8 %0, 0
+  %incdec.ptr1 = getelementptr inbounds i8, ptr %s.addr.0, i64 1
+  br i1 %tobool.not, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  %s.addr.0.lcssa = phi ptr [ %s.addr.0, %while.cond ]
+  %incdec.ptr1.lcssa = phi ptr [ %incdec.ptr1, %while.cond ]
+  store i8 45, ptr %s.addr.0.lcssa, align 1
+  %cmp = icmp ult i32 %i, 10
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %while.end
+  store i8 65, ptr %incdec.ptr1.lcssa, align 1
+  br label %if.end9
+
+if.end:                                           ; preds = %while.end
+  store i8 66, ptr %incdec.ptr1.lcssa, align 1
+  br label %if.end9
+
+if.end9:                                          ; preds = %if.end, %if.then
+  ret void
+}
+

>From 46af91d4b02e1e75d3c321f377dc7b9033820dfe Mon Sep 17 00:00:00 2001
From: Henry Jiang <henry.jiang1 at ibm.com>
Date: Sun, 8 Sep 2024 13:17:03 -0400
Subject: [PATCH 02/13] enable strlen insert

---
 llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 1bcf7025cc1259..cbc5ed40947d79 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -1499,7 +1499,7 @@ bool LoopIdiomRecognize::runOnNoncountableLoop() {
 
   return recognizePopcount() || recognizeAndInsertFFS() ||
          recognizeShiftUntilBitTest() || recognizeShiftUntilZero() ||
-         recognizeShiftUntilLessThan();
+         recognizeShiftUntilLessThan() || recognizeAndInsertStrLen();
 }
 
 /// Check if the given conditional branch is based on the comparison between

>From 65416534f79b1645df771f24ec407a1ba9919aa7 Mon Sep 17 00:00:00 2001
From: Henry Jiang <henry.jiang1 at ibm.com>
Date: Thu, 12 Sep 2024 16:39:07 -0400
Subject: [PATCH 03/13] replace LCSSA with null term ptr

---
 .../Transforms/Scalar/LoopIdiomRecognize.cpp  | 125 +++++-------------
 1 file changed, 32 insertions(+), 93 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index cbc5ed40947d79..20ecc2d83b2b05 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -33,6 +33,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
@@ -1536,57 +1537,6 @@ static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
   return nullptr;
 }
 
-/// getCandidateResInstr - If there is strlen calculated, return the Result
-/// instruction based on the \p OpWidth passed, else return nullptr
-static Instruction *getCandidateResInstr(Instruction *EndAddress,
-                                         Value *StartAddress,
-                                         unsigned OpWidth) {
-  using namespace llvm::PatternMatch;
-
-  assert(StartAddress && "Valid start address required.");
-
-  // lambda expression to check that the instruction has a single user
-  auto GetSingleUser = [](Instruction *I) -> User * {
-    if (I->hasOneUse())
-      return *I->user_begin();
-    return nullptr;
-  };
-
-  // The pointer to the end address should only have one use which is a pointer
-  // to int instruction.
-  auto *TmpUser = GetSingleUser(EndAddress);
-  if (!TmpUser)
-    return nullptr;
-
-  if (PtrToIntInst *PToI = dyn_cast<PtrToIntInst>(TmpUser)) {
-    // The only user of the PtrToIntInst should be the sub instruction that
-    // calculates the difference b/w the two pointer operands.
-    TmpUser = GetSingleUser(PToI);
-    if (!TmpUser)
-      return nullptr;
-    Instruction *Inst = dyn_cast<Instruction>(TmpUser);
-
-    if (!Inst || Inst->getOpcode() != Instruction::Sub ||
-        Inst->getOperand(0) != PToI)
-      return nullptr;
-    Value *MatchAddr;
-    if (match(Inst->getOperand(1), m_PtrToInt(m_Value(MatchAddr)))) {
-      if (MatchAddr != StartAddress)
-        return nullptr;
-
-      // We found the candidate sub instruction
-      switch (OpWidth) {
-      case 8:
-        return Inst;
-      default:
-        return nullptr;
-      }
-    }
-  }
-
-  return nullptr;
-}
-
 /// Recognizes a strlen idiom by checking for loops that increment
 /// a char pointer and then subtract with the base pointer.
 ///
@@ -1595,22 +1545,19 @@ static Instruction *getCandidateResInstr(Instruction *EndAddress,
 ///
 /// The core idiom we are trying to detect is:
 /// \code
-///     if (str == NULL)
-///       goto loop-exit // the precondition of the loop
 ///     start = str;
 ///     do {
 ///       str++;
-///     } while(*str!='\0');
-///     return (str - start);
-/// loop-exit:
+///     } while(*str != '\0');
 /// \endcode
 ///
 /// The transformed output is similar to below c-code:
 /// \code
-///     if (str == NULL)
-///       goto loop-exit // the precondition of the loop
-///     return strlen(str);
+///     str = start + strlen(start)
+///     len = str - start
 /// \endcode
+///
+/// Later the pointer subtraction will be folded by InstCombine
 bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
   if (DisableLIRPStrlen)
     return false;
@@ -1620,30 +1567,20 @@ bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
     return false;
 
   // It should have a preheader containing nothing but an unconditional branch.
-  auto *Pre = CurLoop->getLoopPreheader();
-  if (!Pre || &Pre->front() != Pre->getTerminator())
+  auto *Preheader = CurLoop->getLoopPreheader();
+  if (!Preheader || &Preheader->front() != Preheader->getTerminator())
     return false;
 
-  auto *EntryBI = dyn_cast<BranchInst>(Pre->getTerminator());
+  auto *EntryBI = dyn_cast<BranchInst>(Preheader->getTerminator());
   if (!EntryBI || EntryBI->isConditional())
     return false;
 
-  // It should have a precondition block
-  auto *PreCondBB = Pre->getSinglePredecessor();
-  if (!PreCondBB)
-    return false;
-
-  // The precondition terminator instruction should skip the loop body based on
-  // an icmp with zero/null.
-  if (!matchCondition(dyn_cast<BranchInst>(PreCondBB->getTerminator()), Pre))
-    return false;
-
   // The loop exit must be conditioned on an icmp with 0.
   // The icmp operand has to be a load on some SSA reg that increments
   // by 1 in the loop.
-  auto *LoopBody = *(CurLoop->block_begin());
-  auto *LoopTerm = dyn_cast<BranchInst>(LoopBody->getTerminator());
-  auto *LoopCond = matchCondition(LoopTerm, LoopBody);
+  BasicBlock *LoopBody = *CurLoop->block_begin();
+  BranchInst *LoopTerm = dyn_cast<BranchInst>(LoopBody->getTerminator());
+  Value *LoopCond = matchCondition(LoopTerm, LoopBody);
 
   if (!LoopCond)
     return false;
@@ -1660,6 +1597,7 @@ bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
   // the loop, indicating strlen calculation.
   auto *IncPtr = LoopLoad->getPointerOperand();
   const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IncPtr));
+
   if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
     return false;
 
@@ -1700,6 +1638,7 @@ bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
   if (!LCSSAPhi || !SE->isSCEVable(LCSSAPhi->getType()))
     return false;
 
+  // This matched the pointer version of the idiom
   if (LCSSAPhi->getIncomingValueForBlock(LoopBody) !=
       LoopLoad->getPointerOperand())
     return false;
@@ -1712,35 +1651,34 @@ bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
     return false;
 
   // We can now expand the base of the str
-  IRBuilder<> Builder(Pre->getTerminator());
+  IRBuilder<> Builder(Preheader->getTerminator());
 
-  PHINode *LoopPhi = &*LoopBody->phis().begin();
-  if (!LoopPhi || ++LoopBody->phis().begin() != LoopBody->phis().end())
+  auto LoopPhiRange = LoopBody->phis();
+  if (!hasNItems(LoopPhiRange, 1))
     return false;
-  Value *PreVal = LoopBody->phis().begin()->getIncomingValueForBlock(Pre);
+  auto *LoopPhi = &*LoopPhiRange.begin();
+  Value *PreVal = LoopPhi->getIncomingValueForBlock(Preheader);
   if (!PreVal)
     return false;
 
   Value *Expanded = nullptr;
+  Type *ExpandedType = nullptr;
   if (auto *GEP = dyn_cast<GetElementPtrInst>(LoopLoad->getPointerOperand())) {
     if (GEP->getPointerOperand() != LoopPhi)
       return false;
     GetElementPtrInst *NewGEP =
         GetElementPtrInst::Create(GEP->getSourceElementType(), PreVal,
                                   SmallVector<Value *, 4>(GEP->indices()),
-                                  "newgep", Pre->getTerminator());
+                                  "newgep", Preheader->getTerminator());
     Expanded = NewGEP;
-  } else if (LoopLoad->getPointerOperand() == LoopPhi)
+    ExpandedType = NewGEP->getSourceElementType();
+  } else if (LoopLoad->getPointerOperand() == LoopPhi) {
     Expanded = PreVal;
+    ExpandedType = LoopLoad->getType();
+  }
   if (!Expanded)
     return false;
 
-  // Check that the LoopExitBB is calculating the string length and identify
-  // the instruction that has the string length calculation
-  Instruction *ResInst = getCandidateResInstr(LCSSAPhi, PreVal, OpWidth);
-  if (!ResInst)
-    return false;
-
   // Ensure that the GEP has the correct index if the pointer was modified.
   // This can happen when the pointer in the user code, outside the loop,
   // walks past a certain pre-checked index of the string.
@@ -1786,11 +1724,12 @@ bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
 
   assert(StrLenFunc && "Failed to emit strlen function.");
 
-  // Replace the subtraction instruction by the result of strlen
-  ResInst->replaceAllUsesWith(StrLenFunc);
-
-  // Remove the loop-exit branch and delete dead instructions
-  RecursivelyDeleteTriviallyDeadInstructions(ResInst, TLI);
+  // Replace LCSSA Phi use with new pointer to the null terminator
+  SmallVector<Value *, 4> NewBaseIndex{StrLenFunc};
+  GetElementPtrInst *NewEndPtr = GetElementPtrInst::Create(
+      ExpandedType, Expanded, NewBaseIndex, "end", Preheader->getTerminator());
+  LCSSAPhi->replaceAllUsesWith(NewEndPtr);
+  RecursivelyDeleteDeadPHINode(LCSSAPhi);
 
   ConstantInt *NewLoopCond = LoopTerm->getSuccessor(0) == LoopBody
                                  ? Builder.getFalse()
@@ -1805,7 +1744,7 @@ bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
 
   ORE.emit([&]() {
     return OptimizationRemark(DEBUG_TYPE, "recognizeAndInsertStrLen",
-                              CurLoop->getStartLoc(), Pre)
+                              CurLoop->getStartLoc(), Preheader)
            << "Transformed pointer difference into a call to strlen() function";
   });
 

>From b09384485510cd78c3c5332bed2d900f5af7ae08 Mon Sep 17 00:00:00 2001
From: Henry Jiang <henry.jiang1 at ibm.com>
Date: Thu, 12 Sep 2024 18:38:19 -0400
Subject: [PATCH 04/13] update tests

---
 llvm/test/Transforms/LoopIdiom/strlen.ll | 396 +++++++++++++++--------
 1 file changed, 270 insertions(+), 126 deletions(-)

diff --git a/llvm/test/Transforms/LoopIdiom/strlen.ll b/llvm/test/Transforms/LoopIdiom/strlen.ll
index 641fce0da8b785..43ed9d0980bc49 100644
--- a/llvm/test/Transforms/LoopIdiom/strlen.ll
+++ b/llvm/test/Transforms/LoopIdiom/strlen.ll
@@ -1,149 +1,293 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -passes='loop-idiom' < %s -S | FileCheck %s
 
-target datalayout = "e-m:e-i64:64-n32:64"
-target triple = "powerpc64le-unknown-linux-gnu"
-
-define i64 @valid_strlen_i8_test1(ptr %Str) {
-; CHECK-LABEL: @valid_strlen_i8_test1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq ptr [[STR:%.*]], null
-; CHECK-NEXT:    br i1 [[TOBOOL]], label [[CLEANUP:%.*]], label [[LOR_LHS_FALSE:%.*]]
-; CHECK:       lor.lhs.false:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[STR]], align 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[TMP0]], 0
-; CHECK-NEXT:    br i1 [[CMP]], label [[CLEANUP]], label [[FOR_INC_PREHEADER:%.*]]
-; CHECK:       for.inc.preheader:
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[STR]], i64 0
-; CHECK-NEXT:    [[STRLEN:%.*]] = call i64 @strlen(ptr [[SCEVGEP]])
-; CHECK-NEXT:    br label [[FOR_INC:%.*]]
-; CHECK:       for.inc:
-; CHECK-NEXT:    [[SRC_09:%.*]] = phi ptr [ poison, [[FOR_INC]] ], [ [[STR]], [[FOR_INC_PREHEADER]] ]
-; CHECK-NEXT:    [[TOBOOL2:%.*]] = icmp eq i8 poison, 0
-; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[FOR_INC]]
-; CHECK:       for.end:
-; CHECK-NEXT:    br label [[CLEANUP]]
-; CHECK:       cleanup:
-; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi i64 [ [[STRLEN]], [[FOR_END]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[LOR_LHS_FALSE]] ]
-; CHECK-NEXT:    ret i64 [[RETVAL_0]]
+declare void @use(ptr)
+
+define i64 @valid_strlen_1(ptr %0) {
+; CHECK-LABEL: define i64 @valid_strlen_1(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[STRLEN:%.*]] = call i64 @strlen(ptr [[TMP0]])
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = getelementptr i8, ptr [[TMP0]], i64 [[STRLEN]]
+; CHECK-NEXT:    br label %[[BB2:.*]]
+; CHECK:       [[BB2]]:
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i8 poison, 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr poison, i64 1
+; CHECK-NEXT:    br i1 true, label %[[BB5:.*]], label %[[BB2]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[DOTLCSSA]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP14:%.*]] = sub i64 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    ret i64 [[TMP14]]
 ;
-entry:
-  %tobool = icmp eq ptr %Str, null
-  br i1 %tobool, label %cleanup, label %lor.lhs.false
-
-lor.lhs.false:                                    ; preds = %entry
-  %0 = load i8, ptr %Str, align 1
-  %cmp = icmp eq i8 %0, 0
-  br i1 %cmp, label %cleanup, label %for.inc
-
-for.inc:                                          ; preds = %lor.lhs.false, %for.inc
-  %Src.09 = phi ptr [ %incdec.ptr, %for.inc ], [ %Str, %lor.lhs.false ]
-  %incdec.ptr = getelementptr inbounds i8, ptr %Src.09, i64 1
-  %.pr = load i8, ptr %incdec.ptr, align 1
-  %tobool2 = icmp eq i8 %.pr, 0
-  br i1 %tobool2, label %for.end, label %for.inc
-
-for.end:                                          ; preds = %for.inc
-  %sub.ptr.lhs.cast = ptrtoint ptr %incdec.ptr to i64
-  %sub.ptr.rhs.cast = ptrtoint ptr %Str to i64
-  %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
-  br label %cleanup
+  br label %2
 
-cleanup:                                          ; preds = %lor.lhs.false, %entry, %for.end
-  %retval.0 = phi i64 [ %sub.ptr.sub, %for.end ], [ 0, %entry ], [ 0, %lor.lhs.false ]
-  ret i64 %retval.0
+2:                                                ; preds = %2, %1
+  %3 = phi ptr [ %0, %1 ], [ %6, %2 ]
+  %4 = load i8, ptr %3, align 1
+  %5 = icmp eq i8 %4, 0
+  %6 = getelementptr inbounds i8, ptr %3, i64 1
+  br i1 %5, label %7, label %2
+
+7:                                                ; preds = %2
+  %8 = ptrtoint ptr %3 to i64
+  %9 = ptrtoint ptr %0 to i64
+  %10 = sub i64 %8, %9
+  ret i64 %10
 }
 
-define i64 @valid_strlen_i8_test2(ptr %Str) {
-; CHECK-LABEL: @valid_strlen_i8_test2(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq ptr [[STR:%.*]], null
-; CHECK-NEXT:    br i1 [[TOBOOL]], label [[CLEANUP:%.*]], label [[FOR_COND_PREHEADER:%.*]]
-; CHECK:       for.cond.preheader:
+
+define i32 @valid_strlen_2(ptr %0) {
+; CHECK-LABEL: define i32 @valid_strlen_2(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
+; CHECK-NEXT:    br i1 [[TMP2]], label %[[BB14:.*]], label %[[BB3:.*]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[TMP0]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[BB14]], label %[[DOTPREHEADER:.*]]
+; CHECK:       [[_PREHEADER:.*:]]
+; CHECK-NEXT:    [[STR:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
 ; CHECK-NEXT:    [[STRLEN:%.*]] = call i64 @strlen(ptr [[STR]])
-; CHECK-NEXT:    br label [[FOR_COND:%.*]]
-; CHECK:       for.cond:
-; CHECK-NEXT:    [[TOBOOL1:%.*]] = icmp eq i8 poison, 0
+; CHECK-NEXT:    [[STR_ADDR_0_LCSSA:%.*]] = getelementptr i8, ptr [[STR]], i64 [[STRLEN]]
+; CHECK-NEXT:    br label %[[BB6:.*]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = phi ptr [ poison, %[[BB6]] ], [ [[TMP0]], %[[DOTPREHEADER]] ]
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i8 poison, 0
+; CHECK-NEXT:    br i1 true, label %[[BB9:.*]], label %[[BB6]]
+; CHECK:       [[BB9]]:
+; CHECK-NEXT:    [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[STR_ADDR_0_LCSSA]] to i64
+; CHECK-NEXT:    [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP0]] to i64
+; CHECK-NEXT:    [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+; CHECK-NEXT:    [[TMP13:%.*]] = trunc i64 [[SUB_PTR_SUB]] to i32
+; CHECK-NEXT:    br label %[[BB14]]
+; CHECK:       [[BB14]]:
+; CHECK-NEXT:    [[TMP15:%.*]] = phi i32 [ [[TMP13]], %[[BB9]] ], [ 0, %[[BB3]] ], [ 0, [[TMP1:%.*]] ]
+; CHECK-NEXT:    ret i32 [[TMP15]]
+;
+  %2 = icmp eq ptr %0, null
+  br i1 %2, label %16, label %3
+
+3:                                                ; preds = %1
+  %4 = load i8, ptr %0, align 1
+  %5 = icmp eq i8 %4, 0
+  br i1 %5, label %16, label %6
+
+6:                                                ; preds = %3, %6
+  %7 = phi ptr [ %8, %6 ], [ %0, %3 ]
+  %8 = getelementptr inbounds i8, ptr %7, i64 1
+  %9 = load i8, ptr %8, align 1
+  %10 = icmp eq i8 %9, 0
+  br i1 %10, label %11, label %6
+
+11:                                               ; preds = %6
+  %12 = ptrtoint ptr %8 to i64
+  %13 = ptrtoint ptr %0 to i64
+  %14 = sub i64 %12, %13
+  %15 = trunc i64 %14 to i32
+  br label %16
+
+16:                                               ; preds = %1, %3, %11
+  %17 = phi i32 [ %15, %11 ], [ 0, %3 ], [ 0, %1 ]
+  ret i32 %17
+}
+
+define i64 @valid_strlen_3(ptr %str) local_unnamed_addr #0 {
+; CHECK-LABEL: define i64 @valid_strlen_3(
+; CHECK-SAME: ptr [[STR:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  [[_PREHEADER:.*:]]
+; CHECK-NEXT:    [[STRLEN:%.*]] = call i64 @strlen(ptr [[STR]])
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[STR]], i64 [[STRLEN]]
+; CHECK-NEXT:    br label %[[WHILE_COND:.*]]
+; CHECK:       [[WHILE_COND]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i8 poison, 0
 ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i8, ptr poison, i64 1
-; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[FOR_COND]]
-; CHECK:       for.end:
-; CHECK-NEXT:    br label [[CLEANUP]]
-; CHECK:       cleanup:
-; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi i64 [ [[STRLEN]], [[FOR_END]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    ret i64 [[RETVAL_0]]
+; CHECK-NEXT:    br i1 true, label %[[WHILE_END:.*]], label %[[WHILE_COND]]
+; CHECK:       [[WHILE_END]]:
+; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[TMP0]] to i64
+; CHECK-NEXT:    [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[STR]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = sub i64 [[TMP10]], [[SUB_PTR_RHS_CAST]]
+; CHECK-NEXT:    tail call void @use(ptr [[TMP0]])
+; CHECK-NEXT:    tail call void @use(ptr [[STR]])
+; CHECK-NEXT:    ret i64 [[TMP13]]
 ;
 entry:
-  %tobool = icmp eq ptr %Str, null
-  br i1 %tobool, label %cleanup, label %for.cond
-
-for.cond:                                         ; preds = %entry, %for.cond
-  %Src.0 = phi ptr [ %incdec.ptr, %for.cond ], [ %Str, %entry ]
-  %0 = load i8, ptr %Src.0, align 1
-  %tobool1 = icmp eq i8 %0, 0
-  %incdec.ptr = getelementptr inbounds i8, ptr %Src.0, i64 1
-  br i1 %tobool1, label %for.end, label %for.cond
-
-for.end:                                          ; preds = %for.cond
-  %sub.ptr.lhs.cast = ptrtoint ptr %Src.0 to i64
-  %sub.ptr.rhs.cast = ptrtoint ptr %Str to i64
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %str.addr.0 = phi ptr [ %str, %entry ], [ %incdec.ptr, %while.cond ]
+  %0 = load i8, ptr %str.addr.0, align 1
+  %cmp.not = icmp eq i8 %0, 0
+  %incdec.ptr = getelementptr inbounds i8, ptr %str.addr.0, i64 1
+  br i1 %cmp.not, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  %sub.ptr.lhs.cast = ptrtoint ptr %str.addr.0 to i64
+  %sub.ptr.rhs.cast = ptrtoint ptr %str to i64
   %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
-  br label %cleanup
+  tail call void @use(ptr %str.addr.0)
+  tail call void @use(ptr %str)
+  ret i64 %sub.ptr.sub
+}
 
-  cleanup:                                          ; preds = %entry, %for.end
-  %retval.0 = phi i64 [ %sub.ptr.sub, %for.end ], [ 0, %entry ]
-  ret i64 %retval.0
+define i64 @valid_strlen_4(ptr %0) {
+; CHECK-LABEL: define i64 @valid_strlen_4(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
+; CHECK-NEXT:    br i1 [[TMP2]], label %[[BB10:.*]], label %[[DOTPREHEADER:.*]]
+; CHECK:       [[_PREHEADER:.*:]]
+; CHECK-NEXT:    [[NEWGEP:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
+; CHECK-NEXT:    [[STRLEN:%.*]] = call i64 @strlen(ptr [[NEWGEP]])
+; CHECK-NEXT:    [[END:%.*]] = getelementptr i8, ptr [[NEWGEP]], i64 [[STRLEN]]
+; CHECK-NEXT:    br label %[[BB3:.*]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi ptr [ poison, %[[BB3]] ], [ [[TMP0]], %[[DOTPREHEADER]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i8 poison, 0
+; CHECK-NEXT:    br i1 true, label %[[BB6:.*]], label %[[BB3]]
+; CHECK:       [[BB6]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[END]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP9:%.*]] = sub i64 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    br label %[[BB10]]
+; CHECK:       [[BB10]]:
+; CHECK-NEXT:    [[TMP11:%.*]] = phi i64 [ [[TMP9]], %[[BB6]] ], [ 0, [[TMP1:%.*]] ]
+; CHECK-NEXT:    ret i64 [[TMP11]]
+;
+  %2 = icmp eq ptr %0, null
+  br i1 %2, label %12, label %3
+
+3:                                                ; preds = %1, %3
+  %4 = phi ptr [ %5, %3 ], [ %0, %1 ]
+  %5 = getelementptr inbounds i8, ptr %4, i64 1
+  %6 = load i8, ptr %5, align 1
+  %7 = icmp eq i8 %6, 0
+  br i1 %7, label %8, label %3
+
+8:                                                ; preds = %3
+  %9 = ptrtoint ptr %5 to i64
+  %10 = ptrtoint ptr %0 to i64
+  %11 = sub i64 %9, %10
+  br label %12
+
+12:                                               ; preds = %1, %8
+  %13 = phi i64 [ %11, %8 ], [ 0, %1 ]
+  ret i64 %13
 }
 
-define void @invalid_strlen_i8_test3(ptr %s, i32 zeroext %i) {
-; CHECK-LABEL: @invalid_strlen_i8_test3(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br label [[WHILE_COND:%.*]]
-; CHECK:       while.cond:
-; CHECK-NEXT:    [[S_ADDR_0:%.*]] = phi ptr [ [[S:%.*]], [[ENTRY:%.*]] ], [ [[INCDEC_PTR1:%.*]], [[WHILE_COND]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[S_ADDR_0]], align 1
-; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
-; CHECK-NEXT:    [[INCDEC_PTR1]] = getelementptr inbounds i8, ptr [[S_ADDR_0]], i64 1
-; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[WHILE_END:%.*]], label [[WHILE_COND]]
-; CHECK:       while.end:
-; CHECK-NEXT:    [[S_ADDR_0_LCSSA:%.*]] = phi ptr [ [[S_ADDR_0]], [[WHILE_COND]] ]
-; CHECK-NEXT:    [[INCDEC_PTR1_LCSSA:%.*]] = phi ptr [ [[INCDEC_PTR1]], [[WHILE_COND]] ]
-; CHECK-NEXT:    store i8 45, ptr [[S_ADDR_0_LCSSA]], align 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[I:%.*]], 10
-; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    store i8 65, ptr [[INCDEC_PTR1_LCSSA]], align 1
-; CHECK-NEXT:    br label [[IF_END9:%.*]]
-; CHECK:       if.end:
-; CHECK-NEXT:    store i8 66, ptr [[INCDEC_PTR1_LCSSA]], align 1
-; CHECK-NEXT:    br label [[IF_END9]]
-; CHECK:       if.end9:
-; CHECK-NEXT:    ret void
+define i64 @valid_strlen_use(ptr %str) {
+; CHECK-LABEL: define i64 @valid_strlen_use(
+; CHECK-SAME: ptr [[STR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[STRLEN:%.*]] = call i64 @strlen(ptr [[STR]])
+; CHECK-NEXT:    [[END:%.*]] = getelementptr i8, ptr [[STR]], i64 [[STRLEN]]
+; CHECK-NEXT:    br label %[[WHILE_COND:.*]]
+; CHECK:       [[WHILE_COND]]:
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i8 poison, 0
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i8, ptr poison, i64 1
+; CHECK-NEXT:    br i1 true, label %[[WHILE_END:.*]], label %[[WHILE_COND]]
+; CHECK:       [[WHILE_END]]:
+; CHECK-NEXT:    [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[END]] to i64
+; CHECK-NEXT:    [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[STR]] to i64
+; CHECK-NEXT:    [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+; CHECK-NEXT:    tail call void @use(ptr noundef nonnull [[END]])
+; CHECK-NEXT:    tail call void @use(ptr noundef [[STR]])
+; CHECK-NEXT:    ret i64 [[SUB_PTR_SUB]]
 ;
 entry:
   br label %while.cond
 
 while.cond:                                       ; preds = %while.cond, %entry
-  %s.addr.0 = phi ptr [ %s, %entry ], [ %incdec.ptr1, %while.cond ]
-  %0 = load i8, ptr %s.addr.0, align 1
-  %tobool.not = icmp eq i8 %0, 0
-  %incdec.ptr1 = getelementptr inbounds i8, ptr %s.addr.0, i64 1
-  br i1 %tobool.not, label %while.end, label %while.cond
+  %str.addr.0 = phi ptr [ %str, %entry ], [ %incdec.ptr, %while.cond ]
+  %0 = load i8, ptr %str.addr.0, align 1
+  %cmp.not = icmp eq i8 %0, 0
+  %incdec.ptr = getelementptr inbounds i8, ptr %str.addr.0, i64 1
+  br i1 %cmp.not, label %while.end, label %while.cond
 
 while.end:                                        ; preds = %while.cond
-  %s.addr.0.lcssa = phi ptr [ %s.addr.0, %while.cond ]
-  %incdec.ptr1.lcssa = phi ptr [ %incdec.ptr1, %while.cond ]
-  store i8 45, ptr %s.addr.0.lcssa, align 1
-  %cmp = icmp ult i32 %i, 10
-  br i1 %cmp, label %if.then, label %if.end
-
-if.then:                                          ; preds = %while.end
-  store i8 65, ptr %incdec.ptr1.lcssa, align 1
-  br label %if.end9
-
-if.end:                                           ; preds = %while.end
-  store i8 66, ptr %incdec.ptr1.lcssa, align 1
-  br label %if.end9
-
-if.end9:                                          ; preds = %if.end, %if.then
-  ret void
+  %sub.ptr.lhs.cast = ptrtoint ptr %str.addr.0 to i64
+  %sub.ptr.rhs.cast = ptrtoint ptr %str to i64
+  %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
+  tail call void @use(ptr noundef nonnull %str.addr.0)
+  tail call void @use(ptr noundef %str)
+  ret i64 %sub.ptr.sub
 }
 
+define i64 @invalid_strlen_has_side_effect(ptr %0) {
+; CHECK-LABEL: define i64 @invalid_strlen_has_side_effect(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT:    br label %[[BB2:.*]]
+; CHECK:       [[BB2]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi ptr [ [[TMP0]], [[TMP1:%.*]] ], [ [[TMP6:%.*]], %[[BB2]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = load volatile i8, ptr [[TMP3]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP6]] = getelementptr inbounds i8, ptr [[TMP3]], i64 1
+; CHECK-NEXT:    br i1 [[TMP5]], label %[[BB7:.*]], label %[[BB2]]
+; CHECK:       [[BB7]]:
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi ptr [ [[TMP3]], %[[BB2]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[DOTLCSSA]] to i64
+; CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = sub i64 [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    ret i64 [[TMP10]]
+;
+  br label %2
+
+2:                                                ; preds = %2, %1
+  %3 = phi ptr [ %0, %1 ], [ %6, %2 ]
+  %4 = load volatile i8, ptr %3, align 1
+  %5 = icmp eq i8 %4, 0
+  %6 = getelementptr inbounds i8, ptr %3, i64 1
+  br i1 %5, label %7, label %2
+
+7:                                                ; preds = %2
+  %8 = ptrtoint ptr %3 to i64
+  %9 = ptrtoint ptr %0 to i64
+  %10 = sub i64 %8, %9
+  ret i64 %10
+}
+
+define i64 @invalid_strlen_idx_idiom(ptr %0) {
+; CHECK-LABEL: define i64 @invalid_strlen_idx_idiom(
+; CHECK-SAME: ptr [[TMP0:%.*]]) {
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[TMP0]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i8 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[BB13:.*]], label %[[DOTPREHEADER:.*]]
+; CHECK:       [[_PREHEADER:.*:]]
+; CHECK-NEXT:    br label %[[BB4:.*]]
+; CHECK:       [[BB4]]:
+; CHECK-NEXT:    [[TMP5:%.*]] = phi i32 [ [[TMP7:%.*]], %[[BB4]] ], [ 0, %[[DOTPREHEADER]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = phi ptr [ [[TMP8:%.*]], %[[BB4]] ], [ [[TMP0]], %[[DOTPREHEADER]] ]
+; CHECK-NEXT:    [[TMP7]] = add nuw nsw i32 [[TMP5]], 1
+; CHECK-NEXT:    [[TMP8]] = getelementptr inbounds i8, ptr [[TMP6]], i64 1
+; CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i8 [[TMP9]], 0
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[BB11:.*]], label %[[BB4]]
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP7]], %[[BB4]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = zext nneg i32 [[DOTLCSSA]] to i64
+; CHECK-NEXT:    br label %[[BB13]]
+; CHECK:       [[BB13]]:
+; CHECK-NEXT:    [[TMP14:%.*]] = phi i64 [ 0, [[TMP1:%.*]] ], [ [[TMP12]], %[[BB11]] ]
+; CHECK-NEXT:    ret i64 [[TMP14]]
+;
+  %2 = load i8, ptr %0, align 1
+  %3 = icmp eq i8 %2, 0
+  br i1 %3, label %13, label %4
+
+4:                                                ; preds = %1, %4
+  %5 = phi i32 [ %7, %4 ], [ 0, %1 ]
+  %6 = phi ptr [ %8, %4 ], [ %0, %1 ]
+  %7 = add nuw nsw i32 %5, 1
+  %8 = getelementptr inbounds i8, ptr %6, i64 1
+  %9 = load i8, ptr %8, align 1
+  %10 = icmp eq i8 %9, 0
+  br i1 %10, label %11, label %4
+
+11:                                               ; preds = %4
+  %12 = zext nneg i32 %7 to i64
+  br label %13
+
+13:                                               ; preds = %11, %1
+  %14 = phi i64 [ 0, %1 ], [ %12, %11 ]
+  ret i64 %14
+}
+
+

>From 2c2b30ac3cdfa8724986edc47f14dfd12793bd5a Mon Sep 17 00:00:00 2001
From: Henry Jiang <henry.jiang1 at ibm.com>
Date: Fri, 13 Sep 2024 15:12:24 -0400
Subject: [PATCH 05/13] Add wcslen idiom

---
 .../Transforms/Scalar/LoopIdiomRecognize.h    |  3 +
 .../llvm/Transforms/Utils/BuildLibCalls.h     |  6 ++
 .../Transforms/Scalar/LoopIdiomRecognize.cpp  | 36 +++++++---
 llvm/lib/Transforms/Utils/BuildLibCalls.cpp   |  9 +++
 llvm/test/Transforms/LoopIdiom/wcslen16.ll    | 66 +++++++++++++++++
 llvm/test/Transforms/LoopIdiom/wcslen32.ll    | 70 +++++++++++++++++++
 6 files changed, 181 insertions(+), 9 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopIdiom/wcslen16.ll
 create mode 100644 llvm/test/Transforms/LoopIdiom/wcslen32.ll

diff --git a/llvm/include/llvm/Transforms/Scalar/LoopIdiomRecognize.h b/llvm/include/llvm/Transforms/Scalar/LoopIdiomRecognize.h
index 3a9f016ce9bd60..241a3fc1093607 100644
--- a/llvm/include/llvm/Transforms/Scalar/LoopIdiomRecognize.h
+++ b/llvm/include/llvm/Transforms/Scalar/LoopIdiomRecognize.h
@@ -37,6 +37,9 @@ struct DisableLIRP {
 
   /// When true, Strlen is disabled.
   static bool Strlen;
+
+  /// When true, Wcslen is disabled.
+  static bool Wcslen;
 };
 
 /// Performs Loop Idiom Recognize Pass.
diff --git a/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h b/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h
index a8fb38e7260043..50f695dbe6c076 100644
--- a/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h
+++ b/llvm/include/llvm/Transforms/Utils/BuildLibCalls.h
@@ -93,6 +93,12 @@ namespace llvm {
   Value *emitStrLen(Value *Ptr, IRBuilderBase &B, const DataLayout &DL,
                     const TargetLibraryInfo *TLI);
 
+  /// Emit a call to the wcslen function to the builder, for the specified
+  /// pointer. Ptr is required to be some pointer type, and the return value has
+  /// 'size_t' type.
+  Value *emitWcsLen(Value *Ptr, IRBuilderBase &B, const DataLayout &DL,
+                    const TargetLibraryInfo *TLI);
+
   /// Emit a call to the strdup function to the builder, for the specified
   /// pointer. Ptr is required to be some pointer type, and the return value has
   /// 'i8*' type.
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 20ecc2d83b2b05..c1af2aa98fc990 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -136,6 +136,14 @@ static cl::opt<bool, true>
                       cl::location(DisableLIRP::Strlen), cl::init(false),
                       cl::ReallyHidden);
 
+bool DisableLIRP::Wcslen;
+static cl::opt<bool, true>
+    DisableLIRPWcslen("disable-" DEBUG_TYPE "-wcslen",
+                      cl::desc("Proceed with loop idiom recognize pass, but do "
+                               "not convert loop(s) to wcslen."),
+                      cl::location(DisableLIRP::Wcslen), cl::init(false),
+                      cl::ReallyHidden);
+
 static cl::opt<bool> UseLIRCodeSizeHeurs(
     "use-lir-code-size-heurs",
     cl::desc("Use loop idiom recognition code size heuristics when compiling"
@@ -1606,15 +1614,19 @@ bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
   if (!Step)
     return false;
 
-  unsigned int ConstIntValue = 0;
+  unsigned int StepSize = 0;
   if (ConstantInt *CI = dyn_cast<ConstantInt>(Step->getValue()))
-    ConstIntValue = CI->getZExtValue();
+    StepSize = CI->getZExtValue();
 
   unsigned OpWidth = OperandType->getIntegerBitWidth();
-  if (OpWidth != ConstIntValue * 8)
+  unsigned WcharSize = TLI->getWCharSize(*LoopLoad->getModule());
+  if (OpWidth != StepSize * 8)
     return false;
-  if (OpWidth != 8)
+  if (OpWidth != 8 && OpWidth != 16 && OpWidth != 32)
     return false;
+  if (OpWidth >= 16)
+    if (OpWidth != WcharSize * 8 || DisableLIRPWcslen)
+      return false;
 
   // Scan every instruction in the loop to ensure there are no side effects.
   for (auto &I : *LoopBody)
@@ -1666,12 +1678,11 @@ bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
   if (auto *GEP = dyn_cast<GetElementPtrInst>(LoopLoad->getPointerOperand())) {
     if (GEP->getPointerOperand() != LoopPhi)
       return false;
-    GetElementPtrInst *NewGEP =
-        GetElementPtrInst::Create(GEP->getSourceElementType(), PreVal,
-                                  SmallVector<Value *, 4>(GEP->indices()),
-                                  "newgep", Preheader->getTerminator());
+    GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
+        LoopLoad->getType(), PreVal, SmallVector<Value *, 4>(GEP->indices()),
+        "newgep", Preheader->getTerminator());
     Expanded = NewGEP;
-    ExpandedType = NewGEP->getSourceElementType();
+    ExpandedType = LoopLoad->getType();
   } else if (LoopLoad->getPointerOperand() == LoopPhi) {
     Expanded = PreVal;
     ExpandedType = LoopLoad->getType();
@@ -1718,8 +1729,15 @@ bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
   Value *StrLenFunc = nullptr;
   switch (OpWidth) {
   case 8:
+    if (!TLI->has(LibFunc_strlen))
+      return false;
     StrLenFunc = emitStrLen(Expanded, Builder, *DL, TLI);
     break;
+  case 16:
+  case 32:
+    if (!TLI->has(LibFunc_wcslen))
+      return false;
+    StrLenFunc = emitWcsLen(Expanded, Builder, *DL, TLI);
   }
 
   assert(StrLenFunc && "Failed to emit strlen function.");
diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
index e039457f313b29..cfda42dd7f6556 100644
--- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -1536,6 +1536,15 @@ Value *llvm::emitStrLen(Value *Ptr, IRBuilderBase &B, const DataLayout &DL,
   return emitLibCall(LibFunc_strlen, SizeTTy, CharPtrTy, Ptr, B, TLI);
 }
 
+Value *llvm::emitWcsLen(Value *Ptr, IRBuilderBase &B, const DataLayout &DL,
+                        const TargetLibraryInfo *TLI) {
+  assert(Ptr && Ptr->getType()->isPointerTy() &&
+         "Argument to wcslen intrinsic must be a pointer.");
+  Type *PtrTy = B.getPtrTy();
+  Type *SizeTTy = getSizeTTy(B, TLI);
+  return emitLibCall(LibFunc_wcslen, SizeTTy, PtrTy, Ptr, B, TLI);
+}
+
 Value *llvm::emitStrDup(Value *Ptr, IRBuilderBase &B,
                         const TargetLibraryInfo *TLI) {
   Type *CharPtrTy = B.getPtrTy();
diff --git a/llvm/test/Transforms/LoopIdiom/wcslen16.ll b/llvm/test/Transforms/LoopIdiom/wcslen16.ll
new file mode 100644
index 00000000000000..6c140ddf90d4e3
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/wcslen16.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='loop-idiom' < %s -S | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i64 @valid_strlen16(ptr %src) {
+; CHECK-LABEL: define i64 @valid_strlen16(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr [[SRC]], null
+; CHECK-NEXT:    br i1 [[CMP]], label %[[RETURN:.*]], label %[[LOR_LHS_FALSE:.*]]
+; CHECK:       [[LOR_LHS_FALSE]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[SRC]], align 2
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i16 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[CMP1]], label %[[RETURN]], label %[[WHILE_COND_PREHEADER:.*]]
+; CHECK:       [[WHILE_COND_PREHEADER]]:
+; CHECK-NEXT:    [[NEWGEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 -1
+; CHECK-NEXT:    [[WCSLEN:%.*]] = call i64 @wcslen(ptr [[NEWGEP]])
+; CHECK-NEXT:    [[END:%.*]] = getelementptr i16, ptr [[NEWGEP]], i64 [[WCSLEN]]
+; CHECK-NEXT:    br label %[[WHILE_COND:.*]]
+; CHECK:       [[WHILE_COND]]:
+; CHECK-NEXT:    [[SRC_PN:%.*]] = phi ptr [ poison, %[[WHILE_COND]] ], [ [[SRC]], %[[WHILE_COND_PREHEADER]] ]
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i16 poison, 0
+; CHECK-NEXT:    br i1 true, label %[[WHILE_END:.*]], label %[[WHILE_COND]]
+; CHECK:       [[WHILE_END]]:
+; CHECK-NEXT:    [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[END]] to i64
+; CHECK-NEXT:    [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[SRC]] to i64
+; CHECK-NEXT:    [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+; CHECK-NEXT:    [[SUB_PTR_DIV:%.*]] = ashr exact i64 [[SUB_PTR_SUB]], 1
+; CHECK-NEXT:    br label %[[RETURN]]
+; CHECK:       [[RETURN]]:
+; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi i64 [ [[SUB_PTR_DIV]], %[[WHILE_END]] ], [ 0, %[[LOR_LHS_FALSE]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL_0]]
+;
+entry:
+  %cmp = icmp eq ptr %src, null
+  br i1 %cmp, label %return, label %lor.lhs.false
+
+lor.lhs.false:                                    ; preds = %entry
+  %0 = load i16, ptr %src, align 2
+  %cmp1 = icmp eq i16 %0, 0
+  br i1 %cmp1, label %return, label %while.cond
+
+while.cond:                                       ; preds = %lor.lhs.false, %while.cond
+  %src.pn = phi ptr [ %curr.0, %while.cond ], [ %src, %lor.lhs.false ]
+  %curr.0 = getelementptr inbounds i8, ptr %src.pn, i64 2
+  %1 = load i16, ptr %curr.0, align 2
+  %tobool.not = icmp eq i16 %1, 0
+  br i1 %tobool.not, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  %sub.ptr.lhs.cast = ptrtoint ptr %curr.0 to i64
+  %sub.ptr.rhs.cast = ptrtoint ptr %src to i64
+  %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
+  %sub.ptr.div = ashr exact i64 %sub.ptr.sub, 1
+  br label %return
+
+return:                                           ; preds = %entry, %lor.lhs.false, %while.end
+  %retval.0 = phi i64 [ %sub.ptr.div, %while.end ], [ 0, %lor.lhs.false ], [ 0, %entry ]
+  ret i64 %retval.0
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"wchar_size", i32 2}
+
diff --git a/llvm/test/Transforms/LoopIdiom/wcslen32.ll b/llvm/test/Transforms/LoopIdiom/wcslen32.ll
new file mode 100644
index 00000000000000..fad4c52078967f
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/wcslen32.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='loop-idiom' < %s -S | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i64 @valid_wcslen32(ptr %src) {
+; CHECK-LABEL: define i64 @valid_wcslen32(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr [[SRC]], null
+; CHECK-NEXT:    br i1 [[CMP]], label %[[RETURN:.*]], label %[[LOR_LHS_FALSE:.*]]
+; CHECK:       [[LOR_LHS_FALSE]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[CMP1]], label %[[RETURN]], label %[[WHILE_COND_PREHEADER:.*]]
+; CHECK:       [[WHILE_COND_PREHEADER]]:
+; CHECK-NEXT:    [[NEWGEP:%.*]] = getelementptr i32, ptr [[SRC]], i64 -3
+; CHECK-NEXT:    [[WCSLEN:%.*]] = call i64 @wcslen(ptr [[NEWGEP]])
+; CHECK-NEXT:    [[END:%.*]] = getelementptr i32, ptr [[NEWGEP]], i64 [[WCSLEN]]
+; CHECK-NEXT:    br label %[[WHILE_COND:.*]]
+; CHECK:       [[WHILE_COND]]:
+; CHECK-NEXT:    [[SRC_PN:%.*]] = phi ptr [ poison, %[[WHILE_COND]] ], [ [[SRC]], %[[WHILE_COND_PREHEADER]] ]
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 poison, 0
+; CHECK-NEXT:    br i1 true, label %[[WHILE_END:.*]], label %[[WHILE_COND]]
+; CHECK:       [[WHILE_END]]:
+; CHECK-NEXT:    [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[END]] to i64
+; CHECK-NEXT:    [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[SRC]] to i64
+; CHECK-NEXT:    [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+; CHECK-NEXT:    [[SUB_PTR_DIV:%.*]] = ashr exact i64 [[SUB_PTR_SUB]], 2
+; CHECK-NEXT:    br label %[[RETURN]]
+; CHECK:       [[RETURN]]:
+; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi i64 [ [[SUB_PTR_DIV]], %[[WHILE_END]] ], [ 0, %[[LOR_LHS_FALSE]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL_0]]
+;
+entry:
+  %cmp = icmp eq ptr %src, null
+  br i1 %cmp, label %return, label %lor.lhs.false
+
+lor.lhs.false:                                    ; preds = %entry
+  %0 = load i32, ptr %src, align 4
+  %cmp1 = icmp eq i32 %0, 0
+  br i1 %cmp1, label %return, label %while.cond.preheader
+
+while.cond.preheader:                             ; preds = %lor.lhs.false
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond.preheader, %while.cond
+  %src.pn = phi ptr [ %curr.0, %while.cond ], [ %src, %while.cond.preheader ]
+  %curr.0 = getelementptr inbounds i8, ptr %src.pn, i64 4
+  %1 = load i32, ptr %curr.0, align 4
+  %tobool.not = icmp eq i32 %1, 0
+  br i1 %tobool.not, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  %curr.0.lcssa = phi ptr [ %curr.0, %while.cond ]
+  %sub.ptr.lhs.cast = ptrtoint ptr %curr.0.lcssa to i64
+  %sub.ptr.rhs.cast = ptrtoint ptr %src to i64
+  %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
+  %sub.ptr.div = ashr exact i64 %sub.ptr.sub, 2
+  br label %return
+
+return:                                           ; preds = %entry, %lor.lhs.false, %while.end
+  %retval.0 = phi i64 [ %sub.ptr.div, %while.end ], [ 0, %lor.lhs.false ], [ 0, %entry ]
+  ret i64 %retval.0
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"wchar_size", i32 4}
+

>From 117a0d08c8bab0216e4aab07de23bd610866a55a Mon Sep 17 00:00:00 2001
From: Henry Jiang <henry.jiang1 at ibm.com>
Date: Sun, 24 Nov 2024 01:58:13 -0500
Subject: [PATCH 06/13] refactor with SCEV Expander

---
 .../Transforms/Scalar/LoopIdiomRecognize.cpp  | 330 +++++++++---------
 1 file changed, 173 insertions(+), 157 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index c1af2aa98fc990..1b5c29f458bf3a 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -253,7 +253,7 @@ class LoopIdiomRecognize {
   bool insertFFSIfProfitable(Intrinsic::ID IntrinID, Value *InitX,
                              Instruction *DefX, PHINode *CntPhi,
                              Instruction *CntInst);
-  bool recognizeAndInsertFFS();  /// Find First Set: ctlz or cttz
+  bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
   bool recognizeShiftUntilLessThan();
   void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
                                 Instruction *CntInst, PHINode *CntPhi,
@@ -621,7 +621,8 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
     const SCEVAddRecExpr *FirstStoreEv =
         cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
     APInt FirstStride = getStoreStride(FirstStoreEv);
-    unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
+    unsigned FirstStoreSize =
+        DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
 
     // See if we can optimize just this store in isolation.
     if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
@@ -1112,13 +1113,14 @@ bool LoopIdiomRecognize::processLoopStridedStore(
         BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment),
         /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
   } else {
-    assert (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16));
+    assert(isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16));
     // Everything is emitted in default address space
     Type *Int8PtrTy = DestInt8PtrTy;
 
     StringRef FuncName = "memset_pattern16";
-    FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16,
-                            Builder.getVoidTy(), Int8PtrTy, Int8PtrTy, IntIdxTy);
+    FunctionCallee MSP =
+        getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16,
+                           Builder.getVoidTy(), Int8PtrTy, Int8PtrTy, IntIdxTy);
     inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI);
 
     // Otherwise we should form a memset_pattern16.  PatternValue is known to be
@@ -1160,8 +1162,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
     R << "Transformed loop-strided store in "
       << ore::NV("Function", TheStore->getFunction())
       << " function into a call to "
-      << ore::NV("NewFunction", NewCall->getCalledFunction())
-      << "() intrinsic";
+      << ore::NV("NewFunction", NewCall->getCalledFunction()) << "() intrinsic";
     if (!Stores.empty())
       R << ore::setExtraArgs();
     for (auto *I : Stores) {
@@ -1462,8 +1463,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
            << ore::NV("NewFunction", NewCall->getCalledFunction())
            << "() intrinsic from " << ore::NV("Inst", InstRemark)
            << " instruction in " << ore::NV("Function", TheStore->getFunction())
-           << " function"
-           << ore::setExtraArgs()
+           << " function" << ore::setExtraArgs()
            << ore::NV("FromBlock", TheStore->getParent()->getName())
            << ore::NV("ToBlock", Preheader->getName());
   });
@@ -1545,51 +1545,47 @@ static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
   return nullptr;
 }
 
-/// Recognizes a strlen idiom by checking for loops that increment
-/// a char pointer and then subtract with the base pointer.
-///
-/// If detected, transforms the relevant code to a strlen function
-/// call, and returns true; otherwise, returns false.
-///
-/// The core idiom we are trying to detect is:
-/// \code
-///     start = str;
-///     do {
-///       str++;
-///     } while(*str != '\0');
-/// \endcode
-///
-/// The transformed output is similar to below c-code:
-/// \code
-///     str = start + strlen(start)
-///     len = str - start
-/// \endcode
-///
-/// Later the pointer subtraction will be folded by InstCombine
-bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
-  if (DisableLIRPStrlen)
-    return false;
+struct StrlenIdiom {
+  unsigned IdiomSize;     // Bit width of the character type (8, 16 or 32).
+  ConstantInt *StepSize;  // Constant step of the load pointer's AddRec.
+  const SCEV *LoadBaseEv; // Start value of the load pointer's AddRec.
+  Type *LoadType;         // Integer type whose bit width is IdiomSize.
+};
+
+/// Tries to detect a strlen idiom that increments a char pointer in a loop
+/// consisting of a single basic block.
+static bool detectStrLenIdiom(const Loop *CurLoop, ScalarEvolution *SE,
+                              const TargetLibraryInfo *TLI,
+                              StrlenIdiom &Idiom) {
+
+  outs() << "current loop:\n";
+  CurLoop->print(outs());
+  outs() << "\n";
 
   // Give up if the loop has multiple blocks or multiple backedges.
   if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
     return false;
 
-  // It should have a preheader containing nothing but an unconditional branch.
-  auto *Preheader = CurLoop->getLoopPreheader();
-  if (!Preheader || &Preheader->front() != Preheader->getTerminator())
+  // It should have a preheader and a branch instruction.
+  BasicBlock *Preheader = CurLoop->getLoopPreheader();
+  if (!Preheader)
     return false;
 
-  auto *EntryBI = dyn_cast<BranchInst>(Preheader->getTerminator());
-  if (!EntryBI || EntryBI->isConditional())
+  BranchInst *EntryBI = dyn_cast<BranchInst>(Preheader->getTerminator());
+  if (!EntryBI)
     return false;
 
-  // The loop exit must be conditioned on an icmp with 0.
+  // The loop exit must be conditioned on an icmp with 0, the null terminator.
   // The icmp operand has to be a load on some SSA reg that increments
   // by 1 in the loop.
   BasicBlock *LoopBody = *CurLoop->block_begin();
+
+  // Skip if the body is too big as it most likely is not a strlen idiom.
+  if (!LoopBody || LoopBody->size() >= 10)
+    return false;
+
   BranchInst *LoopTerm = dyn_cast<BranchInst>(LoopBody->getTerminator());
   Value *LoopCond = matchCondition(LoopTerm, LoopBody);
-
   if (!LoopCond)
     return false;
 
@@ -1601,23 +1597,29 @@ bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
   if (!OperandType || !OperandType->isIntegerTy())
     return false;
 
-  // See if the pointer expression is an AddRec with step 1 ({n,+,1}) on
-  // the loop, indicating strlen calculation.
+  // See if the pointer expression is an AddRec with constant step a of form
+  // ({n,+,a}) where a is the width of the char type.
   auto *IncPtr = LoopLoad->getPointerOperand();
   const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IncPtr));
-
   if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
     return false;
 
+  outs() << "pointer load ev: ";
+  LoadEv->print(outs());
+  outs() << "\n";
+
   const SCEVConstant *Step =
       dyn_cast<SCEVConstant>(LoadEv->getStepRecurrence(*SE));
   if (!Step)
     return false;
 
-  unsigned int StepSize = 0;
-  if (ConstantInt *CI = dyn_cast<ConstantInt>(Step->getValue()))
-    StepSize = CI->getZExtValue();
+  unsigned StepSize = 0;
+  ConstantInt *StepSizeCI = dyn_cast<ConstantInt>(Step->getValue());
+  if (!StepSizeCI)
+    return false;
+  StepSize = StepSizeCI->getZExtValue();
 
+  // Verify that StepSize is consistent with platform char width.
   unsigned OpWidth = OperandType->getIntegerBitWidth();
   unsigned WcharSize = TLI->getWCharSize(*LoopLoad->getModule());
   if (OpWidth != StepSize * 8)
@@ -1625,7 +1627,7 @@ bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
   if (OpWidth != 8 && OpWidth != 16 && OpWidth != 32)
     return false;
   if (OpWidth >= 16)
-    if (OpWidth != WcharSize * 8 || DisableLIRPWcslen)
+    if (OpWidth != WcharSize * 8)
       return false;
 
   // Scan every instruction in the loop to ensure there are no side effects.
@@ -1637,137 +1639,152 @@ bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
   if (!LoopExitBB)
     return false;
 
-  // Check that the loop exit block is valid:
-  // It needs to have exactly one LCSSA Phi which is an AddRec.
-  PHINode *LCSSAPhi = nullptr;
   for (PHINode &PN : LoopExitBB->phis()) {
-    if (!LCSSAPhi && PN.getNumIncomingValues() == 1)
-      LCSSAPhi = &PN;
-    else
+    const SCEV *Ev = SE->getSCEV(&PN);
+    outs() << "loop exit block scev exprs: ";
+    PN.print(outs());
+    if (Ev)
+      Ev->print(outs());
+    outs() << "\n";
+
+    if (!Ev)
       return false;
-  }
 
-  if (!LCSSAPhi || !SE->isSCEVable(LCSSAPhi->getType()))
-    return false;
-
-  // This matched the pointer version of the idiom
-  if (LCSSAPhi->getIncomingValueForBlock(LoopBody) !=
-      LoopLoad->getPointerOperand())
-    return false;
+    // Since we verified that the loop trip count will be a valid strlen idiom,
+    // we can expand all lcssa phi with {n,+,1} as (n + strlen) and use
+    // SCEVExpander to materialize the loop output.
+    const SCEVAddRecExpr *AddRecEv = dyn_cast<SCEVAddRecExpr>(Ev);
+    if (!AddRecEv || !AddRecEv->isAffine())
+      return false;
 
-  const SCEVAddRecExpr *LCSSAEv =
-      dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LCSSAPhi->getIncomingValue(0)));
+    // We only want AddRecExpr with a recurrence step that is constant. This
+    // is good enough for all the idioms we want to recognize. Later we expand
+    // the recurrence as {base,+,a} -> (base + a * strlen) and materialize it.
+    if (!dyn_cast<SCEVConstant>(AddRecEv->getStepRecurrence(*SE)))
+      return false;
+  }
 
-  if (!LCSSAEv || !dyn_cast<SCEVUnknown>(SE->getPointerBase(LCSSAEv)) ||
-      !LCSSAEv->isAffine())
-    return false;
+  Idiom.LoadBaseEv = LoadEv->getStart();
+  Idiom.IdiomSize = OpWidth;
+  Idiom.StepSize = StepSizeCI;
+  Idiom.LoadType = OperandType;
+  return true;
+}
 
-  // We can now expand the base of the str
-  IRBuilder<> Builder(Preheader->getTerminator());
+/// Recognizes a strlen idiom by checking for loops that increment
+/// a char pointer and then subtract with the base pointer.
+///
+/// If detected, transforms the relevant code to a strlen function
+/// call, and returns true; otherwise, returns false.
+///
+/// The core idiom we are trying to detect is:
+/// \code
+///     start = str;
+///     do {
+///       str++;
+///     } while(*str != '\0');
+/// \endcode
+///
+/// The transformed output is similar to below c-code:
+/// \code
+///     str = start + strlen(start)
+///     len = str - start
+/// \endcode
+///
+/// Later the pointer subtraction will be folded by InstCombine
+bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
+  /*
+  const auto *First = CurLoop->block_begin();
+  if (First != CurLoop->block_end()) {
+    auto *F = (*First)->getParent();
+    outs() << "\n\n\n\n\n========== NEW LOOP ============\n";
+    F->print(outs());
+  }
+  */
 
-  auto LoopPhiRange = LoopBody->phis();
-  if (!hasNItems(LoopPhiRange, 1))
-    return false;
-  auto *LoopPhi = &*LoopPhiRange.begin();
-  Value *PreVal = LoopPhi->getIncomingValueForBlock(Preheader);
-  if (!PreVal)
+  // TODO: check for disable options
+  StrlenIdiom Idiom;
+  if (!detectStrLenIdiom(CurLoop, SE, TLI, Idiom))
     return false;
 
-  Value *Expanded = nullptr;
-  Type *ExpandedType = nullptr;
-  if (auto *GEP = dyn_cast<GetElementPtrInst>(LoopLoad->getPointerOperand())) {
-    if (GEP->getPointerOperand() != LoopPhi)
-      return false;
-    GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
-        LoopLoad->getType(), PreVal, SmallVector<Value *, 4>(GEP->indices()),
-        "newgep", Preheader->getTerminator());
-    Expanded = NewGEP;
-    ExpandedType = LoopLoad->getType();
-  } else if (LoopLoad->getPointerOperand() == LoopPhi) {
-    Expanded = PreVal;
-    ExpandedType = LoopLoad->getType();
-  }
-  if (!Expanded)
-    return false;
-
-  // Ensure that the GEP has the correct index if the pointer was modified.
-  // This can happen when the pointer in the user code, outside the loop,
-  // walks past a certain pre-checked index of the string.
-  if (auto *GEP = dyn_cast<GEPOperator>(Expanded)) {
-    if (GEP->getNumOperands() != 2)
-      return false;
-
-    ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
-    if (!I0)
-      return false;
-
-    int64_t Index = I0->getSExtValue(); // GEP index
-    auto *SAdd = dyn_cast<SCEVAddExpr>(LoadEv->getStart());
-    if (!SAdd || SAdd->getNumOperands() != 2)
-      return false;
+  // outs() << "idiom is good\n\n";
 
-    auto *SAdd0 = dyn_cast<SCEVConstant>(SAdd->getOperand(0));
-    if (!SAdd0)
-      return false;
-
-    ConstantInt *CInt = SAdd0->getValue(); // SCEV index
-    assert(CInt && "Expecting CInt to be valid.");
-    int64_t Offset = CInt->getSExtValue();
+  BasicBlock *Preheader = CurLoop->getLoopPreheader();
+  BasicBlock *LoopExitBB = CurLoop->getExitBlock();
 
-    // Update the index based on the Offset
-    assert((Offset * 8) % GEP->getSourceElementType()->getIntegerBitWidth() ==
-               0 &&
-           "Invalid offset");
-    int64_t NewIndex =
-        (Offset * 8) / GEP->getSourceElementType()->getIntegerBitWidth() -
-        Index;
-    Value *NewIndexVal =
-        ConstantInt::get(GEP->getOperand(1)->getType(), NewIndex);
-    GEP->setOperand(1, NewIndexVal);
-  }
+  IRBuilder<> Builder(Preheader->getTerminator());
+  SCEVExpander Expander(*SE, Preheader->getModule()->getDataLayout(), "scev");
+  Value *MaterialzedBase = Expander.expandCodeFor(
+      Idiom.LoadBaseEv, Idiom.LoadBaseEv->getType(), Builder.GetInsertPoint());
 
   Value *StrLenFunc = nullptr;
-  switch (OpWidth) {
+  switch (Idiom.IdiomSize) {
   case 8:
-    if (!TLI->has(LibFunc_strlen))
+    if (!isLibFuncEmittable(Preheader->getModule(), TLI, LibFunc_strlen))
       return false;
-    StrLenFunc = emitStrLen(Expanded, Builder, *DL, TLI);
+    StrLenFunc = emitStrLen(MaterialzedBase, Builder, *DL, TLI);
     break;
   case 16:
   case 32:
-    if (!TLI->has(LibFunc_wcslen))
+    if (!isLibFuncEmittable(Preheader->getModule(), TLI, LibFunc_wcslen))
       return false;
-    StrLenFunc = emitWcsLen(Expanded, Builder, *DL, TLI);
+    StrLenFunc = emitWcsLen(MaterialzedBase, Builder, *DL, TLI);
   }
-
   assert(StrLenFunc && "Failed to emit strlen function.");
 
-  // Replace LCSSA Phi use with new pointer to the null terminator
-  SmallVector<Value *, 4> NewBaseIndex{StrLenFunc};
-  GetElementPtrInst *NewEndPtr = GetElementPtrInst::Create(
-      ExpandedType, Expanded, NewBaseIndex, "end", Preheader->getTerminator());
-  LCSSAPhi->replaceAllUsesWith(NewEndPtr);
-  RecursivelyDeleteDeadPHINode(LCSSAPhi);
-
-  ConstantInt *NewLoopCond = LoopTerm->getSuccessor(0) == LoopBody
-                                 ? Builder.getFalse()
-                                 : Builder.getTrue();
-  LoopTerm->setCondition(NewLoopCond);
-
-  deleteDeadInstruction(cast<Instruction>(LoopCond));
-  deleteDeadInstruction(cast<Instruction>(IncPtr));
+  const SCEV *StrlenEv = SE->getSCEV(StrLenFunc);
+  SmallVector<PHINode *, 4> Cleanup;
+  for (PHINode &PN : LoopExitBB->phis()) {
+    const SCEV *Ev = SE->getSCEV(&PN);
+    const SCEVAddRecExpr *AddRecEv = dyn_cast<SCEVAddRecExpr>(Ev);
+    const SCEVConstant *Step =
+        dyn_cast<SCEVConstant>(AddRecEv->getStepRecurrence(*SE));
+    const SCEV *Base = AddRecEv->getStart();
+
+    /*
+    outs() << "creating new mult scev: ";
+    Base->getType()->print(outs());
+    outs() << " ";
+    Step->getType()->print(outs());
+    outs() << " ";
+    StrlenEv->getType()->print(outs());
+    outs() << "\n";
+    */
+
+    // It is safe to truncate to base since if base is narrower than size_t
+    // the equivalent user code will have to truncate anyways.
+    const SCEV *NewEv = SE->getAddExpr(
+        Base, SE->getMulExpr(Step, SE->getTruncateOrSignExtend(
+                                       StrlenEv, Base->getType())));
+
+    /*
+    outs() << "new ev exprs: ";
+    PN.print(outs());
+    if (NewEv)
+      NewEv->print(outs());
+    outs() << "\n";
+    */
+
+    Expander.clear();
+    Value *MaterializedPHI = Expander.expandCodeFor(NewEv, NewEv->getType(),
+                                                    Builder.GetInsertPoint());
+    PN.replaceAllUsesWith(MaterializedPHI);
+    Cleanup.push_back(&PN);
+  }
+
+  for (PHINode *PN : Cleanup) {
+    RecursivelyDeleteDeadPHINode(PN);
+  }
   SE->forgetLoop(CurLoop);
 
-  LLVM_DEBUG(dbgs() << "  Formed strlen: " << *StrLenFunc << "\n");
-
+  ++NumStrLen;
+  LLVM_DEBUG(dbgs() << "  Formed strlen idiom: " << *StrLenFunc << "\n");
   ORE.emit([&]() {
     return OptimizationRemark(DEBUG_TYPE, "recognizeAndInsertStrLen",
                               CurLoop->getStartLoc(), Preheader)
-           << "Transformed pointer difference into a call to strlen() function";
+           << "Transformed strlen loop idiom";
   });
 
-  ++NumStrLen;
-
   return true;
 }
 
@@ -1978,8 +1995,7 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
     ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
     if (!Dec ||
         !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
-          (SubOneOp->getOpcode() == Instruction::Add &&
-           Dec->isMinusOne()))) {
+          (SubOneOp->getOpcode() == Instruction::Add && Dec->isMinusOne()))) {
       return false;
     }
   }
@@ -2090,8 +2106,8 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
   // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
   if (!DefX || !DefX->isShift())
     return false;
-  IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
-                                                     Intrinsic::ctlz;
+  IntrinID =
+      DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz : Intrinsic::ctlz;
   ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
   if (!Shft || !Shft->isOne())
     return false;
@@ -2594,9 +2610,8 @@ void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
     TcPhi->insertBefore(Body->begin());
 
     Builder.SetInsertPoint(LbCond);
-    Instruction *TcDec = cast<Instruction>(
-        Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
-                          "tcdec", false, true));
+    Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
+        TcPhi, ConstantInt::get(Ty, 1), "tcdec", false, true));
 
     TcPhi->addIncoming(TripCnt, PreHead);
     TcPhi->addIncoming(TcDec, Body);
@@ -3226,7 +3241,8 @@ bool LoopIdiomRecognize::recognizeShiftUntilZero() {
   // intrinsic we'll use are not cheap. Note that we are okay with *just*
   // making the loop countable, even if nothing else changes.
   IntrinsicCostAttributes Attrs(
-      IntrID, Ty, {PoisonValue::get(Ty), /*is_zero_poison=*/Builder.getFalse()});
+      IntrID, Ty,
+      {PoisonValue::get(Ty), /*is_zero_poison=*/Builder.getFalse()});
   InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
   if (Cost > TargetTransformInfo::TCC_Basic) {
     LLVM_DEBUG(dbgs() << DEBUG_TYPE

>From f9c5735d45657f9e484aab8ac2ea55e941541f91 Mon Sep 17 00:00:00 2001
From: Henry Jiang <henry.jiang1 at ibm.com>
Date: Sun, 24 Nov 2024 15:55:14 -0500
Subject: [PATCH 07/13] Add more tests

---
 .../Transforms/Scalar/LoopIdiomRecognize.cpp  |  23 +-
 llvm/test/Transforms/LoopIdiom/strlen.ll      | 611 +++++++++++-------
 2 files changed, 388 insertions(+), 246 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 1b5c29f458bf3a..8a898bc7bd5a15 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -20,8 +20,7 @@
 //
 // TODO List:
 //
-// Future loop memory idioms to recognize:
-//   memcmp, strlen, etc.
+// Future loop memory idioms to recognize: memcmp, etc.
 //
 // This could recognize common matrix multiplies and dot product idioms and
 // replace them with calls to BLAS (if linked in??).
@@ -1557,13 +1556,16 @@ struct StrlenIdiom {
 static bool detectStrLenIdiom(const Loop *CurLoop, ScalarEvolution *SE,
                               const TargetLibraryInfo *TLI,
                               StrlenIdiom &Idiom) {
-
+  /*
   outs() << "current loop:\n";
   CurLoop->print(outs());
   outs() << "\n";
+  */
 
-  // Give up if the loop has multiple blocks or multiple backedges.
-  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+  // Give up if the loop has multiple blocks, multiple backedges, or
+  // multiple exit blocks
+  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1 ||
+      !CurLoop->getUniqueExitBlock())
     return false;
 
   // It should have a preheader and a branch instruction.
@@ -1581,7 +1583,7 @@ static bool detectStrLenIdiom(const Loop *CurLoop, ScalarEvolution *SE,
   BasicBlock *LoopBody = *CurLoop->block_begin();
 
   // Skip if the body is too big as it most likely is not a strlen idiom.
-  if (!LoopBody || LoopBody->size() >= 10)
+  if (!LoopBody || LoopBody->size() >= 15)
     return false;
 
   BranchInst *LoopTerm = dyn_cast<BranchInst>(LoopBody->getTerminator());
@@ -1604,9 +1606,11 @@ static bool detectStrLenIdiom(const Loop *CurLoop, ScalarEvolution *SE,
   if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
     return false;
 
+  /*
   outs() << "pointer load ev: ";
   LoadEv->print(outs());
   outs() << "\n";
+  */
 
   const SCEVConstant *Step =
       dyn_cast<SCEVConstant>(LoadEv->getStepRecurrence(*SE));
@@ -1640,12 +1644,17 @@ static bool detectStrLenIdiom(const Loop *CurLoop, ScalarEvolution *SE,
     return false;
 
   for (PHINode &PN : LoopExitBB->phis()) {
+    if (!SE->isSCEVable(PN.getType()))
+      return false;
+
     const SCEV *Ev = SE->getSCEV(&PN);
+    /*
     outs() << "loop exit block scev exprs: ";
     PN.print(outs());
     if (Ev)
       Ev->print(outs());
     outs() << "\n";
+    */
 
     if (!Ev)
       return false;
@@ -1772,6 +1781,8 @@ bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
     Cleanup.push_back(&PN);
   }
 
+  // All LCSSA loop PHIs are now dead; the leftover loop body can be cleaned up
+  // by later passes.
   for (PHINode *PN : Cleanup) {
     RecursivelyDeleteDeadPHINode(PN);
   }
diff --git a/llvm/test/Transforms/LoopIdiom/strlen.ll b/llvm/test/Transforms/LoopIdiom/strlen.ll
index 43ed9d0980bc49..0dc833ec0e35f3 100644
--- a/llvm/test/Transforms/LoopIdiom/strlen.ll
+++ b/llvm/test/Transforms/LoopIdiom/strlen.ll
@@ -1,293 +1,424 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes='loop-idiom' < %s -S | FileCheck %s
+; RUN: opt -passes='loop(loop-idiom),verify' < %s -S | FileCheck %s
 
+declare void @other()
 declare void @use(ptr)
+declare void @usei(i32)
+declare void @usel(i64)
 
-define i64 @valid_strlen_1(ptr %0) {
-; CHECK-LABEL: define i64 @valid_strlen_1(
-; CHECK-SAME: ptr [[TMP0:%.*]]) {
-; CHECK-NEXT:    [[STRLEN:%.*]] = call i64 @strlen(ptr [[TMP0]])
-; CHECK-NEXT:    [[DOTLCSSA:%.*]] = getelementptr i8, ptr [[TMP0]], i64 [[STRLEN]]
-; CHECK-NEXT:    br label %[[BB2:.*]]
-; CHECK:       [[BB2]]:
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i8 poison, 0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr poison, i64 1
-; CHECK-NEXT:    br i1 true, label %[[BB5:.*]], label %[[BB2]]
-; CHECK:       [[BB5]]:
-; CHECK-NEXT:    [[TMP12:%.*]] = ptrtoint ptr [[DOTLCSSA]] to i64
-; CHECK-NEXT:    [[TMP13:%.*]] = ptrtoint ptr [[TMP0]] to i64
-; CHECK-NEXT:    [[TMP14:%.*]] = sub i64 [[TMP12]], [[TMP13]]
-; CHECK-NEXT:    ret i64 [[TMP14]]
-;
-  br label %2
-
-2:                                                ; preds = %2, %1
-  %3 = phi ptr [ %0, %1 ], [ %6, %2 ]
-  %4 = load i8, ptr %3, align 1
-  %5 = icmp eq i8 %4, 0
-  %6 = getelementptr inbounds i8, ptr %3, i64 1
-  br i1 %5, label %7, label %2
-
-7:                                                ; preds = %2
-  %8 = ptrtoint ptr %3 to i64
-  %9 = ptrtoint ptr %0 to i64
-  %10 = sub i64 %8, %9
-  ret i64 %10
-}
-
-
-define i32 @valid_strlen_2(ptr %0) {
-; CHECK-LABEL: define i32 @valid_strlen_2(
-; CHECK-SAME: ptr [[TMP0:%.*]]) {
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; CHECK-NEXT:    br i1 [[TMP2]], label %[[BB14:.*]], label %[[BB3:.*]]
-; CHECK:       [[BB3]]:
-; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[TMP0]], align 1
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0
-; CHECK-NEXT:    br i1 [[TMP5]], label %[[BB14]], label %[[DOTPREHEADER:.*]]
-; CHECK:       [[_PREHEADER:.*:]]
-; CHECK-NEXT:    [[STR:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
-; CHECK-NEXT:    [[STRLEN:%.*]] = call i64 @strlen(ptr [[STR]])
-; CHECK-NEXT:    [[STR_ADDR_0_LCSSA:%.*]] = getelementptr i8, ptr [[STR]], i64 [[STRLEN]]
-; CHECK-NEXT:    br label %[[BB6:.*]]
-; CHECK:       [[BB6]]:
-; CHECK-NEXT:    [[TMP7:%.*]] = phi ptr [ poison, %[[BB6]] ], [ [[TMP0]], %[[DOTPREHEADER]] ]
-; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i8 poison, 0
-; CHECK-NEXT:    br i1 true, label %[[BB9:.*]], label %[[BB6]]
-; CHECK:       [[BB9]]:
-; CHECK-NEXT:    [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[STR_ADDR_0_LCSSA]] to i64
-; CHECK-NEXT:    [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[TMP0]] to i64
-; CHECK-NEXT:    [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
-; CHECK-NEXT:    [[TMP13:%.*]] = trunc i64 [[SUB_PTR_SUB]] to i32
-; CHECK-NEXT:    br label %[[BB14]]
-; CHECK:       [[BB14]]:
-; CHECK-NEXT:    [[TMP15:%.*]] = phi i32 [ [[TMP13]], %[[BB9]] ], [ 0, %[[BB3]] ], [ 0, [[TMP1:%.*]] ]
-; CHECK-NEXT:    ret i32 [[TMP15]]
-;
-  %2 = icmp eq ptr %0, null
-  br i1 %2, label %16, label %3
-
-3:                                                ; preds = %1
-  %4 = load i8, ptr %0, align 1
-  %5 = icmp eq i8 %4, 0
-  br i1 %5, label %16, label %6
-
-6:                                                ; preds = %3, %6
-  %7 = phi ptr [ %8, %6 ], [ %0, %3 ]
-  %8 = getelementptr inbounds i8, ptr %7, i64 1
-  %9 = load i8, ptr %8, align 1
-  %10 = icmp eq i8 %9, 0
-  br i1 %10, label %11, label %6
-
-11:                                               ; preds = %6
-  %12 = ptrtoint ptr %8 to i64
-  %13 = ptrtoint ptr %0 to i64
-  %14 = sub i64 %12, %13
-  %15 = trunc i64 %14 to i32
-  br label %16
-
-16:                                               ; preds = %1, %3, %11
-  %17 = phi i32 [ %15, %11 ], [ 0, %3 ], [ 0, %1 ]
-  ret i32 %17
-}
-
-define i64 @valid_strlen_3(ptr %str) local_unnamed_addr #0 {
-; CHECK-LABEL: define i64 @valid_strlen_3(
-; CHECK-SAME: ptr [[STR:%.*]]) local_unnamed_addr {
-; CHECK-NEXT:  [[_PREHEADER:.*:]]
+; size_t valid_basic_strlen(const char* str) {
+;     const char* base = str;
+;     while (*str != '\0')
+;         ++str;
+;     return str - base;
+; }
+define i64 @valid_basic_strlen(ptr %str) {
+; CHECK-LABEL: define i64 @valid_basic_strlen(
+; CHECK-SAME: ptr [[STR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[STRLEN:%.*]] = call i64 @strlen(ptr [[STR]])
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[STR]], i64 [[STRLEN]]
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[STR]], i64 [[STRLEN]]
 ; CHECK-NEXT:    br label %[[WHILE_COND:.*]]
 ; CHECK:       [[WHILE_COND]]:
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i8 poison, 0
-; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i8, ptr poison, i64 1
-; CHECK-NEXT:    br i1 true, label %[[WHILE_END:.*]], label %[[WHILE_COND]]
+; CHECK-NEXT:    [[STR_ADDR_0:%.*]] = phi ptr [ [[STR]], %[[ENTRY]] ], [ [[INCDEC_PTR:%.*]], %[[WHILE_COND]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[STR_ADDR_0]], align 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr i8, ptr [[STR_ADDR_0]], i64 1
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label %[[WHILE_END:.*]], label %[[WHILE_COND]]
 ; CHECK:       [[WHILE_END]]:
-; CHECK-NEXT:    [[TMP10:%.*]] = ptrtoint ptr [[TMP0]] to i64
+; CHECK-NEXT:    [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[SCEVGEP]] to i64
 ; CHECK-NEXT:    [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[STR]] to i64
-; CHECK-NEXT:    [[TMP13:%.*]] = sub i64 [[TMP10]], [[SUB_PTR_RHS_CAST]]
-; CHECK-NEXT:    tail call void @use(ptr [[TMP0]])
-; CHECK-NEXT:    tail call void @use(ptr [[STR]])
-; CHECK-NEXT:    ret i64 [[TMP13]]
+; CHECK-NEXT:    [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+; CHECK-NEXT:    ret i64 [[SUB_PTR_SUB]]
 ;
 entry:
   br label %while.cond
 
-while.cond:                                       ; preds = %while.cond, %entry
+while.cond:
   %str.addr.0 = phi ptr [ %str, %entry ], [ %incdec.ptr, %while.cond ]
   %0 = load i8, ptr %str.addr.0, align 1
   %cmp.not = icmp eq i8 %0, 0
-  %incdec.ptr = getelementptr inbounds i8, ptr %str.addr.0, i64 1
+  %incdec.ptr = getelementptr i8, ptr %str.addr.0, i64 1
   br i1 %cmp.not, label %while.end, label %while.cond
 
-while.end:                                        ; preds = %while.cond
+while.end:
   %sub.ptr.lhs.cast = ptrtoint ptr %str.addr.0 to i64
   %sub.ptr.rhs.cast = ptrtoint ptr %str to i64
   %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
-  tail call void @use(ptr %str.addr.0)
-  tail call void @use(ptr %str)
   ret i64 %sub.ptr.sub
 }
 
-define i64 @valid_strlen_4(ptr %0) {
-; CHECK-LABEL: define i64 @valid_strlen_4(
-; CHECK-SAME: ptr [[TMP0:%.*]]) {
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; CHECK-NEXT:    br i1 [[TMP2]], label %[[BB10:.*]], label %[[DOTPREHEADER:.*]]
-; CHECK:       [[_PREHEADER:.*:]]
-; CHECK-NEXT:    [[NEWGEP:%.*]] = getelementptr i8, ptr [[TMP0]], i64 0
-; CHECK-NEXT:    [[STRLEN:%.*]] = call i64 @strlen(ptr [[NEWGEP]])
-; CHECK-NEXT:    [[END:%.*]] = getelementptr i8, ptr [[NEWGEP]], i64 [[STRLEN]]
-; CHECK-NEXT:    br label %[[BB3:.*]]
-; CHECK:       [[BB3]]:
-; CHECK-NEXT:    [[TMP4:%.*]] = phi ptr [ poison, %[[BB3]] ], [ [[TMP0]], %[[DOTPREHEADER]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i8 poison, 0
-; CHECK-NEXT:    br i1 true, label %[[BB6:.*]], label %[[BB3]]
-; CHECK:       [[BB6]]:
-; CHECK-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[END]] to i64
-; CHECK-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[TMP0]] to i64
-; CHECK-NEXT:    [[TMP9:%.*]] = sub i64 [[TMP7]], [[TMP8]]
-; CHECK-NEXT:    br label %[[BB10]]
-; CHECK:       [[BB10]]:
-; CHECK-NEXT:    [[TMP11:%.*]] = phi i64 [ [[TMP9]], %[[BB6]] ], [ 0, [[TMP1:%.*]] ]
-; CHECK-NEXT:    ret i64 [[TMP11]]
+; int valid_basic_strlen_rotated(const char* str) {
+;     const char* base = str;
+;     if (!*str) return 0;
+;     do {
+;         ++str;
+;     } while (*str);
+;     return str - base;
+; }
+define i32 @valid_basic_strlen_rotated(ptr %str) {
+; CHECK-LABEL: define i32 @valid_basic_strlen_rotated(
+; CHECK-SAME: ptr [[STR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[STR]], align 1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label %[[CLEANUP:.*]], label %[[DO_BODY_PREHEADER:.*]]
+; CHECK:       [[DO_BODY_PREHEADER]]:
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[STR]], i64 1
+; CHECK-NEXT:    [[STRLEN:%.*]] = call i64 @strlen(ptr [[SCEVGEP]])
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[STRLEN]], 1
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[STR]], i64 [[TMP1]]
+; CHECK-NEXT:    br label %[[DO_BODY:.*]]
+; CHECK:       [[DO_BODY]]:
+; CHECK-NEXT:    [[STR_ADDR_0:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], %[[DO_BODY]] ], [ [[STR]], %[[DO_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds nuw i8, ptr [[STR_ADDR_0]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[INCDEC_PTR]], align 1
+; CHECK-NEXT:    [[TOBOOL1_NOT:%.*]] = icmp eq i8 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL1_NOT]], label %[[DO_END:.*]], label %[[DO_BODY]]
+; CHECK:       [[DO_END]]:
+; CHECK-NEXT:    [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[SCEVGEP1]] to i64
+; CHECK-NEXT:    [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[STR]] to i64
+; CHECK-NEXT:    [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+; CHECK-NEXT:    [[CONV:%.*]] = trunc i64 [[SUB_PTR_SUB]] to i32
+; CHECK-NEXT:    br label %[[CLEANUP]]
+; CHECK:       [[CLEANUP]]:
+; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi i32 [ [[CONV]], %[[DO_END]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    ret i32 [[RETVAL_0]]
 ;
-  %2 = icmp eq ptr %0, null
-  br i1 %2, label %12, label %3
+entry:
+  %0 = load i8, ptr %str, align 1
+  %tobool.not = icmp eq i8 %0, 0
+  br i1 %tobool.not, label %cleanup, label %do.body
+
+do.body:
+  %str.addr.0 = phi ptr [ %incdec.ptr, %do.body ], [ %str, %entry ]
+  %incdec.ptr = getelementptr inbounds nuw i8, ptr %str.addr.0, i64 1
+  %1 = load i8, ptr %incdec.ptr, align 1
+  %tobool1.not = icmp eq i8 %1, 0
+  br i1 %tobool1.not, label %do.end, label %do.body
 
-3:                                                ; preds = %1, %3
-  %4 = phi ptr [ %5, %3 ], [ %0, %1 ]
-  %5 = getelementptr inbounds i8, ptr %4, i64 1
-  %6 = load i8, ptr %5, align 1
-  %7 = icmp eq i8 %6, 0
-  br i1 %7, label %8, label %3
+do.end:
+  %sub.ptr.lhs.cast = ptrtoint ptr %incdec.ptr to i64
+  %sub.ptr.rhs.cast = ptrtoint ptr %str to i64
+  %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
+  %conv = trunc i64 %sub.ptr.sub to i32
+  br label %cleanup
+
+cleanup:
+  %retval.0 = phi i32 [ %conv, %do.end ], [ 0, %entry ]
+  ret i32 %retval.0
+}
+
+; void valid_strlen_with_aux_indvar(const char* str, const char* foo) {
+;   int count = 0;
+;   int count_offset = -10;
+;   int count_multiple = 0;
+;
+;   while (*str) {
+;     ++str;
+;     ++count;
+;     ++count_offset;
+;     count_multiple += 2;
+;     ++foo;
+;   }
+;
+;   usei(count);
+;   usei(count_offset);
+;   usei(count_multiple);
+;   use(str);
+;   use(foo);
+; }
+define dso_local void @valid_strlen_with_aux_indvar(ptr noundef %str, ptr noundef %foo) local_unnamed_addr {
+; CHECK-LABEL: define dso_local void @valid_strlen_with_aux_indvar(
+; CHECK-SAME: ptr noundef [[STR:%.*]], ptr noundef [[FOO:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[STR]], align 1
+; CHECK-NEXT:    [[TOBOOL_NOT9:%.*]] = icmp eq i8 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT9]], label %[[WHILE_END:.*]], label %[[WHILE_BODY_PREHEADER:.*]]
+; CHECK:       [[WHILE_BODY_PREHEADER]]:
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[STR]], i64 1
+; CHECK-NEXT:    [[STRLEN:%.*]] = call i64 @strlen(ptr [[SCEVGEP]])
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[STRLEN]], 1
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[STR]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[STRLEN]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[STRLEN]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP4]], -9
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc i64 [[STRLEN]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = shl i32 [[TMP6]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[TMP7]], 2
+; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[STRLEN]], 1
+; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[FOO]], i64 [[TMP9]]
+; CHECK-NEXT:    br label %[[WHILE_BODY:.*]]
+; CHECK:       [[WHILE_BODY]]:
+; CHECK-NEXT:    [[COUNT_MULTIPLE_014:%.*]] = phi i32 [ [[ADD:%.*]], %[[WHILE_BODY]] ], [ 0, %[[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[COUNT_OFFSET_013:%.*]] = phi i32 [ [[INC1:%.*]], %[[WHILE_BODY]] ], [ -10, %[[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[COUNT_012:%.*]] = phi i32 [ [[INC:%.*]], %[[WHILE_BODY]] ], [ 0, %[[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[FOO_ADDR_011:%.*]] = phi ptr [ [[INCDEC_PTR2:%.*]], %[[WHILE_BODY]] ], [ [[FOO]], %[[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[STR_ADDR_010:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], %[[WHILE_BODY]] ], [ [[STR]], %[[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds nuw i8, ptr [[STR_ADDR_010]], i64 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[COUNT_012]], 1
+; CHECK-NEXT:    [[INC1]] = add nsw i32 [[COUNT_OFFSET_013]], 1
+; CHECK-NEXT:    [[ADD]] = add nuw nsw i32 [[COUNT_MULTIPLE_014]], 2
+; CHECK-NEXT:    [[INCDEC_PTR2]] = getelementptr inbounds nuw i8, ptr [[FOO_ADDR_011]], i64 1
+; CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[INCDEC_PTR]], align 1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP10]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label %[[WHILE_END_LOOPEXIT:.*]], label %[[WHILE_BODY]]
+; CHECK:       [[WHILE_END_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[WHILE_END]]
+; CHECK:       [[WHILE_END]]:
+; CHECK-NEXT:    [[STR_ADDR_0_LCSSA:%.*]] = phi ptr [ [[STR]], %[[ENTRY]] ], [ [[SCEVGEP1]], %[[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT:    [[FOO_ADDR_0_LCSSA:%.*]] = phi ptr [ [[FOO]], %[[ENTRY]] ], [ [[SCEVGEP2]], %[[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT:    [[COUNT_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP3]], %[[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT:    [[COUNT_OFFSET_0_LCSSA:%.*]] = phi i32 [ -10, %[[ENTRY]] ], [ [[TMP5]], %[[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT:    [[COUNT_MULTIPLE_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP8]], %[[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT:    tail call void @usei(i32 noundef [[COUNT_0_LCSSA]])
+; CHECK-NEXT:    tail call void @usei(i32 noundef [[COUNT_OFFSET_0_LCSSA]])
+; CHECK-NEXT:    tail call void @usei(i32 noundef [[COUNT_MULTIPLE_0_LCSSA]])
+; CHECK-NEXT:    tail call void @use(ptr noundef nonnull [[STR_ADDR_0_LCSSA]])
+; CHECK-NEXT:    tail call void @use(ptr noundef [[FOO_ADDR_0_LCSSA]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load i8, ptr %str, align 1
+  %tobool.not9 = icmp eq i8 %0, 0
+  br i1 %tobool.not9, label %while.end, label %while.body
 
-8:                                                ; preds = %3
-  %9 = ptrtoint ptr %5 to i64
-  %10 = ptrtoint ptr %0 to i64
-  %11 = sub i64 %9, %10
-  br label %12
+while.body:
+  %count_multiple.014 = phi i32 [ %add, %while.body ], [ 0, %entry ]
+  %count_offset.013 = phi i32 [ %inc1, %while.body ], [ -10, %entry ]
+  %count.012 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
+  %foo.addr.011 = phi ptr [ %incdec.ptr2, %while.body ], [ %foo, %entry ]
+  %str.addr.010 = phi ptr [ %incdec.ptr, %while.body ], [ %str, %entry ]
+  %incdec.ptr = getelementptr inbounds nuw i8, ptr %str.addr.010, i64 1
+  %inc = add nuw nsw i32 %count.012, 1
+  %inc1 = add nsw i32 %count_offset.013, 1
+  %add = add nuw nsw i32 %count_multiple.014, 2
+  %incdec.ptr2 = getelementptr inbounds nuw i8, ptr %foo.addr.011, i64 1
+  %1 = load i8, ptr %incdec.ptr, align 1
+  %tobool.not = icmp eq i8 %1, 0
+  br i1 %tobool.not, label %while.end, label %while.body
 
-12:                                               ; preds = %1, %8
-  %13 = phi i64 [ %11, %8 ], [ 0, %1 ]
-  ret i64 %13
+while.end:
+  %str.addr.0.lcssa = phi ptr [ %str, %entry ], [ %incdec.ptr, %while.body ]
+  %foo.addr.0.lcssa = phi ptr [ %foo, %entry ], [ %incdec.ptr2, %while.body ]
+  %count.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ]
+  %count_offset.0.lcssa = phi i32 [ -10, %entry ], [ %inc1, %while.body ]
+  %count_multiple.0.lcssa = phi i32 [ 0, %entry ], [ %add, %while.body ]
+  tail call void @usei(i32 noundef %count.0.lcssa) #3
+  tail call void @usei(i32 noundef %count_offset.0.lcssa) #3
+  tail call void @usei(i32 noundef %count_multiple.0.lcssa) #3
+  tail call void @use(ptr noundef nonnull %str.addr.0.lcssa) #3
+  tail call void @use(ptr noundef %foo.addr.0.lcssa) #3
+  ret void
 }
 
-define i64 @valid_strlen_use(ptr %str) {
-; CHECK-LABEL: define i64 @valid_strlen_use(
+; int valid_strlen_index(const char* str) {
+;     int i = 0;
+;     while (str[i]) {
+;         ++i;
+;     }
+;     return i;
+; }
+define i32 @valid_strlen_index(ptr %str) {
+; CHECK-LABEL: define i32 @valid_strlen_index(
 ; CHECK-SAME: ptr [[STR:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[STRLEN:%.*]] = call i64 @strlen(ptr [[STR]])
-; CHECK-NEXT:    [[END:%.*]] = getelementptr i8, ptr [[STR]], i64 [[STRLEN]]
 ; CHECK-NEXT:    br label %[[WHILE_COND:.*]]
 ; CHECK:       [[WHILE_COND]]:
-; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i8 poison, 0
-; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i8, ptr poison, i64 1
-; CHECK-NEXT:    br i1 true, label %[[WHILE_END:.*]], label %[[WHILE_COND]]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_COND]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[STR]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label %[[WHILE_END:.*]], label %[[WHILE_COND]]
 ; CHECK:       [[WHILE_END]]:
-; CHECK-NEXT:    [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[END]] to i64
-; CHECK-NEXT:    [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[STR]] to i64
-; CHECK-NEXT:    [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
-; CHECK-NEXT:    tail call void @use(ptr noundef nonnull [[END]])
-; CHECK-NEXT:    tail call void @use(ptr noundef [[STR]])
-; CHECK-NEXT:    ret i64 [[SUB_PTR_SUB]]
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc nuw nsw i64 [[STRLEN]] to i32
+; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
 entry:
   br label %while.cond
 
-while.cond:                                       ; preds = %while.cond, %entry
-  %str.addr.0 = phi ptr [ %str, %entry ], [ %incdec.ptr, %while.cond ]
-  %0 = load i8, ptr %str.addr.0, align 1
-  %cmp.not = icmp eq i8 %0, 0
-  %incdec.ptr = getelementptr inbounds i8, ptr %str.addr.0, i64 1
-  br i1 %cmp.not, label %while.end, label %while.cond
+while.cond:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %while.cond ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i8, ptr %str, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %tobool.not = icmp eq i8 %0, 0
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br i1 %tobool.not, label %while.end, label %while.cond
 
-while.end:                                        ; preds = %while.cond
-  %sub.ptr.lhs.cast = ptrtoint ptr %str.addr.0 to i64
-  %sub.ptr.rhs.cast = ptrtoint ptr %str to i64
-  %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
-  tail call void @use(ptr noundef nonnull %str.addr.0)
-  tail call void @use(ptr noundef %str)
-  ret i64 %sub.ptr.sub
+while.end:
+  %1 = trunc nuw nsw i64 %indvars.iv to i32
+  ret i32 %1
 }
 
-define i64 @invalid_strlen_has_side_effect(ptr %0) {
-; CHECK-LABEL: define i64 @invalid_strlen_has_side_effect(
-; CHECK-SAME: ptr [[TMP0:%.*]]) {
-; CHECK-NEXT:    br label %[[BB2:.*]]
-; CHECK:       [[BB2]]:
-; CHECK-NEXT:    [[TMP3:%.*]] = phi ptr [ [[TMP0]], [[TMP1:%.*]] ], [ [[TMP6:%.*]], %[[BB2]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = load volatile i8, ptr [[TMP3]], align 1
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0
-; CHECK-NEXT:    [[TMP6]] = getelementptr inbounds i8, ptr [[TMP3]], i64 1
-; CHECK-NEXT:    br i1 [[TMP5]], label %[[BB7:.*]], label %[[BB2]]
-; CHECK:       [[BB7]]:
-; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi ptr [ [[TMP3]], %[[BB2]] ]
-; CHECK-NEXT:    [[TMP8:%.*]] = ptrtoint ptr [[DOTLCSSA]] to i64
-; CHECK-NEXT:    [[TMP9:%.*]] = ptrtoint ptr [[TMP0]] to i64
-; CHECK-NEXT:    [[TMP10:%.*]] = sub i64 [[TMP8]], [[TMP9]]
-; CHECK-NEXT:    ret i64 [[TMP10]]
+; void valid_strlen_offset(const char* str) {
+;     if (*(str++) == '\0') return;
+;     if (*(str++) == '\0') return;
+;     if (*(str++) == '\0') return;
+;     while (*str) {
+;         ++str;
+;     }
+;     use(str);
+; }
+define dso_local void @valid_strlen_offset(ptr noundef %str) local_unnamed_addr {
+; CHECK-LABEL: define dso_local void @valid_strlen_offset(
+; CHECK-SAME: ptr noundef [[STR:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[STR]], align 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label %[[RETURN:.*]], label %[[IF_END:.*]]
+; CHECK:       [[IF_END]]:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[STR]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[INCDEC_PTR]], align 1
+; CHECK-NEXT:    [[CMP4:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[CMP4]], label %[[RETURN]], label %[[IF_END7:.*]]
+; CHECK:       [[IF_END7]]:
+; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds nuw i8, ptr [[STR]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[INCDEC_PTR2]], align 1
+; CHECK-NEXT:    [[CMP10:%.*]] = icmp eq i8 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[CMP10]], label %[[RETURN]], label %[[WHILE_COND_PREHEADER:.*]]
+; CHECK:       [[WHILE_COND_PREHEADER]]:
+; CHECK-NEXT:    [[INCDEC_PTR8:%.*]] = getelementptr i8, ptr [[STR]], i64 3
+; CHECK-NEXT:    [[STRLEN:%.*]] = call i64 @strlen(ptr [[INCDEC_PTR8]])
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[STRLEN]], 3
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[STR]], i64 [[TMP3]]
+; CHECK-NEXT:    br label %[[WHILE_COND:.*]]
+; CHECK:       [[WHILE_COND]]:
+; CHECK-NEXT:    [[STR_ADDR_0:%.*]] = phi ptr [ [[INCDEC_PTR14:%.*]], %[[WHILE_COND]] ], [ [[INCDEC_PTR8]], %[[WHILE_COND_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[STR_ADDR_0]], align 1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP4]], 0
+; CHECK-NEXT:    [[INCDEC_PTR14]] = getelementptr inbounds nuw i8, ptr [[STR_ADDR_0]], i64 1
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label %[[WHILE_END:.*]], label %[[WHILE_COND]]
+; CHECK:       [[WHILE_END]]:
+; CHECK-NEXT:    tail call void @use(ptr noundef nonnull [[SCEVGEP]])
+; CHECK-NEXT:    br label %[[RETURN]]
+; CHECK:       [[RETURN]]:
+; CHECK-NEXT:    ret void
 ;
-  br label %2
+entry:
+  %0 = load i8, ptr %str, align 1
+  %cmp = icmp eq i8 %0, 0
+  br i1 %cmp, label %return, label %if.end
+
+if.end:
+  %incdec.ptr = getelementptr inbounds nuw i8, ptr %str, i64 1
+  %1 = load i8, ptr %incdec.ptr, align 1
+  %cmp4 = icmp eq i8 %1, 0
+  br i1 %cmp4, label %return, label %if.end7
 
-2:                                                ; preds = %2, %1
-  %3 = phi ptr [ %0, %1 ], [ %6, %2 ]
-  %4 = load volatile i8, ptr %3, align 1
-  %5 = icmp eq i8 %4, 0
-  %6 = getelementptr inbounds i8, ptr %3, i64 1
-  br i1 %5, label %7, label %2
+if.end7:
+  %incdec.ptr2 = getelementptr inbounds nuw i8, ptr %str, i64 2
+  %2 = load i8, ptr %incdec.ptr2, align 1
+  %cmp10 = icmp eq i8 %2, 0
+  br i1 %cmp10, label %return, label %while.cond.preheader
+
+while.cond.preheader:
+  %incdec.ptr8 = getelementptr inbounds nuw i8, ptr %str, i64 3
+  br label %while.cond
 
-7:                                                ; preds = %2
-  %8 = ptrtoint ptr %3 to i64
-  %9 = ptrtoint ptr %0 to i64
-  %10 = sub i64 %8, %9
-  ret i64 %10
+while.cond:
+  %str.addr.0 = phi ptr [ %incdec.ptr14, %while.cond ], [ %incdec.ptr8, %while.cond.preheader ]
+  %3 = load i8, ptr %str.addr.0, align 1
+  %tobool.not = icmp eq i8 %3, 0
+  %incdec.ptr14 = getelementptr inbounds nuw i8, ptr %str.addr.0, i64 1
+  br i1 %tobool.not, label %while.end, label %while.cond
+
+while.end:
+  tail call void @use(ptr noundef nonnull %str.addr.0) #3
+  br label %return
+
+return:
+  ret void
 }
 
-define i64 @invalid_strlen_idx_idiom(ptr %0) {
-; CHECK-LABEL: define i64 @invalid_strlen_idx_idiom(
-; CHECK-SAME: ptr [[TMP0:%.*]]) {
-; CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[TMP0]], align 1
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i8 [[TMP2]], 0
-; CHECK-NEXT:    br i1 [[TMP3]], label %[[BB13:.*]], label %[[DOTPREHEADER:.*]]
-; CHECK:       [[_PREHEADER:.*:]]
-; CHECK-NEXT:    br label %[[BB4:.*]]
-; CHECK:       [[BB4]]:
-; CHECK-NEXT:    [[TMP5:%.*]] = phi i32 [ [[TMP7:%.*]], %[[BB4]] ], [ 0, %[[DOTPREHEADER]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = phi ptr [ [[TMP8:%.*]], %[[BB4]] ], [ [[TMP0]], %[[DOTPREHEADER]] ]
-; CHECK-NEXT:    [[TMP7]] = add nuw nsw i32 [[TMP5]], 1
-; CHECK-NEXT:    [[TMP8]] = getelementptr inbounds i8, ptr [[TMP6]], i64 1
-; CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 1
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i8 [[TMP9]], 0
-; CHECK-NEXT:    br i1 [[TMP10]], label %[[BB11:.*]], label %[[BB4]]
-; CHECK:       [[BB11]]:
-; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i32 [ [[TMP7]], %[[BB4]] ]
-; CHECK-NEXT:    [[TMP12:%.*]] = zext nneg i32 [[DOTLCSSA]] to i64
-; CHECK-NEXT:    br label %[[BB13]]
-; CHECK:       [[BB13]]:
-; CHECK-NEXT:    [[TMP14:%.*]] = phi i64 [ 0, [[TMP1:%.*]] ], [ [[TMP12]], %[[BB11]] ]
-; CHECK-NEXT:    ret i64 [[TMP14]]
+; void valid_nested_idiom(const char** strs, int n) {
+;     for (int i = 0; i < n; ++i) {
+;         const char* s = strs[i];
+;         int count = 0;
+;         while (*s) {
+;             ++s;
+;             ++count;
+;         }
+;         usei(count);
+;     }
+; }
+define void @valid_nested_idiom(ptr %strs, i32 %n) {
+; CHECK-LABEL: define void @valid_nested_idiom(
+; CHECK-SAME: ptr [[STRS:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP9]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[N]] to i64
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT:    br label %[[FOR_COND_CLEANUP]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_END:.*]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[STRS]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[TMP0]], align 1
+; CHECK-NEXT:    [[TOBOOL_NOT6:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT6]], label %[[WHILE_END]], label %[[WHILE_BODY_PREHEADER:.*]]
+; CHECK:       [[WHILE_BODY_PREHEADER]]:
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[TMP0]], i64 1
+; CHECK-NEXT:    [[STRLEN:%.*]] = call i64 @strlen(ptr [[SCEVGEP]])
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[STRLEN]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
+; CHECK-NEXT:    br label %[[WHILE_BODY:.*]]
+; CHECK:       [[WHILE_BODY]]:
+; CHECK-NEXT:    [[COUNT_08:%.*]] = phi i32 [ [[INC:%.*]], %[[WHILE_BODY]] ], [ 0, %[[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[S_07:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], %[[WHILE_BODY]] ], [ [[TMP0]], %[[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds nuw i8, ptr [[S_07]], i64 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[COUNT_08]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[INCDEC_PTR]], align 1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i8 [[TMP4]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label %[[WHILE_END_LOOPEXIT:.*]], label %[[WHILE_BODY]]
+; CHECK:       [[WHILE_END_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[WHILE_END]]
+; CHECK:       [[WHILE_END]]:
+; CHECK-NEXT:    [[COUNT_0_LCSSA:%.*]] = phi i32 [ 0, %[[FOR_BODY]] ], [ [[TMP3]], %[[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT:    tail call void @usei(i32 [[COUNT_0_LCSSA]])
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]]
 ;
-  %2 = load i8, ptr %0, align 1
-  %3 = icmp eq i8 %2, 0
-  br i1 %3, label %13, label %4
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
 
-4:                                                ; preds = %1, %4
-  %5 = phi i32 [ %7, %4 ], [ 0, %1 ]
-  %6 = phi ptr [ %8, %4 ], [ %0, %1 ]
-  %7 = add nuw nsw i32 %5, 1
-  %8 = getelementptr inbounds i8, ptr %6, i64 1
-  %9 = load i8, ptr %8, align 1
-  %10 = icmp eq i8 %9, 0
-  br i1 %10, label %11, label %4
+for.body.preheader:
+  %wide.trip.count = zext nneg i32 %n to i64
+  br label %for.body
 
-11:                                               ; preds = %4
-  %12 = zext nneg i32 %7 to i64
-  br label %13
+for.cond.cleanup:
+  ret void
 
-13:                                               ; preds = %11, %1
-  %14 = phi i64 [ 0, %1 ], [ %12, %11 ]
-  ret i64 %14
-}
+for.body:
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %while.end ]
+  %arrayidx = getelementptr inbounds ptr, ptr %strs, i64 %indvars.iv
+  %0 = load ptr, ptr %arrayidx, align 8
+  %1 = load i8, ptr %0, align 1
+  %tobool.not6 = icmp eq i8 %1, 0
+  br i1 %tobool.not6, label %while.end, label %while.body
 
+while.body:
+  %count.08 = phi i32 [ %inc, %while.body ], [ 0, %for.body ]
+  %s.07 = phi ptr [ %incdec.ptr, %while.body ], [ %0, %for.body ]
+  %incdec.ptr = getelementptr inbounds nuw i8, ptr %s.07, i64 1
+  %inc = add nuw nsw i32 %count.08, 1
+  %2 = load i8, ptr %incdec.ptr, align 1
+  %tobool.not = icmp eq i8 %2, 0
+  br i1 %tobool.not, label %while.end, label %while.body
 
+while.end:
+  %count.0.lcssa = phi i32 [ 0, %for.body ], [ %inc, %while.body ]
+  tail call void @usei(i32 %count.0.lcssa) #2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}

>From 7c7437fc82edb63f6ff79abcf60369069aadf0ac Mon Sep 17 00:00:00 2001
From: Henry Jiang <henry.jiang1 at ibm.com>
Date: Sun, 2 Feb 2025 01:00:48 -0500
Subject: [PATCH 08/13] Refactor strlen detection

---
 .../Transforms/Scalar/LoopIdiomRecognize.cpp  | 279 ++++++++----------
 1 file changed, 126 insertions(+), 153 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 8a898bc7bd5a15..47e34427be4077 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -1544,141 +1544,141 @@ static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
   return nullptr;
 }
 
-struct StrlenIdiom {
-  unsigned IdiomSize;
-  ConstantInt *StepSize;
-  const SCEV *LoadBaseEv;
-  Type *LoadType;
-};
-
-/// Trying to detect strlen idiom that increments a char pointer
-/// with a single loop body bb.
-static bool detectStrLenIdiom(const Loop *CurLoop, ScalarEvolution *SE,
-                              const TargetLibraryInfo *TLI,
-                              StrlenIdiom &Idiom) {
-  /*
-  outs() << "current loop:\n";
-  CurLoop->print(outs());
-  outs() << "\n";
-  */
-
-  // Give up if the loop has multiple blocks, multiple backedges, or
-  // multiple exit blocks
-  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1 ||
-      !CurLoop->getUniqueExitBlock())
-    return false;
+namespace {
 
-  // It should have a preheader and a branch instruction.
-  BasicBlock *Preheader = CurLoop->getLoopPreheader();
-  if (!Preheader)
-    return false;
+class StrlenVerifier {
+public:
+  explicit StrlenVerifier(const Loop *CurLoop, ScalarEvolution *SE,
+                          const TargetLibraryInfo *TLI)
+      : CurLoop(CurLoop), SE(SE), TLI(TLI) {}
+
+  bool isValidStrlenIdiom() {
+    // Give up if the loop has multiple blocks, multiple backedges, or
+    // multiple exit blocks
+    if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1 ||
+        !CurLoop->getUniqueExitBlock())
+      return false;
 
-  BranchInst *EntryBI = dyn_cast<BranchInst>(Preheader->getTerminator());
-  if (!EntryBI)
-    return false;
+    // It should have a preheader and a branch instruction.
+    BasicBlock *Preheader = CurLoop->getLoopPreheader();
+    if (!Preheader)
+      return false;
 
-  // The loop exit must be conditioned on an icmp with 0 the null terminator.
-  // The icmp operand has to be a load on some SSA reg that increments
-  // by 1 in the loop.
-  BasicBlock *LoopBody = *CurLoop->block_begin();
+    BranchInst *EntryBI = dyn_cast<BranchInst>(Preheader->getTerminator());
+    if (!EntryBI)
+      return false;
 
-  // Skip if the body is too big as it most likely is not a strlen idiom.
-  if (!LoopBody || LoopBody->size() >= 15)
-    return false;
+    // The loop exit must be conditioned on an icmp with 0, the null
+    // terminator. The icmp operand has to be a load on some SSA reg that
+    // increments by 1 in the loop.
+    BasicBlock *LoopBody = *CurLoop->block_begin();
 
-  BranchInst *LoopTerm = dyn_cast<BranchInst>(LoopBody->getTerminator());
-  Value *LoopCond = matchCondition(LoopTerm, LoopBody);
-  if (!LoopCond)
-    return false;
+    // Skip if the body is too big as it most likely is not a strlen idiom.
+    if (!LoopBody || LoopBody->size() >= 15)
+      return false;
 
-  auto *LoopLoad = dyn_cast<LoadInst>(LoopCond);
-  if (!LoopLoad || LoopLoad->getPointerAddressSpace() != 0)
-    return false;
+    BranchInst *LoopTerm = dyn_cast<BranchInst>(LoopBody->getTerminator());
+    Value *LoopCond = matchCondition(LoopTerm, LoopBody);
+    if (!LoopCond)
+      return false;
 
-  Type *OperandType = LoopLoad->getType();
-  if (!OperandType || !OperandType->isIntegerTy())
-    return false;
+    auto *LoopLoad = dyn_cast<LoadInst>(LoopCond);
+    if (!LoopLoad || LoopLoad->getPointerAddressSpace() != 0)
+      return false;
 
-  // See if the pointer expression is an AddRec with constant step a of form
-  // ({n,+,a}) where a is the width of the char type.
-  auto *IncPtr = LoopLoad->getPointerOperand();
-  const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IncPtr));
-  if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
-    return false;
+    OperandType = LoopLoad->getType();
+    if (!OperandType || !OperandType->isIntegerTy())
+      return false;
 
-  /*
-  outs() << "pointer load ev: ";
-  LoadEv->print(outs());
-  outs() << "\n";
-  */
+    // See if the pointer expression is an AddRec with constant step a of form
+    // ({n,+,a}) where a is the width of the char type.
+    Value *IncPtr = LoopLoad->getPointerOperand();
+    const SCEVAddRecExpr *LoadEv =
+        dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IncPtr));
+    if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
+      return false;
+    LoadBaseEv = LoadEv->getStart();
 
-  const SCEVConstant *Step =
-      dyn_cast<SCEVConstant>(LoadEv->getStepRecurrence(*SE));
-  if (!Step)
-    return false;
+    LLVM_DEBUG({
+      dbgs() << "pointer load scev: ";
+      LoadEv->print(dbgs());
+      dbgs() << "\n";
+    });
 
-  unsigned StepSize = 0;
-  ConstantInt *StepSizeCI = dyn_cast<ConstantInt>(Step->getValue());
-  if (!StepSizeCI)
-    return false;
-  StepSize = StepSizeCI->getZExtValue();
+    const SCEVConstant *Step =
+        dyn_cast<SCEVConstant>(LoadEv->getStepRecurrence(*SE));
+    if (!Step)
+      return false;
 
-  // Verify that StepSize is consistent with platform char width.
-  unsigned OpWidth = OperandType->getIntegerBitWidth();
-  unsigned WcharSize = TLI->getWCharSize(*LoopLoad->getModule());
-  if (OpWidth != StepSize * 8)
-    return false;
-  if (OpWidth != 8 && OpWidth != 16 && OpWidth != 32)
-    return false;
-  if (OpWidth >= 16)
-    if (OpWidth != WcharSize * 8)
+    unsigned StepSize = 0;
+    StepSizeCI = dyn_cast<ConstantInt>(Step->getValue());
+    if (!StepSizeCI)
       return false;
+    StepSize = StepSizeCI->getZExtValue();
 
-  // Scan every instruction in the loop to ensure there are no side effects.
-  for (auto &I : *LoopBody)
-    if (I.mayHaveSideEffects())
+    // Verify that StepSize is consistent with platform char width.
+    OpWidth = OperandType->getIntegerBitWidth();
+    unsigned WcharSize = TLI->getWCharSize(*LoopLoad->getModule());
+    if (OpWidth != StepSize * 8)
       return false;
+    if (OpWidth != 8 && OpWidth != 16 && OpWidth != 32)
+      return false;
+    if (OpWidth >= 16)
+      if (OpWidth != WcharSize * 8)
+        return false;
 
-  auto *LoopExitBB = CurLoop->getExitBlock();
-  if (!LoopExitBB)
-    return false;
+    // Scan every instruction in the loop to ensure there are no side effects.
+    for (auto &I : *LoopBody)
+      if (I.mayHaveSideEffects())
+        return false;
 
-  for (PHINode &PN : LoopExitBB->phis()) {
-    if (!SE->isSCEVable(PN.getType()))
+    BasicBlock *LoopExitBB = CurLoop->getExitBlock();
+    if (!LoopExitBB)
       return false;
 
-    const SCEV *Ev = SE->getSCEV(&PN);
-    /*
-    outs() << "loop exit block scev exprs: ";
-    PN.print(outs());
-    if (Ev)
-      Ev->print(outs());
-    outs() << "\n";
-    */
-
-    if (!Ev)
-      return false;
+    for (PHINode &PN : LoopExitBB->phis()) {
+      if (!SE->isSCEVable(PN.getType()))
+        return false;
 
-    // Since we verified that the loop trip count will be a valid strlen idiom,
-    // we can expand all lcssa phi with {n,+,1} as (n + strlen) and use
-    // SCEVExpander materialize the loop output.
-    const SCEVAddRecExpr *AddRecEv = dyn_cast<SCEVAddRecExpr>(Ev);
-    if (!AddRecEv || !AddRecEv->isAffine())
-      return false;
+      const SCEV *Ev = SE->getSCEV(&PN);
+      if (!Ev)
+        return false;
 
-    // We only want RecAddExpr with recurrence step that are constant. This
-    // is good enough for all the idioms we want to recognize. Later we expand
-    // the recurrence as {base,+,a} -> (base + a * strlen) and materialize
-    if (!dyn_cast<SCEVConstant>(AddRecEv->getStepRecurrence(*SE)))
-      return false;
+      LLVM_DEBUG({
+        dbgs() << "loop exit phi scev: ";
+        Ev->print(dbgs());
+        dbgs() << "\n";
+      });
+
+      // Since we verified that the loop trip count will be a valid strlen
+      // idiom, we can expand all lcssa phi with {n,+,1} as (n + strlen) and use
+      // SCEVExpander to materialize the loop output.
+      const SCEVAddRecExpr *AddRecEv = dyn_cast<SCEVAddRecExpr>(Ev);
+      if (!AddRecEv || !AddRecEv->isAffine())
+        return false;
+
+      // We only want AddRecExprs whose recurrence step is constant. This
+      // is good enough for all the idioms we want to recognize. Later we expand
+      // the recurrence as {base,+,a} -> (base + a * strlen) and materialize it.
+      if (!dyn_cast<SCEVConstant>(AddRecEv->getStepRecurrence(*SE)))
+        return false;
+    }
+
+    return true;
   }
 
-  Idiom.LoadBaseEv = LoadEv->getStart();
-  Idiom.IdiomSize = OpWidth;
-  Idiom.StepSize = StepSizeCI;
-  Idiom.LoadType = OperandType;
-  return true;
-}
+public:
+  const Loop *CurLoop;
+  ScalarEvolution *SE;
+  const TargetLibraryInfo *TLI;
+
+  unsigned OpWidth;
+  ConstantInt *StepSizeCI;
+  const SCEV *LoadBaseEv;
+  Type *OperandType;
+};
+
+} // namespace
 
 /// Recognizes a strlen idiom by checking for loops that increment
 /// a char pointer and then subtract with the base pointer.
@@ -1702,21 +1702,13 @@ static bool detectStrLenIdiom(const Loop *CurLoop, ScalarEvolution *SE,
 ///
 /// Later the pointer subtraction will be folded by InstCombine
 bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
-  /*
-  const auto *First = CurLoop->block_begin();
-  if (First != CurLoop->block_end()) {
-    auto *F = (*First)->getParent();
-    outs() << "\n\n\n\n\n========== NEW LOOP ============\n";
-    F->print(outs());
-  }
-  */
-
-  // TODO: check for disable options
-  StrlenIdiom Idiom;
-  if (!detectStrLenIdiom(CurLoop, SE, TLI, Idiom))
+  if (DisableLIRP::All)
     return false;
 
-  // outs() << "idiom is good\n\n";
+  StrlenVerifier Verifier(CurLoop, SE, TLI);
+
+  if (!Verifier.isValidStrlenIdiom())
+    return false;
 
   BasicBlock *Preheader = CurLoop->getLoopPreheader();
   BasicBlock *LoopExitBB = CurLoop->getExitBlock();
@@ -1724,18 +1716,17 @@ bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
   IRBuilder<> Builder(Preheader->getTerminator());
   SCEVExpander Expander(*SE, Preheader->getModule()->getDataLayout(), "scev");
   Value *MaterialzedBase = Expander.expandCodeFor(
-      Idiom.LoadBaseEv, Idiom.LoadBaseEv->getType(), Builder.GetInsertPoint());
+      Verifier.LoadBaseEv, Verifier.LoadBaseEv->getType(),
+      Builder.GetInsertPoint());
 
   Value *StrLenFunc = nullptr;
-  switch (Idiom.IdiomSize) {
-  case 8:
+  if (Verifier.OpWidth == 8) {
     if (!isLibFuncEmittable(Preheader->getModule(), TLI, LibFunc_strlen))
       return false;
     StrLenFunc = emitStrLen(MaterialzedBase, Builder, *DL, TLI);
-    break;
-  case 16:
-  case 32:
-    if (!isLibFuncEmittable(Preheader->getModule(), TLI, LibFunc_wcslen))
+  } else {
+    if (DisableLIRP::Wcslen ||
+        !isLibFuncEmittable(Preheader->getModule(), TLI, LibFunc_wcslen))
       return false;
     StrLenFunc = emitWcsLen(MaterialzedBase, Builder, *DL, TLI);
   }
@@ -1750,39 +1741,21 @@ bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
         dyn_cast<SCEVConstant>(AddRecEv->getStepRecurrence(*SE));
     const SCEV *Base = AddRecEv->getStart();
 
-    /*
-    outs() << "creating new mult scev: ";
-    Base->getType()->print(outs());
-    outs() << " ";
-    Step->getType()->print(outs());
-    outs() << " ";
-    StrlenEv->getType()->print(outs());
-    outs() << "\n";
-    */
-
     // It is safe to truncate to base since if base is narrower than size_t
     // the equivalent user code will have to truncate anyways.
     const SCEV *NewEv = SE->getAddExpr(
         Base, SE->getMulExpr(Step, SE->getTruncateOrSignExtend(
                                        StrlenEv, Base->getType())));
 
-    /*
-    outs() << "new ev exprs: ";
-    PN.print(outs());
-    if (NewEv)
-      NewEv->print(outs());
-    outs() << "\n";
-    */
-
-    Expander.clear();
     Value *MaterializedPHI = Expander.expandCodeFor(NewEv, NewEv->getType(),
                                                     Builder.GetInsertPoint());
+    Expander.clear();
     PN.replaceAllUsesWith(MaterializedPHI);
     Cleanup.push_back(&PN);
   }
 
-  // All LCSSA Loop Phi are dead, the left over loop body can be cleaned up by
-  // later passes
+  // All LCSSA Loop Phi are dead, the left over dead loop body can be cleaned 
+  // up by later passes
   for (PHINode *PN : Cleanup) {
     RecursivelyDeleteDeadPHINode(PN);
   }

>From 6226db2c22a74e198f5f9ff376b1a8259edbdda7 Mon Sep 17 00:00:00 2001
From: Henry Jiang <henry.jiang1 at ibm.com>
Date: Sun, 2 Feb 2025 13:14:28 -0500
Subject: [PATCH 09/13] update tests

---
 llvm/test/Transforms/LoopIdiom/strlen.ll   | 192 ++++++++++++++++++++-
 llvm/test/Transforms/LoopIdiom/wcslen16.ll |  72 +++++++-
 llvm/test/Transforms/LoopIdiom/wcslen32.ll |  74 +++++++-
 3 files changed, 326 insertions(+), 12 deletions(-)

diff --git a/llvm/test/Transforms/LoopIdiom/strlen.ll b/llvm/test/Transforms/LoopIdiom/strlen.ll
index 0dc833ec0e35f3..137a17f541cd46 100644
--- a/llvm/test/Transforms/LoopIdiom/strlen.ll
+++ b/llvm/test/Transforms/LoopIdiom/strlen.ll
@@ -345,7 +345,7 @@ return:
 ;     }
 ; }
 define void @valid_nested_idiom(ptr %strs, i32 %n) {
-; CHECK-LABEL: define void @nested_idiom(
+; CHECK-LABEL: define void @valid_nested_idiom(
 ; CHECK-SAME: ptr [[STRS:%.*]], i32 [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
@@ -422,3 +422,193 @@ while.end:
   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
+
+define i64 @invalid_strlen_has_side_effects(ptr %str) {
+; CHECK-LABEL: define i64 @invalid_strlen_has_side_effects(
+; CHECK-SAME: ptr [[STR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[WHILE_COND:.*]]
+; CHECK:       [[WHILE_COND]]:
+; CHECK-NEXT:    [[STR_ADDR_0:%.*]] = phi ptr [ [[STR]], %[[ENTRY]] ], [ [[INCDEC_PTR:%.*]], %[[WHILE_COND]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load volatile i8, ptr [[STR_ADDR_0]], align 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr i8, ptr [[STR_ADDR_0]], i64 1
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label %[[WHILE_END:.*]], label %[[WHILE_COND]]
+; CHECK:       [[WHILE_END]]:
+; CHECK-NEXT:    [[STR_ADDR_0_LCSSA:%.*]] = phi ptr [ [[STR_ADDR_0]], %[[WHILE_COND]] ]
+; CHECK-NEXT:    [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[STR_ADDR_0_LCSSA]] to i64
+; CHECK-NEXT:    [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[STR]] to i64
+; CHECK-NEXT:    [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+; CHECK-NEXT:    ret i64 [[SUB_PTR_SUB]]
+;
+entry:
+  br label %while.cond
+
+while.cond:
+  %str.addr.0 = phi ptr [ %str, %entry ], [ %incdec.ptr, %while.cond ]
+  %0 = load volatile i8, ptr %str.addr.0, align 1
+  %cmp.not = icmp eq i8 %0, 0
+  %incdec.ptr = getelementptr i8, ptr %str.addr.0, i64 1
+  br i1 %cmp.not, label %while.end, label %while.cond
+
+while.end:
+  %sub.ptr.lhs.cast = ptrtoint ptr %str.addr.0 to i64
+  %sub.ptr.rhs.cast = ptrtoint ptr %str to i64
+  %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
+  ret i64 %sub.ptr.sub
+}
+
+
+define i8 @invalid_exit_phi_scev(ptr %str) {
+; CHECK-LABEL: define i8 @invalid_exit_phi_scev(
+; CHECK-SAME: ptr [[STR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[WHILE_COND:.*]]
+; CHECK:       [[WHILE_COND]]:
+; CHECK-NEXT:    [[STR_ADDR_0:%.*]] = phi ptr [ [[STR]], %[[ENTRY]] ], [ [[INCDEC_PTR:%.*]], %[[WHILE_COND]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[STR_ADDR_0]], align 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr i8, ptr [[STR_ADDR_0]], i64 1
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label %[[WHILE_END:.*]], label %[[WHILE_COND]]
+; CHECK:       [[WHILE_END]]:
+; CHECK-NEXT:    [[STR_ADDR_0_LCSSA:%.*]] = phi ptr [ [[STR_ADDR_0]], %[[WHILE_COND]] ]
+; CHECK-NEXT:    [[DOTLCSSA:%.*]] = phi i8 [ [[TMP0]], %[[WHILE_COND]] ]
+; CHECK-NEXT:    [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[STR_ADDR_0_LCSSA]] to i64
+; CHECK-NEXT:    [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[STR]] to i64
+; CHECK-NEXT:    [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+; CHECK-NEXT:    ret i8 [[DOTLCSSA]]
+;
+entry:
+  br label %while.cond
+
+while.cond:
+  %str.addr.0 = phi ptr [ %str, %entry ], [ %incdec.ptr, %while.cond ]
+  %0 = load i8, ptr %str.addr.0, align 1
+  %cmp.not = icmp eq i8 %0, 0
+  %incdec.ptr = getelementptr i8, ptr %str.addr.0, i64 1
+  br i1 %cmp.not, label %while.end, label %while.cond
+
+while.end:
+  %sub.ptr.lhs.cast = ptrtoint ptr %str.addr.0 to i64
+  %sub.ptr.rhs.cast = ptrtoint ptr %str to i64
+  %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
+
+  ; %0.lcssa has invalid scev rec {%0} expected to be {%str,+,constant}
+  ret i8 %0
+}
+
+
+
+define i64 @invalid_branch_cond(ptr %str) {
+; CHECK-LABEL: define i64 @invalid_branch_cond(
+; CHECK-SAME: ptr [[STR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[WHILE_COND:.*]]
+; CHECK:       [[WHILE_COND]]:
+; CHECK-NEXT:    [[STR_ADDR_0:%.*]] = phi ptr [ [[STR]], %[[ENTRY]] ], [ [[INCDEC_PTR:%.*]], %[[WHILE_COND]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[STR_ADDR_0]], align 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i8 [[TMP0]], 10
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr i8, ptr [[STR_ADDR_0]], i64 1
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label %[[WHILE_END:.*]], label %[[WHILE_COND]]
+; CHECK:       [[WHILE_END]]:
+; CHECK-NEXT:    [[STR_ADDR_0_LCSSA:%.*]] = phi ptr [ [[STR_ADDR_0]], %[[WHILE_COND]] ]
+; CHECK-NEXT:    [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[STR_ADDR_0_LCSSA]] to i64
+; CHECK-NEXT:    [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[STR]] to i64
+; CHECK-NEXT:    [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+; CHECK-NEXT:    ret i64 [[SUB_PTR_SUB]]
+;
+entry:
+  br label %while.cond
+
+while.cond:
+  %str.addr.0 = phi ptr [ %str, %entry ], [ %incdec.ptr, %while.cond ]
+  %0 = load i8, ptr %str.addr.0, align 1
+
+  ; We compare against '\n' instead of '\0'
+  %cmp.not = icmp eq i8 %0, 10
+
+  %incdec.ptr = getelementptr i8, ptr %str.addr.0, i64 1
+  br i1 %cmp.not, label %while.end, label %while.cond
+
+while.end:
+  %sub.ptr.lhs.cast = ptrtoint ptr %str.addr.0 to i64
+  %sub.ptr.rhs.cast = ptrtoint ptr %str to i64
+  %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
+  ret i64 %sub.ptr.sub
+}
+
+define i64 @invalid_unknown_step_size(ptr %str, i64 %step) {
+; CHECK-LABEL: define i64 @invalid_unknown_step_size(
+; CHECK-SAME: ptr [[STR:%.*]], i64 [[STEP:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[WHILE_COND:.*]]
+; CHECK:       [[WHILE_COND]]:
+; CHECK-NEXT:    [[STR_ADDR_0:%.*]] = phi ptr [ [[STR]], %[[ENTRY]] ], [ [[INCDEC_PTR:%.*]], %[[WHILE_COND]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[STR_ADDR_0]], align 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr i8, ptr [[STR_ADDR_0]], i64 [[STEP]]
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label %[[WHILE_END:.*]], label %[[WHILE_COND]]
+; CHECK:       [[WHILE_END]]:
+; CHECK-NEXT:    [[STR_ADDR_0_LCSSA:%.*]] = phi ptr [ [[STR_ADDR_0]], %[[WHILE_COND]] ]
+; CHECK-NEXT:    [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[STR_ADDR_0_LCSSA]] to i64
+; CHECK-NEXT:    [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[STR]] to i64
+; CHECK-NEXT:    [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+; CHECK-NEXT:    ret i64 [[SUB_PTR_SUB]]
+;
+entry:
+  br label %while.cond
+
+while.cond:
+  %str.addr.0 = phi ptr [ %str, %entry ], [ %incdec.ptr, %while.cond ]
+  %0 = load i8, ptr %str.addr.0, align 1
+  %cmp.not = icmp eq i8 %0, 0
+  %incdec.ptr = getelementptr i8, ptr %str.addr.0, i64 %step
+  br i1 %cmp.not, label %while.end, label %while.cond
+
+while.end:
+  %sub.ptr.lhs.cast = ptrtoint ptr %str.addr.0 to i64
+  %sub.ptr.rhs.cast = ptrtoint ptr %str to i64
+  %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
+  ret i64 %sub.ptr.sub
+}
+
+declare ptr @pure(ptr) #0;
+attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
+
+define i64 @invalid_add_rec(ptr %str) {
+; CHECK-LABEL: define i64 @invalid_add_rec(
+; CHECK-SAME: ptr [[STR:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[WHILE_COND:.*]]
+; CHECK:       [[WHILE_COND]]:
+; CHECK-NEXT:    [[STR_ADDR_0:%.*]] = phi ptr [ [[STR]], %[[ENTRY]] ], [ [[INCDEC_PTR:%.*]], %[[WHILE_COND]] ]
+; CHECK-NEXT:    [[INDIRECT:%.*]] = tail call ptr @pure(ptr [[STR_ADDR_0]])
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[INDIRECT]], align 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i8 [[TMP0]], 0
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr i8, ptr [[STR_ADDR_0]], i64 1
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label %[[WHILE_END:.*]], label %[[WHILE_COND]]
+; CHECK:       [[WHILE_END]]:
+; CHECK-NEXT:    [[STR_ADDR_0_LCSSA:%.*]] = phi ptr [ [[STR_ADDR_0]], %[[WHILE_COND]] ]
+; CHECK-NEXT:    [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[STR_ADDR_0_LCSSA]] to i64
+; CHECK-NEXT:    [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[STR]] to i64
+; CHECK-NEXT:    [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+; CHECK-NEXT:    ret i64 [[SUB_PTR_SUB]]
+;
+entry:
+  br label %while.cond
+
+while.cond:
+  %str.addr.0 = phi ptr [ %str, %entry ], [ %incdec.ptr, %while.cond ]
+  %indirect = tail call ptr @pure(ptr %str.addr.0)
+  %0 = load i8, ptr %indirect, align 1
+  %cmp.not = icmp eq i8 %0, 0
+  %incdec.ptr = getelementptr i8, ptr %str.addr.0, i64 1
+  br i1 %cmp.not, label %while.end, label %while.cond
+
+while.end:
+  %sub.ptr.lhs.cast = ptrtoint ptr %str.addr.0 to i64
+  %sub.ptr.rhs.cast = ptrtoint ptr %str to i64
+  %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
+  ret i64 %sub.ptr.sub
+}
+
diff --git a/llvm/test/Transforms/LoopIdiom/wcslen16.ll b/llvm/test/Transforms/LoopIdiom/wcslen16.ll
index 6c140ddf90d4e3..d3b0b8d208cd83 100644
--- a/llvm/test/Transforms/LoopIdiom/wcslen16.ll
+++ b/llvm/test/Transforms/LoopIdiom/wcslen16.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes='loop-idiom' < %s -S | FileCheck %s
+; RUN: opt -passes='loop(loop-idiom),verify' < %s -S | FileCheck %s
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -15,14 +15,18 @@ define i64 @valid_strlen16(ptr %src) {
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i16 [[TMP0]], 0
 ; CHECK-NEXT:    br i1 [[CMP1]], label %[[RETURN]], label %[[WHILE_COND_PREHEADER:.*]]
 ; CHECK:       [[WHILE_COND_PREHEADER]]:
-; CHECK-NEXT:    [[NEWGEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 -1
+; CHECK-NEXT:    [[NEWGEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 2
 ; CHECK-NEXT:    [[WCSLEN:%.*]] = call i64 @wcslen(ptr [[NEWGEP]])
-; CHECK-NEXT:    [[END:%.*]] = getelementptr i16, ptr [[NEWGEP]], i64 [[WCSLEN]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[WCSLEN]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 2
+; CHECK-NEXT:    [[END:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP2]]
 ; CHECK-NEXT:    br label %[[WHILE_COND:.*]]
 ; CHECK:       [[WHILE_COND]]:
-; CHECK-NEXT:    [[SRC_PN:%.*]] = phi ptr [ poison, %[[WHILE_COND]] ], [ [[SRC]], %[[WHILE_COND_PREHEADER]] ]
-; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i16 poison, 0
-; CHECK-NEXT:    br i1 true, label %[[WHILE_END:.*]], label %[[WHILE_COND]]
+; CHECK-NEXT:    [[SRC_PN:%.*]] = phi ptr [ [[CURR_0:%.*]], %[[WHILE_COND]] ], [ [[SRC]], %[[WHILE_COND_PREHEADER]] ]
+; CHECK-NEXT:    [[CURR_0]] = getelementptr inbounds i8, ptr [[SRC_PN]], i64 2
+; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[CURR_0]], align 2
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i16 [[TMP3]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label %[[WHILE_END:.*]], label %[[WHILE_COND]]
 ; CHECK:       [[WHILE_END]]:
 ; CHECK-NEXT:    [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[END]] to i64
 ; CHECK-NEXT:    [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[SRC]] to i64
@@ -61,6 +65,62 @@ return:                                           ; preds = %entry, %lor.lhs.fal
   ret i64 %retval.0
 }
 
+define i64 @invalid_char_size(ptr %src) {
+; CHECK-LABEL: define i64 @invalid_char_size(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr [[SRC]], null
+; CHECK-NEXT:    br i1 [[CMP]], label %[[RETURN:.*]], label %[[LOR_LHS_FALSE:.*]]
+; CHECK:       [[LOR_LHS_FALSE]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[SRC]], align 2
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[CMP1]], label %[[RETURN]], label %[[WHILE_COND_PREHEADER:.*]]
+; CHECK:       [[WHILE_COND_PREHEADER]]:
+; CHECK-NEXT:    br label %[[WHILE_COND:.*]]
+; CHECK:       [[WHILE_COND]]:
+; CHECK-NEXT:    [[SRC_PN:%.*]] = phi ptr [ [[CURR_0:%.*]], %[[WHILE_COND]] ], [ [[SRC]], %[[WHILE_COND_PREHEADER]] ]
+; CHECK-NEXT:    [[CURR_0]] = getelementptr inbounds i8, ptr [[SRC_PN]], i64 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[CURR_0]], align 4
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label %[[WHILE_END:.*]], label %[[WHILE_COND]]
+; CHECK:       [[WHILE_END]]:
+; CHECK-NEXT:    [[CURR_0_LCSSA:%.*]] = phi ptr [ [[CURR_0]], %[[WHILE_COND]] ]
+; CHECK-NEXT:    [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[CURR_0_LCSSA]] to i64
+; CHECK-NEXT:    [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[SRC]] to i64
+; CHECK-NEXT:    [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+; CHECK-NEXT:    [[SUB_PTR_DIV:%.*]] = ashr exact i64 [[SUB_PTR_SUB]], 2
+; CHECK-NEXT:    br label %[[RETURN]]
+; CHECK:       [[RETURN]]:
+; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi i64 [ [[SUB_PTR_DIV]], %[[WHILE_END]] ], [ 0, %[[LOR_LHS_FALSE]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL_0]]
+;
+entry:
+  %cmp = icmp eq ptr %src, null
+  br i1 %cmp, label %return, label %lor.lhs.false
+
+lor.lhs.false:                                    ; preds = %entry
+  %0 = load i32, ptr %src, align 2
+  %cmp1 = icmp eq i32 %0, 0
+  br i1 %cmp1, label %return, label %while.cond
+
+while.cond:                                       ; preds = %lor.lhs.false, %while.cond
+  %src.pn = phi ptr [ %curr.0, %while.cond ], [ %src, %lor.lhs.false ]
+  %curr.0 = getelementptr inbounds i8, ptr %src.pn, i64 4
+  %1 = load i32, ptr %curr.0, align 4
+  %tobool.not = icmp eq i32 %1, 0
+  br i1 %tobool.not, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  %sub.ptr.lhs.cast = ptrtoint ptr %curr.0 to i64
+  %sub.ptr.rhs.cast = ptrtoint ptr %src to i64
+  %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
+  %sub.ptr.div = ashr exact i64 %sub.ptr.sub, 2
+  br label %return
+
+return:                                           ; preds = %entry, %lor.lhs.false, %while.end
+  %retval.0 = phi i64 [ %sub.ptr.div, %while.end ], [ 0, %lor.lhs.false ], [ 0, %entry ]
+  ret i64 %retval.0
+}
 !llvm.module.flags = !{!0}
 !0 = !{i32 1, !"wchar_size", i32 2}
 
diff --git a/llvm/test/Transforms/LoopIdiom/wcslen32.ll b/llvm/test/Transforms/LoopIdiom/wcslen32.ll
index fad4c52078967f..76936b537fad74 100644
--- a/llvm/test/Transforms/LoopIdiom/wcslen32.ll
+++ b/llvm/test/Transforms/LoopIdiom/wcslen32.ll
@@ -15,14 +15,18 @@ define i64 @valid_wcslen32(ptr %src) {
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[TMP0]], 0
 ; CHECK-NEXT:    br i1 [[CMP1]], label %[[RETURN]], label %[[WHILE_COND_PREHEADER:.*]]
 ; CHECK:       [[WHILE_COND_PREHEADER]]:
-; CHECK-NEXT:    [[NEWGEP:%.*]] = getelementptr i32, ptr [[SRC]], i64 -3
+; CHECK-NEXT:    [[NEWGEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 4
 ; CHECK-NEXT:    [[WCSLEN:%.*]] = call i64 @wcslen(ptr [[NEWGEP]])
-; CHECK-NEXT:    [[END:%.*]] = getelementptr i32, ptr [[NEWGEP]], i64 [[WCSLEN]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[WCSLEN]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 4
+; CHECK-NEXT:    [[END:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP2]]
 ; CHECK-NEXT:    br label %[[WHILE_COND:.*]]
 ; CHECK:       [[WHILE_COND]]:
-; CHECK-NEXT:    [[SRC_PN:%.*]] = phi ptr [ poison, %[[WHILE_COND]] ], [ [[SRC]], %[[WHILE_COND_PREHEADER]] ]
-; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 poison, 0
-; CHECK-NEXT:    br i1 true, label %[[WHILE_END:.*]], label %[[WHILE_COND]]
+; CHECK-NEXT:    [[SRC_PN:%.*]] = phi ptr [ [[CURR_0:%.*]], %[[WHILE_COND]] ], [ [[SRC]], %[[WHILE_COND_PREHEADER]] ]
+; CHECK-NEXT:    [[CURR_0]] = getelementptr inbounds i8, ptr [[SRC_PN]], i64 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[CURR_0]], align 4
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP3]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label %[[WHILE_END:.*]], label %[[WHILE_COND]]
 ; CHECK:       [[WHILE_END]]:
 ; CHECK-NEXT:    [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[END]] to i64
 ; CHECK-NEXT:    [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[SRC]] to i64
@@ -65,6 +69,66 @@ return:                                           ; preds = %entry, %lor.lhs.fal
   ret i64 %retval.0
 }
 
+define i64 @invalid_char_size(ptr %src) {
+; CHECK-LABEL: define i64 @invalid_char_size(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq ptr [[SRC]], null
+; CHECK-NEXT:    br i1 [[CMP]], label %[[RETURN:.*]], label %[[LOR_LHS_FALSE:.*]]
+; CHECK:       [[LOR_LHS_FALSE]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[SRC]], align 2
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i16 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[CMP1]], label %[[RETURN]], label %[[WHILE_COND_PREHEADER:.*]]
+; CHECK:       [[WHILE_COND_PREHEADER]]:
+; CHECK-NEXT:    br label %[[WHILE_COND:.*]]
+; CHECK:       [[WHILE_COND]]:
+; CHECK-NEXT:    [[SRC_PN:%.*]] = phi ptr [ [[CURR_0:%.*]], %[[WHILE_COND]] ], [ [[SRC]], %[[WHILE_COND_PREHEADER]] ]
+; CHECK-NEXT:    [[CURR_0]] = getelementptr inbounds i8, ptr [[SRC_PN]], i64 2
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[CURR_0]], align 2
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i16 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label %[[WHILE_END:.*]], label %[[WHILE_COND]]
+; CHECK:       [[WHILE_END]]:
+; CHECK-NEXT:    [[CURR_0_LCSSA:%.*]] = phi ptr [ [[CURR_0]], %[[WHILE_COND]] ]
+; CHECK-NEXT:    [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[CURR_0_LCSSA]] to i64
+; CHECK-NEXT:    [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[SRC]] to i64
+; CHECK-NEXT:    [[SUB_PTR_SUB:%.*]] = sub i64 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]]
+; CHECK-NEXT:    [[SUB_PTR_DIV:%.*]] = ashr exact i64 [[SUB_PTR_SUB]], 1
+; CHECK-NEXT:    br label %[[RETURN]]
+; CHECK:       [[RETURN]]:
+; CHECK-NEXT:    [[RETVAL_0:%.*]] = phi i64 [ [[SUB_PTR_DIV]], %[[WHILE_END]] ], [ 0, %[[LOR_LHS_FALSE]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    ret i64 [[RETVAL_0]]
+;
+entry:
+  %cmp = icmp eq ptr %src, null
+  br i1 %cmp, label %return, label %lor.lhs.false
+
+lor.lhs.false:                                    ; preds = %entry
+  %0 = load i16, ptr %src, align 2
+  %cmp1 = icmp eq i16 %0, 0
+  br i1 %cmp1, label %return, label %while.cond.preheader
+
+while.cond.preheader:                             ; preds = %lor.lhs.false
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond.preheader, %while.cond
+  %src.pn = phi ptr [ %curr.0, %while.cond ], [ %src, %while.cond.preheader ]
+  %curr.0 = getelementptr inbounds i8, ptr %src.pn, i64 2
+  %1 = load i16, ptr %curr.0, align 2
+  %tobool.not = icmp eq i16 %1, 0
+  br i1 %tobool.not, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  %curr.0.lcssa = phi ptr [ %curr.0, %while.cond ]
+  %sub.ptr.lhs.cast = ptrtoint ptr %curr.0.lcssa to i64
+  %sub.ptr.rhs.cast = ptrtoint ptr %src to i64
+  %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
+  %sub.ptr.div = ashr exact i64 %sub.ptr.sub, 1
+  br label %return
+
+return:                                           ; preds = %entry, %lor.lhs.false, %while.end
+  %retval.0 = phi i64 [ %sub.ptr.div, %while.end ], [ 0, %lor.lhs.false ], [ 0, %entry ]
+  ret i64 %retval.0
+}
 !llvm.module.flags = !{!0}
 !0 = !{i32 1, !"wchar_size", i32 4}
 

>From 65c3ad54a705ad99ac42153cca8177af856d3b89 Mon Sep 17 00:00:00 2001
From: Henry Jiang <henry.jiang1 at ibm.com>
Date: Sun, 2 Feb 2025 15:11:33 -0500
Subject: [PATCH 10/13] improve comments

---
 .../Transforms/Scalar/LoopIdiomRecognize.cpp  | 66 +++++++++++--------
 1 file changed, 40 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 47e34427be4077..831df02080ddfc 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -1510,6 +1510,16 @@ bool LoopIdiomRecognize::runOnNoncountableLoop() {
          recognizeShiftUntilLessThan() || recognizeAndInsertStrLen();
 }
 
+/// Return true if \p Val is a null pointer constant or a zero ConstantInt.
+static bool isZeroConstant(const Value *Val) {
+  if (isa<ConstantPointerNull>(Val))
+    return true;
+  const ConstantInt *CmpZero = dyn_cast<ConstantInt>(Val);
+  if (!CmpZero || !CmpZero->isZero())
+    return false;
+  return true;
+}
+
 /// Check if the given conditional branch is based on the comparison between
 /// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is
 /// true), the control yields to the loop entry. If the branch matches the
@@ -1525,11 +1535,8 @@ static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
   if (!Cond)
     return nullptr;
 
-  if (!isa<ConstantPointerNull>(Cond->getOperand(1))) {
-    ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
-    if (!CmpZero || !CmpZero->isZero())
-      return nullptr;
-  }
+  if (!isZeroConstant(Cond->getOperand(1)))
+    return nullptr;
 
   BasicBlock *TrueSucc = BI->getSuccessor(0);
   BasicBlock *FalseSucc = BI->getSuccessor(1);
@@ -1582,7 +1589,7 @@ class StrlenVerifier {
     if (!LoopCond)
       return false;
 
-    auto *LoopLoad = dyn_cast<LoadInst>(LoopCond);
+    LoadInst *LoopLoad = dyn_cast<LoadInst>(LoopCond);
     if (!LoopLoad || LoopLoad->getPointerAddressSpace() != 0)
       return false;
 
@@ -1628,7 +1635,7 @@ class StrlenVerifier {
         return false;
 
     // Scan every instruction in the loop to ensure there are no side effects.
-    for (auto &I : *LoopBody)
+    for (Instruction &I : *LoopBody)
       if (I.mayHaveSideEffects())
         return false;
 
@@ -1680,27 +1687,30 @@ class StrlenVerifier {
 
 } // namespace
 
-/// Recognizes a strlen idiom by checking for loops that increment
-/// a char pointer and then subtract with the base pointer.
+/// The Strlen Idiom we are trying to detect has the following structure
 ///
-/// If detected, transforms the relevant code to a strlen function
-/// call, and returns true; otherwise, returns false.
+/// preheader:
+///   ...
+///   br label %body, ...
 ///
-/// The core idiom we are trying to detect is:
-/// \code
-///     start = str;
-///     do {
-///       str++;
-///     } while(*str != '\0');
-/// \endcode
+/// body:
+///   ... ; %0 is incremented by a gep
+///   %1 = load i8, ptr %0, align 1
+///   %2 = icmp eq i8 %1, 0
+///   br i1 %2, label %exit, label %body
 ///
-/// The transformed output is similar to below c-code:
-/// \code
-///     str = start + strlen(start)
-///     len = str - start
-/// \endcode
+/// exit:
+///   %lcssa = phi [%0, %body], ...
+///
+/// We expect the strlen idiom to have a load of a character type that is
+/// compared against '\0'. The load's pointer operand must have a scev
+/// expression of the form {%str,+,c}, where c is a ConstantInt of the
+/// appropriate character width for the idiom and %str is the base of the
+/// string. All lcssa phis must have the form {...,+,n}, where n is a constant.
 ///
-/// Later the pointer subtraction will be folded by InstCombine
+/// When transforming the output of the strlen idiom, the lcssa phis are
+/// expanded using SCEVExpander as {base scev,+,a} -> (base scev + a * strlen)
+/// and all subsequent uses are replaced.
 bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
   if (DisableLIRP::All)
     return false;
@@ -1735,6 +1745,10 @@ bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
   const SCEV *StrlenEv = SE->getSCEV(StrLenFunc);
   SmallVector<PHINode *, 4> Cleanup;
   for (PHINode &PN : LoopExitBB->phis()) {
+    // We can now materialize the loop output as all phi have scev {base,+,a}.
+    // We expand the phi as:
+    //   %strlen = call i64 @strlen(%str)
+    //   %phi.new = base expression + step * %strlen
     const SCEV *Ev = SE->getSCEV(&PN);
     const SCEVAddRecExpr *AddRecEv = dyn_cast<SCEVAddRecExpr>(Ev);
     const SCEVConstant *Step =
@@ -1754,7 +1768,7 @@ bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
     Cleanup.push_back(&PN);
   }
 
-  // All LCSSA Loop Phi are dead, the left over dead loop body can be cleaned 
+  // All LCSSA Loop Phi are dead, the left over dead loop body can be cleaned
   // up by later passes
   for (PHINode *PN : Cleanup) {
     RecursivelyDeleteDeadPHINode(PN);
@@ -1766,7 +1780,7 @@ bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
   ORE.emit([&]() {
     return OptimizationRemark(DEBUG_TYPE, "recognizeAndInsertStrLen",
                               CurLoop->getStartLoc(), Preheader)
-           << "Transformed strlen loop idiom";
+           << "Transformed " << StrLenFunc->getName() << " loop idiom";
   });
 
   return true;

>From a7b1ce868cfc41ef1859f29071bced04665dded9 Mon Sep 17 00:00:00 2001
From: Henry Jiang <henry.jiang1 at ibm.com>
Date: Sun, 2 Feb 2025 15:20:40 -0500
Subject: [PATCH 11/13] revert formatting

---
 .../Transforms/Scalar/LoopIdiomRecognize.cpp  | 33 ++++++++++---------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 831df02080ddfc..d1154ef7e6aa1c 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -252,7 +252,7 @@ class LoopIdiomRecognize {
   bool insertFFSIfProfitable(Intrinsic::ID IntrinID, Value *InitX,
                              Instruction *DefX, PHINode *CntPhi,
                              Instruction *CntInst);
-  bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
+  bool recognizeAndInsertFFS();  /// Find First Set: ctlz or cttz
   bool recognizeShiftUntilLessThan();
   void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
                                 Instruction *CntInst, PHINode *CntPhi,
@@ -620,8 +620,7 @@ bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
     const SCEVAddRecExpr *FirstStoreEv =
         cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
     APInt FirstStride = getStoreStride(FirstStoreEv);
-    unsigned FirstStoreSize =
-        DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
+    unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
 
     // See if we can optimize just this store in isolation.
     if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
@@ -1112,14 +1111,13 @@ bool LoopIdiomRecognize::processLoopStridedStore(
         BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment),
         /*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
   } else {
-    assert(isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16));
+    assert (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16));
     // Everything is emitted in default address space
     Type *Int8PtrTy = DestInt8PtrTy;
 
     StringRef FuncName = "memset_pattern16";
-    FunctionCallee MSP =
-        getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16,
-                           Builder.getVoidTy(), Int8PtrTy, Int8PtrTy, IntIdxTy);
+    FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16,
+                            Builder.getVoidTy(), Int8PtrTy, Int8PtrTy, IntIdxTy);
     inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI);
 
     // Otherwise we should form a memset_pattern16.  PatternValue is known to be
@@ -1161,7 +1159,8 @@ bool LoopIdiomRecognize::processLoopStridedStore(
     R << "Transformed loop-strided store in "
       << ore::NV("Function", TheStore->getFunction())
       << " function into a call to "
-      << ore::NV("NewFunction", NewCall->getCalledFunction()) << "() intrinsic";
+      << ore::NV("NewFunction", NewCall->getCalledFunction())
+      << "() intrinsic";
     if (!Stores.empty())
       R << ore::setExtraArgs();
     for (auto *I : Stores) {
@@ -1462,7 +1461,8 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
            << ore::NV("NewFunction", NewCall->getCalledFunction())
            << "() intrinsic from " << ore::NV("Inst", InstRemark)
            << " instruction in " << ore::NV("Function", TheStore->getFunction())
-           << " function" << ore::setExtraArgs()
+           << " function"
+           << ore::setExtraArgs()
            << ore::NV("FromBlock", TheStore->getParent()->getName())
            << ore::NV("ToBlock", Preheader->getName());
   });
@@ -1993,7 +1993,8 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
     ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
     if (!Dec ||
         !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
-          (SubOneOp->getOpcode() == Instruction::Add && Dec->isMinusOne()))) {
+          (SubOneOp->getOpcode() == Instruction::Add &&
+           Dec->isMinusOne()))) {
       return false;
     }
   }
@@ -2104,8 +2105,8 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
   // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
   if (!DefX || !DefX->isShift())
     return false;
-  IntrinID =
-      DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz : Intrinsic::ctlz;
+  IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
+                                                     Intrinsic::ctlz;
   ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
   if (!Shft || !Shft->isOne())
     return false;
@@ -2608,8 +2609,9 @@ void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
     TcPhi->insertBefore(Body->begin());
 
     Builder.SetInsertPoint(LbCond);
-    Instruction *TcDec = cast<Instruction>(Builder.CreateSub(
-        TcPhi, ConstantInt::get(Ty, 1), "tcdec", false, true));
+    Instruction *TcDec = cast<Instruction>(
+        Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
+                          "tcdec", false, true));
 
     TcPhi->addIncoming(TripCnt, PreHead);
     TcPhi->addIncoming(TcDec, Body);
@@ -3239,8 +3241,7 @@ bool LoopIdiomRecognize::recognizeShiftUntilZero() {
   // intrinsic we'll use are not cheap. Note that we are okay with *just*
   // making the loop countable, even if nothing else changes.
   IntrinsicCostAttributes Attrs(
-      IntrID, Ty,
-      {PoisonValue::get(Ty), /*is_zero_poison=*/Builder.getFalse()});
+      IntrID, Ty, {PoisonValue::get(Ty), /*is_zero_poison=*/Builder.getFalse()});
   InstructionCost Cost = TTI->getIntrinsicInstrCost(Attrs, CostKind);
   if (Cost > TargetTransformInfo::TCC_Basic) {
     LLVM_DEBUG(dbgs() << DEBUG_TYPE

>From cf3f2618fdce15b63fb11714c887d7a5e090a06c Mon Sep 17 00:00:00 2001
From: Henry Jiang <henry.jiang1 at ibm.com>
Date: Sun, 2 Feb 2025 15:25:21 -0500
Subject: [PATCH 12/13] remove DEBUG_TYPE from disable flags

---
 llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index d1154ef7e6aa1c..fe905e58d9b04f 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -129,7 +129,7 @@ static cl::opt<bool, true>
 
 bool DisableLIRP::Strlen;
 static cl::opt<bool, true>
-    DisableLIRPStrlen("disable-" DEBUG_TYPE "-strlen",
+    DisableLIRPStrlen("disable-loop-idiom-strlen",
                       cl::desc("Proceed with loop idiom recognize pass, but do "
                                "not convert loop(s) to strlen."),
                       cl::location(DisableLIRP::Strlen), cl::init(false),
@@ -137,7 +137,7 @@ static cl::opt<bool, true>
 
 bool DisableLIRP::Wcslen;
 static cl::opt<bool, true>
-    DisableLIRPWcslen("disable-" DEBUG_TYPE "-wcslen",
+    DisableLIRPWcslen("disable-loop-idiom-wcslen",
                       cl::desc("Proceed with loop idiom recognize pass, but do "
                                "not convert loop(s) to wcslen."),
                       cl::location(DisableLIRP::Wcslen), cl::init(false),

>From 2619e67f8e9921b6539eab05ce550e8fd1bcfdc4 Mon Sep 17 00:00:00 2001
From: Henry Jiang <henry.jiang1 at ibm.com>
Date: Sun, 2 Feb 2025 19:00:02 -0500
Subject: [PATCH 13/13] add more documentation

---
 .../Transforms/Scalar/LoopIdiomRecognize.cpp  | 37 ++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index fe905e58d9b04f..cd61c513df3cde 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -1710,7 +1710,42 @@ class StrlenVerifier {
 ///
 /// When transforming the output of the strlen idiom, the lccsa phi are
 /// expanded using SCEVExpander as {base scev,+,a} -> (base scev + a * strlen)
-/// and all subsequent uses are replaced.
+/// and all subsequent uses are replaced. For example,
+///
+/// \code{.c}
+///     const char* base = str;
+///     while (*str != '\0')
+///         ++str;
+///     size_t result = str - base;
+/// \endcode
+///
+/// will be transformed as follows: The idiom will be replaced by a strlen
+/// computation to compute the address of the null terminator of the string.
+///
+/// \code{.c}
+///     const char* base = str;
+///     const char* end = base + strlen(str);
+///     size_t result = end - base;
+/// \endcode
+///
+/// In the case we index by an induction variable, as long as the induction
+/// variable has a constant int increment, we can replace all such indvars
+/// with the closed-form computation of strlen:
+///
+/// \code{.c}
+///     size_t i = 0;
+///     while (str[i] != '\0')
+///         ++i;
+///     size_t result = i;
+/// \endcode
+///
+/// This loop will be replaced by:
+///
+/// \code{.c}
+///     size_t i = 0 + strlen(str);
+///     size_t result = i;
+/// \endcode
+///
 bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
   if (DisableLIRP::All)
     return false;



More information about the llvm-commits mailing list