[llvm] [LoopIdiom] Reland: Support 'shift until less-than' idiom #95002 (PR #98298)

Hari Limaye via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 10 06:55:24 PDT 2024


https://github.com/hazzlim updated https://github.com/llvm/llvm-project/pull/98298

>From 1a4e312265200caec9eb8d0eaf8bc0798ff63977 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Tue, 4 Jun 2024 15:08:51 +0000
Subject: [PATCH 1/2] [LoopIdiom] Support 'shift until less-than' idiom

The current loop idiom code for recognising and inserting a CTLZ
intrinsic does not support loops where the loopback control is based on
an unsigned less-than condition. This patch adds support for recognising
these loops and inserting a CTLZ intrinsic.

Fixes the missed optimization cases in #51064

Co-authored-by: David Sherwood <david.sherwood at arm.com>
---
 .../Transforms/Scalar/LoopIdiomRecognize.cpp  | 291 ++++++-
 .../test/Transforms/LoopIdiom/AArch64/ctlz.ll | 778 ++++++++++++++++++
 2 files changed, 1033 insertions(+), 36 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopIdiom/AArch64/ctlz.ll

diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 635bd1236196e..b3e3e1e71f101 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -231,12 +231,19 @@ class LoopIdiomRecognize {
   bool recognizePopcount();
   void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
                                PHINode *CntPhi, Value *Var);
+  bool isProfitableToInsertFFS(Intrinsic::ID IntrinID, Value *InitX,
+                               bool ZeroCheck, size_t CanonicalSize);
+  bool insertFFSIfProfitable(Intrinsic::ID IntrinID, Value *InitX,
+                             Instruction *DefX, PHINode *CntPhi,
+                             Instruction *CntInst);
   bool recognizeAndInsertFFS();  /// Find First Set: ctlz or cttz
+  bool recognizeShiftUntilLessThan();
   void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
                                 Instruction *CntInst, PHINode *CntPhi,
                                 Value *Var, Instruction *DefX,
                                 const DebugLoc &DL, bool ZeroCheck,
-                                bool IsCntPhiUsedOutsideLoop);
+                                bool IsCntPhiUsedOutsideLoop,
+                                bool InsertSub = false);
 
   bool recognizeShiftUntilBitTest();
   bool recognizeShiftUntilZero();
@@ -1482,7 +1489,8 @@ bool LoopIdiomRecognize::runOnNoncountableLoop() {
                     << CurLoop->getHeader()->getName() << "\n");
 
   return recognizePopcount() || recognizeAndInsertFFS() ||
-         recognizeShiftUntilBitTest() || recognizeShiftUntilZero();
+         recognizeShiftUntilBitTest() || recognizeShiftUntilZero() ||
+         recognizeShiftUntilLessThan();
 }
 
 /// Check if the given conditional branch is based on the comparison between
@@ -1517,6 +1525,34 @@ static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
   return nullptr;
 }
 
+/// Check if the given conditional branch is based on an unsigned less-than
+/// comparison between a variable and a constant, and if the comparison is false
+/// the control yields to the loop entry. If the branch matches the behaviour,
+/// the variable involved in the comparison is returned.
+static Value *matchShiftULTCondition(BranchInst *BI, BasicBlock *LoopEntry,
+                                     uint64_t &Threshold) {
+  if (!BI || !BI->isConditional())
+    return nullptr;
+
+  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
+  if (!Cond)
+    return nullptr;
+
+  ConstantInt *CmpConst = dyn_cast<ConstantInt>(Cond->getOperand(1));
+  if (!CmpConst)
+    return nullptr;
+
+  BasicBlock *FalseSucc = BI->getSuccessor(1);
+  ICmpInst::Predicate Pred = Cond->getPredicate();
+
+  if (Pred == ICmpInst::ICMP_ULT && FalseSucc == LoopEntry) {
+    Threshold = CmpConst->getZExtValue();
+    return Cond->getOperand(0);
+  }
+
+  return nullptr;
+}
+
 // Check if the recurrence variable `VarX` is in the right form to create
 // the idiom. Returns the value coerced to a PHINode if so.
 static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
@@ -1528,6 +1564,107 @@ static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
   return nullptr;
 }
 
+/// Return true if the idiom is detected in the loop.
+///
+/// Additionally:
+/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
+///       or nullptr if there is no such.
+/// 2) \p CntPhi is set to the corresponding phi node
+///       or nullptr if there is no such.
+/// 3) \p InitX is set to the value whose CTLZ could be used.
+/// 4) \p DefX is set to the instruction calculating Loop exit condition.
+/// 5) \p Threshold is set to the constant involved in the unsigned less-than
+///       comparison.
+///
+/// The core idiom we are trying to detect is:
+/// \code
+///    if (x0 < 2)
+///      goto loop-exit // the precondition of the loop
+///    cnt0 = init-val
+///    do {
+///      x = phi (x0, x.next);   //PhiX
+///      cnt = phi (cnt0, cnt.next)
+///
+///      cnt.next = cnt + 1;
+///       ...
+///      x.next = x >> 1;   // DefX
+///    } while (x >= 4)
+/// loop-exit:
+/// \endcode
+static bool detectShiftUntilLessThanIdiom(Loop *CurLoop, const DataLayout &DL,
+                                          Intrinsic::ID &IntrinID,
+                                          Value *&InitX, Instruction *&CntInst,
+                                          PHINode *&CntPhi, Instruction *&DefX,
+                                          uint64_t &Threshold) {
+  BasicBlock *LoopEntry;
+
+  DefX = nullptr;
+  CntInst = nullptr;
+  CntPhi = nullptr;
+  LoopEntry = *(CurLoop->block_begin());
+
+  // step 1: Check if the loop-back branch is in desirable form.
+  if (Value *T = matchShiftULTCondition(
+          dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry,
+          Threshold))
+    DefX = dyn_cast<Instruction>(T);
+  else
+    return false;
+
+  // step 2: Check the recurrence of variable X
+  if (!DefX || !isa<PHINode>(DefX))
+    return false;
+
+  PHINode *VarPhi = cast<PHINode>(DefX);
+  int Idx = VarPhi->getBasicBlockIndex(LoopEntry);
+  if (Idx == -1)
+    return false;
+
+  DefX = dyn_cast<Instruction>(VarPhi->getIncomingValue(Idx));
+  if (!DefX || DefX->getNumOperands() == 0 || DefX->getOperand(0) != VarPhi)
+    return false;
+
+  // step 3: detect instructions corresponding to "x.next = x >> 1"
+  if (DefX->getOpcode() != Instruction::LShr)
+    return false;
+
+  IntrinID = Intrinsic::ctlz;
+  ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
+  if (!Shft || !Shft->isOne())
+    return false;
+
+  InitX = VarPhi->getIncomingValueForBlock(CurLoop->getLoopPreheader());
+
+  // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
+  //         or cnt.next = cnt + -1.
+  // TODO: We can skip the step. If loop trip count is known (CTLZ),
+  //       then all uses of "cnt.next" could be optimized to the trip count
+  //       plus "cnt0". Currently it is not optimized.
+  //       This step could be used to detect POPCNT instruction:
+  //       cnt.next = cnt + (x.next & 1)
+  for (Instruction &Inst : llvm::make_range(
+           LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
+    if (Inst.getOpcode() != Instruction::Add)
+      continue;
+
+    ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
+    if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
+      continue;
+
+    PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
+    if (!Phi)
+      continue;
+
+    CntInst = &Inst;
+    CntPhi = Phi;
+    break;
+  }
+  if (!CntInst)
+    return false;
+
+  return true;
+}
+
 /// Return true iff the idiom is detected in the loop.
 ///
 /// Additionally:
@@ -1756,27 +1893,35 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
   return true;
 }
 
-/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
-/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
-/// trip count returns true; otherwise, returns false.
-bool LoopIdiomRecognize::recognizeAndInsertFFS() {
-  // Give up if the loop has multiple blocks or multiple backedges.
-  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
-    return false;
+// Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
+// profitable if we delete the loop.
+bool LoopIdiomRecognize::isProfitableToInsertFFS(Intrinsic::ID IntrinID,
+                                                 Value *InitX, bool ZeroCheck,
+                                                 size_t CanonicalSize) {
+  const Value *Args[] = {InitX,
+                         ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
 
-  Intrinsic::ID IntrinID;
-  Value *InitX;
-  Instruction *DefX = nullptr;
-  PHINode *CntPhi = nullptr;
-  Instruction *CntInst = nullptr;
-  // Help decide if transformation is profitable. For ShiftUntilZero idiom,
-  // this is always 6.
-  size_t IdiomCanonicalSize = 6;
+  // @llvm.dbg doesn't count as they have no semantic effect.
+  auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
+  uint32_t HeaderSize =
+      std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
 
-  if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
-                                 CntInst, CntPhi, DefX))
+  IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
+  InstructionCost Cost = TTI->getIntrinsicInstrCost(
+      Attrs, TargetTransformInfo::TCK_SizeAndLatency);
+  if (HeaderSize != CanonicalSize && Cost > TargetTransformInfo::TCC_Basic)
     return false;
 
+  return true;
+}
+
+/// Convert CTLZ / CTTZ idiom loop into countable loop.
+/// If CTLZ / CTTZ inserted as a new trip count returns true; otherwise,
+/// returns false.
+bool LoopIdiomRecognize::insertFFSIfProfitable(Intrinsic::ID IntrinID,
+                                               Value *InitX, Instruction *DefX,
+                                               PHINode *CntPhi,
+                                               Instruction *CntInst) {
   bool IsCntPhiUsedOutsideLoop = false;
   for (User *U : CntPhi->users())
     if (!CurLoop->contains(cast<Instruction>(U))) {
@@ -1818,35 +1963,107 @@ bool LoopIdiomRecognize::recognizeAndInsertFFS() {
     ZeroCheck = true;
   }
 
-  // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
-  // profitable if we delete the loop.
-
-  // the loop has only 6 instructions:
+  // FFS idiom loop has only 6 instructions:
   //  %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
   //  %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
   //  %shr = ashr %n.addr.0, 1
   //  %tobool = icmp eq %shr, 0
   //  %inc = add nsw %i.0, 1
   //  br i1 %tobool
+  size_t IdiomCanonicalSize = 6;
+  if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, IdiomCanonicalSize))
+    return false;
 
-  const Value *Args[] = {InitX,
-                         ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
+  transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
+                           DefX->getDebugLoc(), ZeroCheck,
+                           IsCntPhiUsedOutsideLoop);
+  return true;
+}
 
-  // @llvm.dbg doesn't count as they have no semantic effect.
-  auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
-  uint32_t HeaderSize =
-      std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
+/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
+/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
+/// trip count returns true; otherwise, returns false.
+bool LoopIdiomRecognize::recognizeAndInsertFFS() {
+  // Give up if the loop has multiple blocks or multiple backedges.
+  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+    return false;
 
-  IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
-  InstructionCost Cost =
-    TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
-  if (HeaderSize != IdiomCanonicalSize &&
-      Cost > TargetTransformInfo::TCC_Basic)
+  Intrinsic::ID IntrinID;
+  Value *InitX;
+  Instruction *DefX = nullptr;
+  PHINode *CntPhi = nullptr;
+  Instruction *CntInst = nullptr;
+
+  if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX, CntInst, CntPhi,
+                                 DefX))
+    return false;
+
+  return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst);
+}
+
+bool LoopIdiomRecognize::recognizeShiftUntilLessThan() {
+  // Give up if the loop has multiple blocks or multiple backedges.
+  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+    return false;
+
+  Intrinsic::ID IntrinID;
+  Value *InitX;
+  Instruction *DefX = nullptr;
+  PHINode *CntPhi = nullptr;
+  Instruction *CntInst = nullptr;
+
+  uint64_t LoopThreshold;
+  if (!detectShiftUntilLessThanIdiom(CurLoop, *DL, IntrinID, InitX, CntInst,
+                                     CntPhi, DefX, LoopThreshold))
+    return false;
+
+  if (LoopThreshold == 2) {
+    // Treat as regular FFS.
+    return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst);
+  }
+
+  // Look for Floor Log2 Idiom.
+  if (LoopThreshold != 4)
+    return false;
+
+  // Abort if CntPhi is used outside of the loop.
+  for (User *U : CntPhi->users())
+    if (!CurLoop->contains(cast<Instruction>(U)))
+      return false;
+
+  // It is safe to assume Preheader exist as it was checked in
+  // parent function RunOnLoop.
+  BasicBlock *PH = CurLoop->getLoopPreheader();
+  auto *PreCondBB = PH->getSinglePredecessor();
+  if (!PreCondBB)
+    return false;
+  auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
+  if (!PreCondBI)
+    return false;
+
+  uint64_t PreLoopThreshold;
+  if (matchShiftULTCondition(PreCondBI, PH, PreLoopThreshold) != InitX ||
+      PreLoopThreshold != 2)
     return false;
 
+  bool ZeroCheck = true;
+
+  // the loop has only 6 instructions:
+  //  %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
+  //  %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
+  //  %shr = ashr %n.addr.0, 1
+  //  %tobool = icmp ult %n.addr.0, C
+  //  %inc = add nsw %i.0, 1
+  //  br i1 %tobool
+  size_t IdiomCanonicalSize = 6;
+  if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, IdiomCanonicalSize))
+    return false;
+
+  // log2(x) = w − 1 − clz(x)
   transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
                            DefX->getDebugLoc(), ZeroCheck,
-                           IsCntPhiUsedOutsideLoop);
+                           /*IsCntPhiUsedOutsideLoop=*/false,
+                           /*InsertSub=*/true);
   return true;
 }
 
@@ -1961,7 +2178,7 @@ static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
 void LoopIdiomRecognize::transformLoopToCountable(
     Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
     PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
-    bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
+    bool ZeroCheck, bool IsCntPhiUsedOutsideLoop, bool InsertSub) {
   BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
 
   // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
@@ -1991,6 +2208,8 @@ void LoopIdiomRecognize::transformLoopToCountable(
   Type *CountTy = Count->getType();
   Count = Builder.CreateSub(
       ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
+  if (InsertSub)
+    Count = Builder.CreateSub(Count, ConstantInt::get(CountTy, 1));
   Value *NewCount = Count;
   if (IsCntPhiUsedOutsideLoop)
     Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));
diff --git a/llvm/test/Transforms/LoopIdiom/AArch64/ctlz.ll b/llvm/test/Transforms/LoopIdiom/AArch64/ctlz.ll
new file mode 100644
index 0000000000000..47ae4fd5b66a7
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/AArch64/ctlz.ll
@@ -0,0 +1,778 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-idiom -mtriple=aarch64 < %s -S | FileCheck %s
+
+; Recognize CTLZ builtin pattern.
+; Here we'll just convert loop to countable,
+; so do not insert builtin if CPU do not support CTLZ
+;
+; int ctlz_and_other(int n, char *a)
+; {
+;   n = n >= 0 ? n : -n;
+;   int i = 0, n0 = n;
+;   while(n >>= 1) {
+;     a[i] = (n0 & (1 << i)) ? 1 : 0;
+;     i++;
+;   }
+;   return i;
+; }
+;
+
+; Function Attrs: norecurse nounwind uwtable
+define i32 @ctlz_and_other(i32 %n, ptr nocapture %a) {
+; CHECK-LABEL: define i32 @ctlz_and_other(
+; CHECK-SAME: i32 [[N:%.*]], ptr nocapture [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ABS_N:%.*]] = call i32 @llvm.abs.i32(i32 [[N]], i1 true)
+; CHECK-NEXT:    [[SHR8:%.*]] = lshr i32 [[ABS_N]], 1
+; CHECK-NEXT:    [[TOBOOL9:%.*]] = icmp eq i32 [[SHR8]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL9]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK:       while.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[SHR8]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 32, [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[TCPHI:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY_PREHEADER]] ], [ [[TCDEC:%.*]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[SHR11:%.*]] = phi i32 [ [[SHR:%.*]], [[WHILE_BODY]] ], [ [[SHR8]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[SHL:%.*]] = shl i32 1, [[TMP3]]
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[SHL]], [[ABS_N]]
+; CHECK-NEXT:    [[TOBOOL1:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[TOBOOL1]] to i8
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i8 [[CONV]], ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[SHR]] = ashr i32 [[SHR11]], 1
+; CHECK-NEXT:    [[TCDEC]] = sub nsw i32 [[TCPHI]], 1
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[TCDEC]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.end.loopexit:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT_LCSSA:%.*]] = phi i64 [ [[TMP2]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[INDVARS_IV_NEXT_LCSSA]] to i32
+; CHECK-NEXT:    br label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[I_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP4]], [[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[I_0_LCSSA]]
+;
+entry:
+  %abs_n = call i32 @llvm.abs.i32(i32 %n, i1 true)
+  %shr8 = lshr i32 %abs_n, 1
+  %tobool9 = icmp eq i32 %shr8, 0
+  br i1 %tobool9, label %while.end, label %while.body.preheader
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %while.body ], [ 0, %while.body.preheader ]
+  %shr11 = phi i32 [ %shr, %while.body ], [ %shr8, %while.body.preheader ]
+  %0 = trunc i64 %indvars.iv to i32
+  %shl = shl i32 1, %0
+  %and = and i32 %shl, %abs_n
+  %tobool1 = icmp ne i32 %and, 0
+  %conv = zext i1 %tobool1 to i8
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+  store i8 %conv, ptr %arrayidx, align 1
+  %indvars.iv.next = add nuw i64 %indvars.iv, 1
+  %shr = ashr i32 %shr11, 1
+  %tobool = icmp eq i32 %shr, 0
+  br i1 %tobool, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:                               ; preds = %while.body
+  %1 = trunc i64 %indvars.iv.next to i32
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %i.0.lcssa = phi i32 [ 0, %entry ], [ %1, %while.end.loopexit ]
+  ret i32 %i.0.lcssa
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int ctlz_zero_check(int n)
+; {
+;   n = n >= 0 ? n : -n;
+;   int i = 0;
+;   while(n) {
+;     n >>= 1;
+;     i++;
+;   }
+;   return i;
+; }
+;
+
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz_zero_check(i32 %n) {
+; CHECK-LABEL: define i32 @ctlz_zero_check(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ABS_N:%.*]] = call i32 @llvm.abs.i32(i32 [[N]], i1 true)
+; CHECK-NEXT:    [[TOBOOL4:%.*]] = icmp eq i32 [[ABS_N]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL4]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK:       while.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[ABS_N]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 32, [[TMP0]]
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[TCPHI:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY_PREHEADER]] ], [ [[TCDEC:%.*]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    [[I_06:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[N_ADDR_05:%.*]] = phi i32 [ [[SHR:%.*]], [[WHILE_BODY]] ], [ [[ABS_N]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[SHR]] = ashr i32 [[N_ADDR_05]], 1
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_06]], 1
+; CHECK-NEXT:    [[TCDEC]] = sub nsw i32 [[TCPHI]], 1
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[TCDEC]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.end.loopexit:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    br label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[I_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[I_0_LCSSA]]
+;
+entry:
+  %abs_n = call i32 @llvm.abs.i32(i32 %n, i1 true)
+  %tobool4 = icmp eq i32 %abs_n, 0
+  br i1 %tobool4, label %while.end, label %while.body.preheader
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %i.06 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ]
+  %n.addr.05 = phi i32 [ %shr, %while.body ], [ %abs_n, %while.body.preheader ]
+  %shr = ashr i32 %n.addr.05, 1
+  %inc = add nsw i32 %i.06, 1
+  %tobool = icmp eq i32 %shr, 0
+  br i1 %tobool, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:                               ; preds = %while.body
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.end.loopexit ]
+  ret i32 %i.0.lcssa
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int ctlz(int n)
+; {
+;   n = n >= 0 ? n : -n;
+;   int i = 0;
+;   while(n >>= 1) {
+;     i++;
+;   }
+;   return i;
+; }
+;
+
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz(i32 %n) {
+; CHECK-LABEL: define i32 @ctlz(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ABS_N:%.*]] = call i32 @llvm.abs.i32(i32 [[N]], i1 true)
+; CHECK-NEXT:    [[TMP0:%.*]] = ashr i32 [[ABS_N]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP0]], i1 false)
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 32, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
+; CHECK-NEXT:    br label [[WHILE_COND:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[TCPHI:%.*]] = phi i32 [ [[TMP3]], [[ENTRY:%.*]] ], [ [[TCDEC:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[N_ADDR_0:%.*]] = phi i32 [ [[ABS_N]], [[ENTRY]] ], [ [[SHR:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[SHR]] = ashr i32 [[N_ADDR_0]], 1
+; CHECK-NEXT:    [[TCDEC]] = sub nsw i32 [[TCPHI]], 1
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[TCDEC]], 0
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[WHILE_END:%.*]], label [[WHILE_COND]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[I_0_LCSSA:%.*]] = phi i32 [ [[TMP2]], [[WHILE_COND]] ]
+; CHECK-NEXT:    ret i32 [[I_0_LCSSA]]
+;
+entry:
+  %abs_n = call i32 @llvm.abs.i32(i32 %n, i1 true)
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %n.addr.0 = phi i32 [ %abs_n, %entry ], [ %shr, %while.cond ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %while.cond ]
+  %shr = ashr i32 %n.addr.0, 1
+  %tobool = icmp eq i32 %shr, 0
+  %inc = add nsw i32 %i.0, 1
+  br i1 %tobool, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret i32 %i.0
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; This test covers how instcombine may optimise the previous ctlz case.
+;
+; int ctlz(int n)
+; {
+;   n = n >= 0 ? n : -n;
+;   int i = 0;
+;   while(n >>= 1) {
+;     i++;
+;   }
+;   return i;
+; }
+
+define i32 @ctlz_fold(i32 noundef %n) {
+; CHECK-LABEL: define i32 @ctlz_fold(
+; CHECK-SAME: i32 noundef [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = tail call i32 @llvm.abs.i32(i32 [[N]], i1 true)
+; CHECK-NEXT:    [[TOBOOL_NOT5:%.*]] = icmp ult i32 [[COND]], 2
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT5]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK:       while.body.preheader:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[COND]], i1 true)
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 32, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sub i32 [[TMP2]], 1
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[TCPHI:%.*]] = phi i32 [ [[TMP3]], [[WHILE_BODY_PREHEADER]] ], [ [[TCDEC:%.*]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[N_ADDR_06:%.*]] = phi i32 [ [[SHR:%.*]], [[WHILE_BODY]] ], [ [[COND]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[SHR]] = lshr i32 [[N_ADDR_06]], 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_07]], 1
+; CHECK-NEXT:    [[TCDEC]] = sub nsw i32 [[TCPHI]], 1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TCDEC]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.end.loopexit:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[TMP3]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    br label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[I_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[I_0_LCSSA]]
+;
+entry:
+  %cond = tail call i32 @llvm.abs.i32(i32 %n, i1 true)
+  %tobool.not5 = icmp ult i32 %cond, 2
+  br i1 %tobool.not5, label %while.end, label %while.body.preheader
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %i.07 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ]
+  %n.addr.06 = phi i32 [ %shr, %while.body ], [ %cond, %while.body.preheader ]
+  %shr = lshr i32 %n.addr.06, 1
+  %inc = add nuw nsw i32 %i.07, 1
+  %tobool.not = icmp ult i32 %n.addr.06, 4
+  br i1 %tobool.not, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:                               ; preds = %while.body
+  %inc.lcssa = phi i32 [ %inc, %while.body ]
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc.lcssa, %while.end.loopexit ]
+  ret i32 %i.0.lcssa
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int ctlz_add(int n, int i0)
+; {
+;   n = n >= 0 ? n : -n;
+;   int i = i0;
+;   while(n >>= 1) {
+;     i++;
+;   }
+;   return i;
+; }
+;
+;
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz_add(i32 %n, i32 %i0) {
+; CHECK-LABEL: define i32 @ctlz_add(
+; CHECK-SAME: i32 [[N:%.*]], i32 [[I0:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ABS_N:%.*]] = call i32 @llvm.abs.i32(i32 [[N]], i1 true)
+; CHECK-NEXT:    [[TMP0:%.*]] = ashr i32 [[ABS_N]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP0]], i1 false)
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 32, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP2]], [[I0]]
+; CHECK-NEXT:    br label [[WHILE_COND:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[TCPHI:%.*]] = phi i32 [ [[TMP3]], [[ENTRY:%.*]] ], [ [[TCDEC:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[N_ADDR_0:%.*]] = phi i32 [ [[ABS_N]], [[ENTRY]] ], [ [[SHR:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[I_0:%.*]] = phi i32 [ [[I0]], [[ENTRY]] ], [ [[INC:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[SHR]] = ashr i32 [[N_ADDR_0]], 1
+; CHECK-NEXT:    [[TCDEC]] = sub nsw i32 [[TCPHI]], 1
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[TCDEC]], 0
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[WHILE_END:%.*]], label [[WHILE_COND]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[I_0_LCSSA:%.*]] = phi i32 [ [[TMP4]], [[WHILE_COND]] ]
+; CHECK-NEXT:    ret i32 [[I_0_LCSSA]]
+;
+entry:
+  %abs_n = call i32 @llvm.abs.i32(i32 %n, i1 true)
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %n.addr.0 = phi i32 [ %abs_n, %entry ], [ %shr, %while.cond ]
+  %i.0 = phi i32 [ %i0, %entry ], [ %inc, %while.cond ]
+  %shr = ashr i32 %n.addr.0, 1
+  %tobool = icmp eq i32 %shr, 0
+  %inc = add nsw i32 %i.0, 1
+  br i1 %tobool, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret i32 %i.0
+}
+
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int ctlz_sub(int n, int i0)
+; {
+;   n = n >= 0 ? n : -n;
+;   int i = i0;
+;   while(n >>= 1) {
+;     i--;
+;   }
+;   return i;
+; }
+;
+;
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz_sub(i32 %n, i32 %i0) {
+; CHECK-LABEL: define i32 @ctlz_sub(
+; CHECK-SAME: i32 [[N:%.*]], i32 [[I0:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ABS_N:%.*]] = call i32 @llvm.abs.i32(i32 [[N]], i1 true)
+; CHECK-NEXT:    [[TMP0:%.*]] = ashr i32 [[ABS_N]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP0]], i1 false)
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 32, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[I0]], [[TMP2]]
+; CHECK-NEXT:    br label [[WHILE_COND:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[TCPHI:%.*]] = phi i32 [ [[TMP3]], [[ENTRY:%.*]] ], [ [[TCDEC:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[N_ADDR_0:%.*]] = phi i32 [ [[ABS_N]], [[ENTRY]] ], [ [[SHR:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[I_0:%.*]] = phi i32 [ [[I0]], [[ENTRY]] ], [ [[INC:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[SHR]] = ashr i32 [[N_ADDR_0]], 1
+; CHECK-NEXT:    [[TCDEC]] = sub nsw i32 [[TCPHI]], 1
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[TCDEC]], 0
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_0]], -1
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[WHILE_END:%.*]], label [[WHILE_COND]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[I_0_LCSSA:%.*]] = phi i32 [ [[TMP4]], [[WHILE_COND]] ]
+; CHECK-NEXT:    ret i32 [[I_0_LCSSA]]
+;
+entry:
+  %abs_n = call i32 @llvm.abs.i32(i32 %n, i1 true)
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %n.addr.0 = phi i32 [ %abs_n, %entry ], [ %shr, %while.cond ]
+  %i.0 = phi i32 [ %i0, %entry ], [ %inc, %while.cond ]
+  %shr = ashr i32 %n.addr.0, 1
+  %tobool = icmp eq i32 %shr, 0
+  %inc = add nsw i32 %i.0, -1
+  br i1 %tobool, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret i32 %i.0
+}
+
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int ctlz_sext(short in)
+; {
+;   int n = in;
+;   if (in < 0)
+;     n = -n;
+;   int i = 0;
+;   while(n >>= 1) {
+;     i++;
+;   }
+;   return i;
+; }
+;
+
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz_sext(i16 %in) {
+; CHECK-LABEL: define i32 @ctlz_sext(
+; CHECK-SAME: i16 [[IN:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ABS:%.*]] = call i16 @llvm.abs.i16(i16 [[IN]], i1 false)
+; CHECK-NEXT:    [[ABS_N:%.*]] = zext i16 [[ABS]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = ashr i32 [[ABS_N]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP0]], i1 false)
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 32, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
+; CHECK-NEXT:    br label [[WHILE_COND:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[TCPHI:%.*]] = phi i32 [ [[TMP3]], [[ENTRY:%.*]] ], [ [[TCDEC:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[N_ADDR_0:%.*]] = phi i32 [ [[ABS_N]], [[ENTRY]] ], [ [[SHR:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[SHR]] = ashr i32 [[N_ADDR_0]], 1
+; CHECK-NEXT:    [[TCDEC]] = sub nsw i32 [[TCPHI]], 1
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[TCDEC]], 0
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[WHILE_END:%.*]], label [[WHILE_COND]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[I_0_LCSSA:%.*]] = phi i32 [ [[TMP2]], [[WHILE_COND]] ]
+; CHECK-NEXT:    ret i32 [[I_0_LCSSA]]
+;
+entry:
+  %abs = call i16 @llvm.abs.i16(i16 %in, i1 false)
+  %abs_n = zext i16 %abs to i32
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %n.addr.0 = phi i32 [ %abs_n, %entry ], [ %shr, %while.cond ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %while.cond ]
+  %shr = ashr i32 %n.addr.0, 1
+  %tobool = icmp eq i32 %shr, 0
+  %inc = add nsw i32 %i.0, 1
+  br i1 %tobool, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret i32 %i.0
+}
+
+
+; unsigned floor_log2(unsigned long n) {
+;   unsigned result = 0;
+;   while (n >>= 1) result++;
+;   return result;
+; }
+
+define i32 @floor_log2_use_inc(i64 noundef %n) {
+; CHECK-LABEL: define i32 @floor_log2_use_inc(
+; CHECK-SAME: i64 noundef [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TOBOOL_NOT2:%.*]] = icmp ult i64 [[N]], 2
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT2]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK:       while.body.preheader:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[N]], i1 true)
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 64, [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP4]] to i32
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[TCPHI:%.*]] = phi i64 [ [[TMP4]], [[WHILE_BODY_PREHEADER]] ], [ [[TCDEC:%.*]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    [[RESULT_04:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[N_ADDR_03:%.*]] = phi i64 [ [[SHR:%.*]], [[WHILE_BODY]] ], [ [[N]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[SHR]] = lshr i64 [[N_ADDR_03]], 1
+; CHECK-NEXT:    [[INC]] = add i32 [[RESULT_04]], 1
+; CHECK-NEXT:    [[TCDEC]] = sub nsw i64 [[TCPHI]], 1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i64 [[TCDEC]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.end.loopexit:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[TMP3]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    br label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
+;
+entry:
+  %tobool.not2 = icmp ult i64 %n, 2
+  br i1 %tobool.not2, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %result.04 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ]
+  %n.addr.03 = phi i64 [ %shr, %while.body ], [ %n, %while.body.preheader ]
+  %shr = lshr i64 %n.addr.03, 1
+  %inc = add i32 %result.04, 1
+  %tobool.not = icmp ult i64 %n.addr.03, 4
+  br i1 %tobool.not, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  %inc.lcssa = phi i32 [ %inc, %while.body ]
+  br label %while.end
+
+while.end:
+  %result.0.lcssa = phi i32 [ 0, %entry ], [ %inc.lcssa, %while.end.loopexit ]
+  ret i32 %result.0.lcssa
+}
+
+
+define i32 @floor_log2_use_phi(i64 noundef %n) {
+; CHECK-LABEL: define i32 @floor_log2_use_phi(
+; CHECK-SAME: i64 noundef [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TOBOOL_NOT2:%.*]] = icmp ult i64 [[N]], 2
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT2]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK:       while.body.preheader:
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[RESULT_04:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[N_ADDR_03:%.*]] = phi i64 [ [[SHR:%.*]], [[WHILE_BODY]] ], [ [[N]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[SHR]] = lshr i64 [[N_ADDR_03]], 1
+; CHECK-NEXT:    [[INC]] = add i32 [[RESULT_04]], 1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp ult i64 [[N_ADDR_03]], 4
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.end.loopexit:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[RESULT_04]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    br label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
+;
+entry:
+  %tobool.not2 = icmp ult i64 %n, 2
+  br i1 %tobool.not2, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %result.04 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ]
+  %n.addr.03 = phi i64 [ %shr, %while.body ], [ %n, %while.body.preheader ]
+  %shr = lshr i64 %n.addr.03, 1
+  %inc = add i32 %result.04, 1
+  %tobool.not = icmp ult i64 %n.addr.03, 4
+  br i1 %tobool.not, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  %inc.lcssa = phi i32 [ %result.04, %while.body ]
+  br label %while.end
+
+while.end:
+  %result.0.lcssa = phi i32 [ 0, %entry ], [ %inc.lcssa, %while.end.loopexit ]
+  ret i32 %result.0.lcssa
+}
+
+
+; unsigned floor_log2_dec(unsigned long n) {
+;   unsigned result = 0;
+;   while (n >>= 1) result--;
+;   return result;
+; }
+
+define i32 @floor_log2_dec(i64 noundef %n) {
+; CHECK-LABEL: define i32 @floor_log2_dec(
+; CHECK-SAME: i64 noundef [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TOBOOL_NOT2:%.*]] = icmp ult i64 [[N]], 2
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT2]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK:       while.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[N]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 64, [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 0, [[TMP3]]
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[TCPHI:%.*]] = phi i64 [ [[TMP2]], [[WHILE_BODY_PREHEADER]] ], [ [[TCDEC:%.*]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    [[RESULT_04:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[N_ADDR_03:%.*]] = phi i64 [ [[SHR:%.*]], [[WHILE_BODY]] ], [ [[N]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[SHR]] = lshr i64 [[N_ADDR_03]], 1
+; CHECK-NEXT:    [[INC]] = add i32 [[RESULT_04]], -1
+; CHECK-NEXT:    [[TCDEC]] = sub nsw i64 [[TCPHI]], 1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i64 [[TCDEC]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.end.loopexit:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[TMP4]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    br label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
+;
+entry:
+  %tobool.not2 = icmp ult i64 %n, 2
+  br i1 %tobool.not2, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %result.04 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ]
+  %n.addr.03 = phi i64 [ %shr, %while.body ], [ %n, %while.body.preheader ]
+  %shr = lshr i64 %n.addr.03, 1
+  %inc = add i32 %result.04, -1
+  %tobool.not = icmp ult i64 %n.addr.03, 4
+  br i1 %tobool.not, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  %inc.lcssa = phi i32 [ %inc, %while.body ]
+  br label %while.end
+
+while.end:
+  %result.0.lcssa = phi i32 [ 0, %entry ], [ %inc.lcssa, %while.end.loopexit ]
+  ret i32 %result.0.lcssa
+}
+
+
+; unsigned int_log2_rec(unsigned x) {
+;   return x == 0 ? 0 : int_log2_rec(x >> 1) + 1;
+; }
+
+define i32 @int_log2_rec(i32 noundef %x) {
+; CHECK-LABEL: define i32 @int_log2_rec(
+; CHECK-SAME: i32 noundef [[X:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    br i1 [[CMP2]], label [[COND_END:%.*]], label [[COND_FALSE_PREHEADER:%.*]]
+; CHECK:       cond.false.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[X]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 32, [[TMP0]]
+; CHECK-NEXT:    br label [[COND_FALSE:%.*]]
+; CHECK:       cond.false:
+; CHECK-NEXT:    [[TCPHI:%.*]] = phi i32 [ [[TMP1]], [[COND_FALSE_PREHEADER]] ], [ [[TCDEC:%.*]], [[COND_FALSE]] ]
+; CHECK-NEXT:    [[X_TR4:%.*]] = phi i32 [ [[SHR:%.*]], [[COND_FALSE]] ], [ [[X]], [[COND_FALSE_PREHEADER]] ]
+; CHECK-NEXT:    [[ACCUMULATOR_TR3:%.*]] = phi i32 [ [[ADD:%.*]], [[COND_FALSE]] ], [ 0, [[COND_FALSE_PREHEADER]] ]
+; CHECK-NEXT:    [[SHR]] = lshr i32 [[X_TR4]], 1
+; CHECK-NEXT:    [[ADD]] = add i32 [[ACCUMULATOR_TR3]], 1
+; CHECK-NEXT:    [[TCDEC]] = sub nsw i32 [[TCPHI]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TCDEC]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[COND_END_LOOPEXIT:%.*]], label [[COND_FALSE]]
+; CHECK:       cond.end.loopexit:
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[TMP1]], [[COND_FALSE]] ]
+; CHECK-NEXT:    br label [[COND_END]]
+; CHECK:       cond.end:
+; CHECK-NEXT:    [[ACCUMULATOR_TR_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[COND_END_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[ACCUMULATOR_TR_LCSSA]]
+;
+entry:
+  %cmp2 = icmp eq i32 %x, 0
+  br i1 %cmp2, label %cond.end, label %cond.false.preheader
+
+cond.false.preheader:                             ; preds = %entry
+  br label %cond.false
+
+cond.false:                                       ; preds = %cond.false.preheader, %cond.false
+  %x.tr4 = phi i32 [ %shr, %cond.false ], [ %x, %cond.false.preheader ]
+  %accumulator.tr3 = phi i32 [ %add, %cond.false ], [ 0, %cond.false.preheader ]
+  %shr = lshr i32 %x.tr4, 1
+  %add = add i32 %accumulator.tr3, 1
+  %cmp = icmp ult i32 %x.tr4, 2
+  br i1 %cmp, label %cond.end.loopexit, label %cond.false
+
+cond.end.loopexit:                                ; preds = %cond.false
+  %add.lcssa = phi i32 [ %add, %cond.false ]
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.end.loopexit, %entry
+  %accumulator.tr.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %cond.end.loopexit ]
+  ret i32 %accumulator.tr.lcssa
+}
+
+
+; We can't easily transform this loop. It returns 1 for an input of both
+; 0 and 1.
+; int ctlz_do_while_use_inc(unsigned n)
+; {
+;   int i = 0;
+;   do {
+;     i++;
+;     n >>= 1;
+;   } while(n != 0);
+;   return i;
+; }
+
+define i32 @ctlz_do_while_use_inc(i32 noundef %n) {
+; CHECK-LABEL: define i32 @ctlz_do_while_use_inc(
+; CHECK-SAME: i32 noundef [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[DO_BODY:%.*]]
+; CHECK:       do.body:
+; CHECK-NEXT:    [[N_ADDR_0:%.*]] = phi i32 [ [[N]], [[ENTRY:%.*]] ], [ [[SHR:%.*]], [[DO_BODY]] ]
+; CHECK-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[DO_BODY]] ]
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_0]], 1
+; CHECK-NEXT:    [[SHR]] = lshr i32 [[N_ADDR_0]], 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp ult i32 [[N_ADDR_0]], 2
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[DO_END:%.*]], label [[DO_BODY]]
+; CHECK:       do.end:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[DO_BODY]] ]
+; CHECK-NEXT:    ret i32 [[INC_LCSSA]]
+;
+entry:
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %do.body ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %do.body ]
+  %inc = add nuw nsw i32 %i.0, 1
+  %shr = lshr i32 %n.addr.0, 1
+  %cmp.not = icmp ult i32 %n.addr.0, 2
+  br i1 %cmp.not, label %do.end, label %do.body
+
+do.end:                                           ; preds = %do.body
+  %inc.lcssa = phi i32 [ %inc, %do.body ]
+  ret i32 %inc.lcssa
+}
+
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int ctlz_do_while_use_phi(unsigned n)
+; {
+;   int phi;
+;   int inc = 0;
+;   do {
+;     phi = inc;
+;     inc++;
+;     n >>= 1;
+;   } while(n != 0);
+;   return phi;
+; }
+
+define i32 @ctlz_do_while_use_phi(i32 noundef %n) {
+; CHECK-LABEL: define i32 @ctlz_do_while_use_phi(
+; CHECK-SAME: i32 noundef [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = lshr i32 [[N]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP0]], i1 false)
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 32, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
+; CHECK-NEXT:    br label [[DO_BODY:%.*]]
+; CHECK:       do.body:
+; CHECK-NEXT:    [[TCPHI:%.*]] = phi i32 [ [[TMP3]], [[ENTRY:%.*]] ], [ [[TCDEC:%.*]], [[DO_BODY]] ]
+; CHECK-NEXT:    [[N_ADDR_0:%.*]] = phi i32 [ [[N]], [[ENTRY]] ], [ [[SHR:%.*]], [[DO_BODY]] ]
+; CHECK-NEXT:    [[INC_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC1:%.*]], [[DO_BODY]] ]
+; CHECK-NEXT:    [[INC1]] = add nuw nsw i32 [[INC_0]], 1
+; CHECK-NEXT:    [[SHR]] = lshr i32 [[N_ADDR_0]], 1
+; CHECK-NEXT:    [[TCDEC]] = sub nsw i32 [[TCPHI]], 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[TCDEC]], 0
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[DO_END:%.*]], label [[DO_BODY]]
+; CHECK:       do.end:
+; CHECK-NEXT:    [[INC_0_LCSSA:%.*]] = phi i32 [ [[TMP2]], [[DO_BODY]] ]
+; CHECK-NEXT:    ret i32 [[INC_0_LCSSA]]
+;
+entry:
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %do.body ]
+  %inc.0 = phi i32 [ 0, %entry ], [ %inc1, %do.body ]
+  %inc1 = add nuw nsw i32 %inc.0, 1
+  %shr = lshr i32 %n.addr.0, 1
+  %cmp.not = icmp ult i32 %n.addr.0, 2
+  br i1 %cmp.not, label %do.end, label %do.body
+
+do.end:                                           ; preds = %do.body
+  ret i32 %inc.0
+}
+
+
+declare i32 @llvm.abs.i32(i32, i1)
+declare i16 @llvm.abs.i16(i16, i1)

>From 62318760271987a16717cc26a2f103f19f8af996 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Mon, 8 Jul 2024 18:39:48 +0000
Subject: [PATCH 2/2] Use APInt for threshold in matchShiftULTCondition

Use APInt for the threshold in matchShiftULTCondition, so that the
function works for integers of arbitrary precision (rather than just
those within the range of uint64_t).
---
 .../Transforms/Scalar/LoopIdiomRecognize.cpp  | 10 +++---
 .../test/Transforms/LoopIdiom/AArch64/ctlz.ll | 31 +++++++++++++++++++
 2 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index b3e3e1e71f101..0ee1afa76a823 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -1530,7 +1530,7 @@ static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
 /// the control yields to the loop entry. If the branch matches the behaviour,
 /// the variable involved in the comparison is returned.
 static Value *matchShiftULTCondition(BranchInst *BI, BasicBlock *LoopEntry,
-                                     uint64_t &Threshold) {
+                                     APInt &Threshold) {
   if (!BI || !BI->isConditional())
     return nullptr;
 
@@ -1546,7 +1546,7 @@ static Value *matchShiftULTCondition(BranchInst *BI, BasicBlock *LoopEntry,
   ICmpInst::Predicate Pred = Cond->getPredicate();
 
   if (Pred == ICmpInst::ICMP_ULT && FalseSucc == LoopEntry) {
-    Threshold = CmpConst->getZExtValue();
+    Threshold = CmpConst->getValue();
     return Cond->getOperand(0);
   }
 
@@ -1595,7 +1595,7 @@ static bool detectShiftUntilLessThanIdiom(Loop *CurLoop, const DataLayout &DL,
                                           Intrinsic::ID &IntrinID,
                                           Value *&InitX, Instruction *&CntInst,
                                           PHINode *&CntPhi, Instruction *&DefX,
-                                          uint64_t &Threshold) {
+                                          APInt &Threshold) {
   BasicBlock *LoopEntry;
 
   DefX = nullptr;
@@ -2012,7 +2012,7 @@ bool LoopIdiomRecognize::recognizeShiftUntilLessThan() {
   PHINode *CntPhi = nullptr;
   Instruction *CntInst = nullptr;
 
-  uint64_t LoopThreshold;
+  APInt LoopThreshold;
   if (!detectShiftUntilLessThanIdiom(CurLoop, *DL, IntrinID, InitX, CntInst,
                                      CntPhi, DefX, LoopThreshold))
     return false;
@@ -2041,7 +2041,7 @@ bool LoopIdiomRecognize::recognizeShiftUntilLessThan() {
   if (!PreCondBI)
     return false;
 
-  uint64_t PreLoopThreshold;
+  APInt PreLoopThreshold;
   if (matchShiftULTCondition(PreCondBI, PH, PreLoopThreshold) != InitX ||
       PreLoopThreshold != 2)
     return false;
diff --git a/llvm/test/Transforms/LoopIdiom/AArch64/ctlz.ll b/llvm/test/Transforms/LoopIdiom/AArch64/ctlz.ll
index 47ae4fd5b66a7..5a182745399be 100644
--- a/llvm/test/Transforms/LoopIdiom/AArch64/ctlz.ll
+++ b/llvm/test/Transforms/LoopIdiom/AArch64/ctlz.ll
@@ -773,6 +773,37 @@ do.end:                                           ; preds = %do.body
   ret i32 %inc.0
 }
 
+; Check that we correctly bail on analysis when the ult comparison is with a
+; constant that exceeds the (unsigned) range of a 64-bit integer, as we currently
+; only handle loopback condition ult 2 or 4.
+
+define i128 @large_constant(i128 noundef %n) {
+; CHECK-LABEL: define i128 @large_constant(
+; CHECK-SAME: i128 noundef [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[DO_BODY:%.*]]
+; CHECK:       do.body:
+; CHECK-NEXT:    [[N_ADDR_0:%.*]] = phi i128 [ [[N]], [[ENTRY:%.*]] ], [ [[SHR:%.*]], [[DO_BODY]] ]
+; CHECK-NEXT:    [[SHR]] = lshr i128 [[N_ADDR_0]], 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp ult i128 [[N_ADDR_0]], 18446744073709551616
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[DO_END:%.*]], label [[DO_BODY]]
+; CHECK:       do.end:
+; CHECK-NEXT:    [[SHR_LCSSA:%.*]] = phi i128 [ [[SHR]], [[DO_BODY]] ]
+; CHECK-NEXT:    ret i128 [[SHR_LCSSA]]
+;
+entry:
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %n.addr.0 = phi i128 [ %n, %entry ], [ %shr, %do.body ]
+  %shr = lshr i128 %n.addr.0, 1
+  %cmp.not = icmp ult i128 %n.addr.0, 18446744073709551616
+  br i1 %cmp.not, label %do.end, label %do.body
+
+do.end:                                           ; preds = %do.body
+  ret i128 %shr
+}
+
 
 declare i32 @llvm.abs.i32(i32, i1)
 declare i16 @llvm.abs.i16(i16, i1)



More information about the llvm-commits mailing list