[llvm] 83b01aa - [LoopIdiom] Support 'shift until less-than' idiom (#95002)

via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 8 06:32:11 PDT 2024


Author: Hari Limaye
Date: 2024-07-08T14:32:08+01:00
New Revision: 83b01aaf51072a07261ee2e5fc14102f71273bc0

URL: https://github.com/llvm/llvm-project/commit/83b01aaf51072a07261ee2e5fc14102f71273bc0
DIFF: https://github.com/llvm/llvm-project/commit/83b01aaf51072a07261ee2e5fc14102f71273bc0.diff

LOG: [LoopIdiom] Support 'shift until less-than' idiom (#95002)

The current loop idiom code for recognising and inserting a CTLZ
intrinsic does not support loops where the loopback control is based on
an unsigned less-than condition. This patch adds support for recognising
these loops and inserting a CTLZ intrinsic.

Fixes the missed optimization cases in #51064

---------

Co-authored-by: David Sherwood <david.sherwood at arm.com>

Added: 
    llvm/test/Transforms/LoopIdiom/AArch64/ctlz.ll

Modified: 
    llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 635bd1236196e..b3e3e1e71f101 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -231,12 +231,19 @@ class LoopIdiomRecognize {
   bool recognizePopcount();
   void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
                                PHINode *CntPhi, Value *Var);
+  bool isProfitableToInsertFFS(Intrinsic::ID IntrinID, Value *InitX,
+                               bool ZeroCheck, size_t CanonicalSize);
+  bool insertFFSIfProfitable(Intrinsic::ID IntrinID, Value *InitX,
+                             Instruction *DefX, PHINode *CntPhi,
+                             Instruction *CntInst);
   bool recognizeAndInsertFFS();  /// Find First Set: ctlz or cttz
+  bool recognizeShiftUntilLessThan();
   void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
                                 Instruction *CntInst, PHINode *CntPhi,
                                 Value *Var, Instruction *DefX,
                                 const DebugLoc &DL, bool ZeroCheck,
-                                bool IsCntPhiUsedOutsideLoop);
+                                bool IsCntPhiUsedOutsideLoop,
+                                bool InsertSub = false);
 
   bool recognizeShiftUntilBitTest();
   bool recognizeShiftUntilZero();
@@ -1482,7 +1489,8 @@ bool LoopIdiomRecognize::runOnNoncountableLoop() {
                     << CurLoop->getHeader()->getName() << "\n");
 
   return recognizePopcount() || recognizeAndInsertFFS() ||
-         recognizeShiftUntilBitTest() || recognizeShiftUntilZero();
+         recognizeShiftUntilBitTest() || recognizeShiftUntilZero() ||
+         recognizeShiftUntilLessThan();
 }
 
 /// Check if the given conditional branch is based on the comparison between
@@ -1517,6 +1525,34 @@ static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
   return nullptr;
 }
 
+/// Check if the given conditional branch is based on an unsigned less-than
+/// comparison between a variable and a constant, and if the comparison is false
+/// the control yields to the loop entry. If the branch matches the behaviour,
+/// \p Threshold is set to the constant and the compared variable is returned;
+/// otherwise nullptr is returned and \p Threshold is left untouched.
+static Value *matchShiftULTCondition(BranchInst *BI, BasicBlock *LoopEntry,
+                                     uint64_t &Threshold) {
+  if (!BI || !BI->isConditional())
+    return nullptr;
+
+  auto *Cond = dyn_cast<ICmpInst>(BI->getCondition());
+  if (!Cond || Cond->getPredicate() != ICmpInst::ICMP_ULT)
+    return nullptr;
+
+  // The "false" successor (index 1) must be the loop entry.
+  if (BI->getSuccessor(1) != LoopEntry)
+    return nullptr;
+
+  // ConstantInt::getZExtValue() asserts when the value needs more than 64
+  // bits (e.g. an i128 compare), so reject such constants instead of crashing.
+  auto *CmpConst = dyn_cast<ConstantInt>(Cond->getOperand(1));
+  if (!CmpConst || CmpConst->getValue().getActiveBits() > 64)
+    return nullptr;
+
+  Threshold = CmpConst->getZExtValue();
+  return Cond->getOperand(0);
+}
+
 // Check if the recurrence variable `VarX` is in the right form to create
 // the idiom. Returns the value coerced to a PHINode if so.
 static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
@@ -1528,6 +1564,107 @@ static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
   return nullptr;
 }
 
+/// Return true iff the 'shift until less-than' idiom is detected in the loop.
+///
+/// Additionally:
+/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
+///       or nullptr if there is no such.
+/// 2) \p CntPhi is set to the corresponding phi node
+///       or nullptr if there is no such.
+/// 3) \p InitX is set to the value whose CTLZ could be used.
+/// 4) \p DefX is set to the shift instruction "x.next = x >> 1".
+/// 5) \p Threshold is set to the constant involved in the unsigned less-than
+///       comparison.
+///
+/// The core idiom we are trying to detect is:
+/// \code
+///    if (x0 < 2)
+///      goto loop-exit // the precondition of the loop
+///    cnt0 = init-val
+///    do {
+///      x = phi (x0, x.next);   //PhiX
+///      cnt = phi (cnt0, cnt.next)
+///
+///      cnt.next = cnt + 1;
+///       ...
+///      x.next = x >> 1;   // DefX
+///    } while (x >= 4)
+/// loop-exit:
+/// \endcode
+static bool detectShiftUntilLessThanIdiom(Loop *CurLoop, const DataLayout &DL,
+                                          Intrinsic::ID &IntrinID,
+                                          Value *&InitX, Instruction *&CntInst,
+                                          PHINode *&CntPhi, Instruction *&DefX,
+                                          uint64_t &Threshold) {
+  BasicBlock *LoopEntry;
+
+  DefX = nullptr;
+  CntInst = nullptr;
+  CntPhi = nullptr;
+  LoopEntry = *(CurLoop->block_begin());
+
+  // step 1: Check if the loop-back branch is in desirable form.
+  if (Value *T = matchShiftULTCondition(
+          dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry,
+          Threshold))
+    DefX = dyn_cast<Instruction>(T);
+  else
+    return false;
+
+  // step 2: Check the recurrence of variable X (the compared value is PhiX)
+  if (!DefX || !isa<PHINode>(DefX))
+    return false;
+
+  PHINode *VarPhi = cast<PHINode>(DefX);
+  int Idx = VarPhi->getBasicBlockIndex(LoopEntry);
+  if (Idx == -1)
+    return false;
+
+  DefX = dyn_cast<Instruction>(VarPhi->getIncomingValue(Idx));
+  if (!DefX || DefX->getNumOperands() == 0 || DefX->getOperand(0) != VarPhi)
+    return false;
+
+  // step 3: detect instructions corresponding to "x.next = x >> 1"
+  if (DefX->getOpcode() != Instruction::LShr)
+    return false;
+
+  IntrinID = Intrinsic::ctlz;
+  ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
+  if (!Shft || !Shft->isOne())
+    return false;
+
+  InitX = VarPhi->getIncomingValueForBlock(CurLoop->getLoopPreheader());
+
+  // step 4: Find the instruction which counts the CTLZ: cnt.next = cnt + 1
+  //         or cnt.next = cnt + -1.
+  // TODO: We can skip the step. If loop trip count is known (CTLZ),
+  //       then all uses of "cnt.next" could be optimized to the trip count
+  //       plus "cnt0". Currently it is not optimized.
+  //       This step could be used to detect POPCNT instruction:
+  //       cnt.next = cnt + (x.next & 1)
+  for (Instruction &Inst : llvm::make_range(
+           LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
+    if (Inst.getOpcode() != Instruction::Add)
+      continue;
+
+    ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
+    if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
+      continue;
+
+    PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
+    if (!Phi)
+      continue;
+
+    CntInst = &Inst;
+    CntPhi = Phi;
+    break;
+  }
+  if (!CntInst)
+    return false;
+
+  return true;
+}
+
 /// Return true iff the idiom is detected in the loop.
 ///
 /// Additionally:
@@ -1756,27 +1893,35 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
   return true;
 }
 
-/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
-/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
-/// trip count returns true; otherwise, returns false.
-bool LoopIdiomRecognize::recognizeAndInsertFFS() {
-  // Give up if the loop has multiple blocks or multiple backedges.
-  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
-    return false;
+// Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
+// profitable if we delete the loop.
+bool LoopIdiomRecognize::isProfitableToInsertFFS(Intrinsic::ID IntrinID,
+                                                 Value *InitX, bool ZeroCheck,
+                                                 size_t CanonicalSize) {
+  const Value *Args[] = {InitX,
+                         ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
 
-  Intrinsic::ID IntrinID;
-  Value *InitX;
-  Instruction *DefX = nullptr;
-  PHINode *CntPhi = nullptr;
-  Instruction *CntInst = nullptr;
-  // Help decide if transformation is profitable. For ShiftUntilZero idiom,
-  // this is always 6.
-  size_t IdiomCanonicalSize = 6;
+  // @llvm.dbg doesn't count as they have no semantic effect.
+  auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
+  uint32_t HeaderSize =
+      std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
 
-  if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
-                                 CntInst, CntPhi, DefX))
+  IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
+  InstructionCost Cost = TTI->getIntrinsicInstrCost(
+      Attrs, TargetTransformInfo::TCK_SizeAndLatency);
+  if (HeaderSize != CanonicalSize && Cost > TargetTransformInfo::TCC_Basic)
     return false;
 
+  return true;
+}
+
+/// Convert CTLZ / CTTZ idiom loop into countable loop.
+/// If CTLZ / CTTZ inserted as a new trip count returns true; otherwise,
+/// returns false.
+bool LoopIdiomRecognize::insertFFSIfProfitable(Intrinsic::ID IntrinID,
+                                               Value *InitX, Instruction *DefX,
+                                               PHINode *CntPhi,
+                                               Instruction *CntInst) {
   bool IsCntPhiUsedOutsideLoop = false;
   for (User *U : CntPhi->users())
     if (!CurLoop->contains(cast<Instruction>(U))) {
@@ -1818,35 +1963,107 @@ bool LoopIdiomRecognize::recognizeAndInsertFFS() {
     ZeroCheck = true;
   }
 
-  // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
-  // profitable if we delete the loop.
-
-  // the loop has only 6 instructions:
+  // FFS idiom loop has only 6 instructions:
   //  %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
   //  %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
   //  %shr = ashr %n.addr.0, 1
   //  %tobool = icmp eq %shr, 0
   //  %inc = add nsw %i.0, 1
   //  br i1 %tobool
+  size_t IdiomCanonicalSize = 6;
+  if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, IdiomCanonicalSize))
+    return false;
 
-  const Value *Args[] = {InitX,
-                         ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
+  transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
+                           DefX->getDebugLoc(), ZeroCheck,
+                           IsCntPhiUsedOutsideLoop);
+  return true;
+}
 
-  // @llvm.dbg doesn't count as they have no semantic effect.
-  auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
-  uint32_t HeaderSize =
-      std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
+/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
+/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
+/// trip count returns true; otherwise, returns false.
+bool LoopIdiomRecognize::recognizeAndInsertFFS() {
+  // Give up if the loop has multiple blocks or multiple backedges.
+  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+    return false;
 
-  IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
-  InstructionCost Cost =
-    TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
-  if (HeaderSize != IdiomCanonicalSize &&
-      Cost > TargetTransformInfo::TCC_Basic)
+  Intrinsic::ID IntrinID;
+  Value *InitX;
+  Instruction *DefX = nullptr;
+  PHINode *CntPhi = nullptr;
+  Instruction *CntInst = nullptr;
+
+  if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX, CntInst, CntPhi,
+                                 DefX))
+    return false;
+
+  return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst);
+}
+
+bool LoopIdiomRecognize::recognizeShiftUntilLessThan() {
+  // Give up if the loop has multiple blocks or multiple backedges.
+  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+    return false;
+
+  Intrinsic::ID IntrinID;
+  Value *InitX;
+  Instruction *DefX = nullptr;
+  PHINode *CntPhi = nullptr;
+  Instruction *CntInst = nullptr;
+
+  uint64_t LoopThreshold;
+  if (!detectShiftUntilLessThanIdiom(CurLoop, *DL, IntrinID, InitX, CntInst,
+                                     CntPhi, DefX, LoopThreshold))
+    return false;
+
+  if (LoopThreshold == 2) {
+    // Treat as regular FFS.
+    return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst);
+  }
+
+  // Look for Floor Log2 Idiom.
+  if (LoopThreshold != 4)
+    return false;
+
+  // Abort if CntPhi is used outside of the loop.
+  for (User *U : CntPhi->users())
+    if (!CurLoop->contains(cast<Instruction>(U)))
+      return false;
+
+  // It is safe to assume the preheader exists, as it was checked in the
+  // parent function RunOnLoop.
+  BasicBlock *PH = CurLoop->getLoopPreheader();
+  auto *PreCondBB = PH->getSinglePredecessor();
+  if (!PreCondBB)
+    return false;
+  auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
+  if (!PreCondBI)
+    return false;
+
+  uint64_t PreLoopThreshold;
+  if (matchShiftULTCondition(PreCondBI, PH, PreLoopThreshold) != InitX ||
+      PreLoopThreshold != 2)
     return false;
 
+  bool ZeroCheck = true;
+
+  // the loop has only 6 instructions:
+  //  %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
+  //  %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
+  //  %shr = lshr %n.addr.0, 1
+  //  %tobool = icmp ult %n.addr.0, C
+  //  %inc = add nsw %i.0, 1
+  //  br i1 %tobool
+  size_t IdiomCanonicalSize = 6;
+  if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, IdiomCanonicalSize))
+    return false;
+
+  // floor(log2(x)) = w - 1 - ctlz(x), where w is the bit width of x.
   transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
                            DefX->getDebugLoc(), ZeroCheck,
-                           IsCntPhiUsedOutsideLoop);
+                           /*IsCntPhiUsedOutsideLoop=*/false,
+                           /*InsertSub=*/true);
   return true;
 }
 
@@ -1961,7 +2178,7 @@ static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
 void LoopIdiomRecognize::transformLoopToCountable(
     Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
     PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
-    bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
+    bool ZeroCheck, bool IsCntPhiUsedOutsideLoop, bool InsertSub) {
   BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
 
   // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
@@ -1991,6 +2208,8 @@ void LoopIdiomRecognize::transformLoopToCountable(
   Type *CountTy = Count->getType();
   Count = Builder.CreateSub(
       ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
+  if (InsertSub)
+    Count = Builder.CreateSub(Count, ConstantInt::get(CountTy, 1));
   Value *NewCount = Count;
   if (IsCntPhiUsedOutsideLoop)
     Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));

diff  --git a/llvm/test/Transforms/LoopIdiom/AArch64/ctlz.ll b/llvm/test/Transforms/LoopIdiom/AArch64/ctlz.ll
new file mode 100644
index 0000000000000..47ae4fd5b66a7
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/AArch64/ctlz.ll
@@ -0,0 +1,778 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-idiom -mtriple=aarch64 < %s -S | FileCheck %s
+
+; Recognize CTLZ builtin pattern.
+; Here we'll just convert loop to countable,
+; so do not insert builtin if the CPU does not support CTLZ
+;
+; int ctlz_and_other(int n, char *a)
+; {
+;   n = n >= 0 ? n : -n;
+;   int i = 0, n0 = n;
+;   while(n >>= 1) {
+;     a[i] = (n0 & (1 << i)) ? 1 : 0;
+;     i++;
+;   }
+;   return i;
+; }
+;
+
+; Function Attrs: norecurse nounwind uwtable
+define i32 @ctlz_and_other(i32 %n, ptr nocapture %a) {
+; CHECK-LABEL: define i32 @ctlz_and_other(
+; CHECK-SAME: i32 [[N:%.*]], ptr nocapture [[A:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ABS_N:%.*]] = call i32 @llvm.abs.i32(i32 [[N]], i1 true)
+; CHECK-NEXT:    [[SHR8:%.*]] = lshr i32 [[ABS_N]], 1
+; CHECK-NEXT:    [[TOBOOL9:%.*]] = icmp eq i32 [[SHR8]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL9]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK:       while.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[SHR8]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 32, [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[TCPHI:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY_PREHEADER]] ], [ [[TCDEC:%.*]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[SHR11:%.*]] = phi i32 [ [[SHR:%.*]], [[WHILE_BODY]] ], [ [[SHR8]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[INDVARS_IV]] to i32
+; CHECK-NEXT:    [[SHL:%.*]] = shl i32 1, [[TMP3]]
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[SHL]], [[ABS_N]]
+; CHECK-NEXT:    [[TOBOOL1:%.*]] = icmp ne i32 [[AND]], 0
+; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[TOBOOL1]] to i8
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i8 [[CONV]], ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[SHR]] = ashr i32 [[SHR11]], 1
+; CHECK-NEXT:    [[TCDEC]] = sub nsw i32 [[TCPHI]], 1
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[TCDEC]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.end.loopexit:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT_LCSSA:%.*]] = phi i64 [ [[TMP2]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[INDVARS_IV_NEXT_LCSSA]] to i32
+; CHECK-NEXT:    br label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[I_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP4]], [[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[I_0_LCSSA]]
+;
+entry:
+  %abs_n = call i32 @llvm.abs.i32(i32 %n, i1 true)
+  %shr8 = lshr i32 %abs_n, 1
+  %tobool9 = icmp eq i32 %shr8, 0
+  br i1 %tobool9, label %while.end, label %while.body.preheader
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %while.body ], [ 0, %while.body.preheader ]
+  %shr11 = phi i32 [ %shr, %while.body ], [ %shr8, %while.body.preheader ]
+  %0 = trunc i64 %indvars.iv to i32
+  %shl = shl i32 1, %0
+  %and = and i32 %shl, %abs_n
+  %tobool1 = icmp ne i32 %and, 0
+  %conv = zext i1 %tobool1 to i8
+  %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+  store i8 %conv, ptr %arrayidx, align 1
+  %indvars.iv.next = add nuw i64 %indvars.iv, 1
+  %shr = ashr i32 %shr11, 1
+  %tobool = icmp eq i32 %shr, 0
+  br i1 %tobool, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:                               ; preds = %while.body
+  %1 = trunc i64 %indvars.iv.next to i32
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %i.0.lcssa = phi i32 [ 0, %entry ], [ %1, %while.end.loopexit ]
+  ret i32 %i.0.lcssa
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int ctlz_zero_check(int n)
+; {
+;   n = n >= 0 ? n : -n;
+;   int i = 0;
+;   while(n) {
+;     n >>= 1;
+;     i++;
+;   }
+;   return i;
+; }
+;
+
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz_zero_check(i32 %n) {
+; CHECK-LABEL: define i32 @ctlz_zero_check(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ABS_N:%.*]] = call i32 @llvm.abs.i32(i32 [[N]], i1 true)
+; CHECK-NEXT:    [[TOBOOL4:%.*]] = icmp eq i32 [[ABS_N]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL4]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK:       while.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[ABS_N]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 32, [[TMP0]]
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[TCPHI:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY_PREHEADER]] ], [ [[TCDEC:%.*]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    [[I_06:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[N_ADDR_05:%.*]] = phi i32 [ [[SHR:%.*]], [[WHILE_BODY]] ], [ [[ABS_N]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[SHR]] = ashr i32 [[N_ADDR_05]], 1
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_06]], 1
+; CHECK-NEXT:    [[TCDEC]] = sub nsw i32 [[TCPHI]], 1
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[TCDEC]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.end.loopexit:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[TMP1]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    br label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[I_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[I_0_LCSSA]]
+;
+entry:
+  %abs_n = call i32 @llvm.abs.i32(i32 %n, i1 true)
+  %tobool4 = icmp eq i32 %abs_n, 0
+  br i1 %tobool4, label %while.end, label %while.body.preheader
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %i.06 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ]
+  %n.addr.05 = phi i32 [ %shr, %while.body ], [ %abs_n, %while.body.preheader ]
+  %shr = ashr i32 %n.addr.05, 1
+  %inc = add nsw i32 %i.06, 1
+  %tobool = icmp eq i32 %shr, 0
+  br i1 %tobool, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:                               ; preds = %while.body
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.end.loopexit ]
+  ret i32 %i.0.lcssa
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int ctlz(int n)
+; {
+;   n = n >= 0 ? n : -n;
+;   int i = 0;
+;   while(n >>= 1) {
+;     i++;
+;   }
+;   return i;
+; }
+;
+
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz(i32 %n) {
+; CHECK-LABEL: define i32 @ctlz(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ABS_N:%.*]] = call i32 @llvm.abs.i32(i32 [[N]], i1 true)
+; CHECK-NEXT:    [[TMP0:%.*]] = ashr i32 [[ABS_N]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP0]], i1 false)
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 32, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
+; CHECK-NEXT:    br label [[WHILE_COND:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[TCPHI:%.*]] = phi i32 [ [[TMP3]], [[ENTRY:%.*]] ], [ [[TCDEC:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[N_ADDR_0:%.*]] = phi i32 [ [[ABS_N]], [[ENTRY]] ], [ [[SHR:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[SHR]] = ashr i32 [[N_ADDR_0]], 1
+; CHECK-NEXT:    [[TCDEC]] = sub nsw i32 [[TCPHI]], 1
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[TCDEC]], 0
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[WHILE_END:%.*]], label [[WHILE_COND]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[I_0_LCSSA:%.*]] = phi i32 [ [[TMP2]], [[WHILE_COND]] ]
+; CHECK-NEXT:    ret i32 [[I_0_LCSSA]]
+;
+entry:
+  %abs_n = call i32 @llvm.abs.i32(i32 %n, i1 true)
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %n.addr.0 = phi i32 [ %abs_n, %entry ], [ %shr, %while.cond ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %while.cond ]
+  %shr = ashr i32 %n.addr.0, 1
+  %tobool = icmp eq i32 %shr, 0
+  %inc = add nsw i32 %i.0, 1
+  br i1 %tobool, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret i32 %i.0
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; This test covers how instcombine may optimise the previous ctlz case.
+;
+; int ctlz(int n)
+; {
+;   n = n >= 0 ? n : -n;
+;   int i = 0;
+;   while(n >>= 1) {
+;     i++;
+;   }
+;   return i;
+; }
+
+define i32 @ctlz_fold(i32 noundef %n) {
+; CHECK-LABEL: define i32 @ctlz_fold(
+; CHECK-SAME: i32 noundef [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = tail call i32 @llvm.abs.i32(i32 [[N]], i1 true)
+; CHECK-NEXT:    [[TOBOOL_NOT5:%.*]] = icmp ult i32 [[COND]], 2
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT5]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK:       while.body.preheader:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[COND]], i1 true)
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 32, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sub i32 [[TMP2]], 1
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[TCPHI:%.*]] = phi i32 [ [[TMP3]], [[WHILE_BODY_PREHEADER]] ], [ [[TCDEC:%.*]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[N_ADDR_06:%.*]] = phi i32 [ [[SHR:%.*]], [[WHILE_BODY]] ], [ [[COND]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[SHR]] = lshr i32 [[N_ADDR_06]], 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_07]], 1
+; CHECK-NEXT:    [[TCDEC]] = sub nsw i32 [[TCPHI]], 1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TCDEC]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.end.loopexit:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[TMP3]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    br label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[I_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[I_0_LCSSA]]
+;
+entry:
+  %cond = tail call i32 @llvm.abs.i32(i32 %n, i1 true)
+  %tobool.not5 = icmp ult i32 %cond, 2
+  br i1 %tobool.not5, label %while.end, label %while.body.preheader
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %i.07 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ]
+  %n.addr.06 = phi i32 [ %shr, %while.body ], [ %cond, %while.body.preheader ]
+  %shr = lshr i32 %n.addr.06, 1
+  %inc = add nuw nsw i32 %i.07, 1
+  %tobool.not = icmp ult i32 %n.addr.06, 4
+  br i1 %tobool.not, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:                               ; preds = %while.body
+  %inc.lcssa = phi i32 [ %inc, %while.body ]
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %i.0.lcssa = phi i32 [ 0, %entry ], [ %inc.lcssa, %while.end.loopexit ]
+  ret i32 %i.0.lcssa
+}
+
+; Recognize CTLZ builtin pattern.
+; Here it will replace the loop -
+; assume builtin is always profitable.
+;
+; int ctlz_add(int n, int i0)
+; {
+;   n = n >= 0 ? n : -n;
+;   int i = i0;
+;   while(n >>= 1) {
+;     i++;
+;   }
+;   return i;
+; }
+;
+;
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz_add(i32 %n, i32 %i0) {
+; CHECK-LABEL: define i32 @ctlz_add(
+; CHECK-SAME: i32 [[N:%.*]], i32 [[I0:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ABS_N:%.*]] = call i32 @llvm.abs.i32(i32 [[N]], i1 true)
+; CHECK-NEXT:    [[TMP0:%.*]] = ashr i32 [[ABS_N]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP0]], i1 false)
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 32, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP2]], [[I0]]
+; CHECK-NEXT:    br label [[WHILE_COND:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[TCPHI:%.*]] = phi i32 [ [[TMP3]], [[ENTRY:%.*]] ], [ [[TCDEC:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[N_ADDR_0:%.*]] = phi i32 [ [[ABS_N]], [[ENTRY]] ], [ [[SHR:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[I_0:%.*]] = phi i32 [ [[I0]], [[ENTRY]] ], [ [[INC:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[SHR]] = ashr i32 [[N_ADDR_0]], 1
+; CHECK-NEXT:    [[TCDEC]] = sub nsw i32 [[TCPHI]], 1
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[TCDEC]], 0
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[WHILE_END:%.*]], label [[WHILE_COND]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[I_0_LCSSA:%.*]] = phi i32 [ [[TMP4]], [[WHILE_COND]] ]
+; CHECK-NEXT:    ret i32 [[I_0_LCSSA]]
+;
+entry:
+  %abs_n = call i32 @llvm.abs.i32(i32 %n, i1 true)
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %n.addr.0 = phi i32 [ %abs_n, %entry ], [ %shr, %while.cond ]
+  %i.0 = phi i32 [ %i0, %entry ], [ %inc, %while.cond ]
+  %shr = ashr i32 %n.addr.0, 1
+  %tobool = icmp eq i32 %shr, 0
+  %inc = add nsw i32 %i.0, 1
+  br i1 %tobool, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret i32 %i.0
+}
+
+
+; Recognize the CTLZ builtin pattern when the counter is decremented.
+; The loop is kept but is driven by a new down-counter, and the exit value
+; of i is computed up front; the builtin is assumed to be always profitable.
+;
+; int ctlz_sub(int n, int i0)
+; {
+;   n = n >= 0 ? n : -n;
+;   int i = i0;
+;   while(n >>= 1) {
+;     i--;
+;   }
+;   return i;
+; }
+;
+; Expected exit value: i0 - (32 - ctlz(|n| >> 1)).
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz_sub(i32 %n, i32 %i0) {
+; CHECK-LABEL: define i32 @ctlz_sub(
+; CHECK-SAME: i32 [[N:%.*]], i32 [[I0:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ABS_N:%.*]] = call i32 @llvm.abs.i32(i32 [[N]], i1 true)
+; CHECK-NEXT:    [[TMP0:%.*]] = ashr i32 [[ABS_N]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP0]], i1 false)
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 32, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 [[I0]], [[TMP2]]
+; CHECK-NEXT:    br label [[WHILE_COND:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[TCPHI:%.*]] = phi i32 [ [[TMP3]], [[ENTRY:%.*]] ], [ [[TCDEC:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[N_ADDR_0:%.*]] = phi i32 [ [[ABS_N]], [[ENTRY]] ], [ [[SHR:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[I_0:%.*]] = phi i32 [ [[I0]], [[ENTRY]] ], [ [[INC:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[SHR]] = ashr i32 [[N_ADDR_0]], 1
+; CHECK-NEXT:    [[TCDEC]] = sub nsw i32 [[TCPHI]], 1
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[TCDEC]], 0
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_0]], -1
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[WHILE_END:%.*]], label [[WHILE_COND]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[I_0_LCSSA:%.*]] = phi i32 [ [[TMP4]], [[WHILE_COND]] ]
+; CHECK-NEXT:    ret i32 [[I_0_LCSSA]]
+;
+entry:
+  %abs_n = call i32 @llvm.abs.i32(i32 %n, i1 true)  ; n = |n|
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %n.addr.0 = phi i32 [ %abs_n, %entry ], [ %shr, %while.cond ]
+  %i.0 = phi i32 [ %i0, %entry ], [ %inc, %while.cond ]
+  %shr = ashr i32 %n.addr.0, 1  ; n >>= 1
+  %tobool = icmp eq i32 %shr, 0  ; leave once the shifted value is zero
+  %inc = add nsw i32 %i.0, -1  ; i--
+  br i1 %tobool, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret i32 %i.0  ; the result is the counter phi, not %inc
+}
+
+
+; Recognize the CTLZ builtin pattern when the shifted value is widened from
+; i16 to i32 before the loop. The loop is kept but is driven by a new
+; down-counter; the builtin is assumed to be always profitable.
+;
+; int ctlz_sext(short in)
+; {
+;   int n = in;
+;   if (in < 0)
+;     n = -n;
+;   int i = 0;
+;   while(n >>= 1) {
+;     i++;
+;   }
+;   return i;
+; }
+; Expected exit value: 32 - ctlz(n >> 1), with n the widened absolute value.
+
+; Function Attrs: norecurse nounwind readnone uwtable
+define i32 @ctlz_sext(i16 %in) {
+; CHECK-LABEL: define i32 @ctlz_sext(
+; CHECK-SAME: i16 [[IN:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ABS:%.*]] = call i16 @llvm.abs.i16(i16 [[IN]], i1 false)
+; CHECK-NEXT:    [[ABS_N:%.*]] = zext i16 [[ABS]] to i32
+; CHECK-NEXT:    [[TMP0:%.*]] = ashr i32 [[ABS_N]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP0]], i1 false)
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 32, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
+; CHECK-NEXT:    br label [[WHILE_COND:%.*]]
+; CHECK:       while.cond:
+; CHECK-NEXT:    [[TCPHI:%.*]] = phi i32 [ [[TMP3]], [[ENTRY:%.*]] ], [ [[TCDEC:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[N_ADDR_0:%.*]] = phi i32 [ [[ABS_N]], [[ENTRY]] ], [ [[SHR:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[WHILE_COND]] ]
+; CHECK-NEXT:    [[SHR]] = ashr i32 [[N_ADDR_0]], 1
+; CHECK-NEXT:    [[TCDEC]] = sub nsw i32 [[TCPHI]], 1
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[TCDEC]], 0
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_0]], 1
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[WHILE_END:%.*]], label [[WHILE_COND]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[I_0_LCSSA:%.*]] = phi i32 [ [[TMP2]], [[WHILE_COND]] ]
+; CHECK-NEXT:    ret i32 [[I_0_LCSSA]]
+;
+entry:
+  %abs = call i16 @llvm.abs.i16(i16 %in, i1 false)  ; absolute value, still i16
+  %abs_n = zext i16 %abs to i32  ; widened with zext, despite "sext" in the test name
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %n.addr.0 = phi i32 [ %abs_n, %entry ], [ %shr, %while.cond ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %while.cond ]
+  %shr = ashr i32 %n.addr.0, 1  ; n >>= 1
+  %tobool = icmp eq i32 %shr, 0  ; leave once the shifted value is zero
+  %inc = add nsw i32 %i.0, 1  ; i++
+  br i1 %tobool, label %while.end, label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret i32 %i.0  ; the result is the counter phi, not %inc
+}
+
+; 'Shift until less-than' form: the exit test is an icmp ult on the pre-shift value.
+; unsigned floor_log2(unsigned long n) {
+;   unsigned result = 0;
+;   while (n >>= 1) result++;
+;   return result;
+; }
+; Expected exit value of %inc: trunc((64 - ctlz(n)) - 1); the loop is kept with a new down-counter.
+define i32 @floor_log2_use_inc(i64 noundef %n) {
+; CHECK-LABEL: define i32 @floor_log2_use_inc(
+; CHECK-SAME: i64 noundef [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TOBOOL_NOT2:%.*]] = icmp ult i64 [[N]], 2
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT2]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK:       while.body.preheader:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[N]], i1 true)
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 64, [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP4]] to i32
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[TCPHI:%.*]] = phi i64 [ [[TMP4]], [[WHILE_BODY_PREHEADER]] ], [ [[TCDEC:%.*]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    [[RESULT_04:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[N_ADDR_03:%.*]] = phi i64 [ [[SHR:%.*]], [[WHILE_BODY]] ], [ [[N]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[SHR]] = lshr i64 [[N_ADDR_03]], 1
+; CHECK-NEXT:    [[INC]] = add i32 [[RESULT_04]], 1
+; CHECK-NEXT:    [[TCDEC]] = sub nsw i64 [[TCPHI]], 1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i64 [[TCDEC]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.end.loopexit:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[TMP3]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    br label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
+;
+entry:
+  %tobool.not2 = icmp ult i64 %n, 2  ; n < 2: the loop is skipped and 0 is returned
+  br i1 %tobool.not2, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %result.04 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ]
+  %n.addr.03 = phi i64 [ %shr, %while.body ], [ %n, %while.body.preheader ]
+  %shr = lshr i64 %n.addr.03, 1  ; n >>= 1
+  %inc = add i32 %result.04, 1  ; result++
+  %tobool.not = icmp ult i64 %n.addr.03, 4  ; exit test on the pre-shift value (n < 4)
+  br i1 %tobool.not, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  %inc.lcssa = phi i32 [ %inc, %while.body ]  ; exit value is the incremented counter
+  br label %while.end
+
+while.end:
+  %result.0.lcssa = phi i32 [ 0, %entry ], [ %inc.lcssa, %while.end.loopexit ]
+  ret i32 %result.0.lcssa
+}
+
+; Same loop as floor_log2_use_inc, but the exit value is the counter phi; the assertions expect the loop to stay unchanged.
+define i32 @floor_log2_use_phi(i64 noundef %n) {
+; CHECK-LABEL: define i32 @floor_log2_use_phi(
+; CHECK-SAME: i64 noundef [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TOBOOL_NOT2:%.*]] = icmp ult i64 [[N]], 2
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT2]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK:       while.body.preheader:
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[RESULT_04:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[N_ADDR_03:%.*]] = phi i64 [ [[SHR:%.*]], [[WHILE_BODY]] ], [ [[N]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[SHR]] = lshr i64 [[N_ADDR_03]], 1
+; CHECK-NEXT:    [[INC]] = add i32 [[RESULT_04]], 1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp ult i64 [[N_ADDR_03]], 4
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.end.loopexit:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[RESULT_04]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    br label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
+;
+entry:
+  %tobool.not2 = icmp ult i64 %n, 2  ; n < 2: the loop is skipped and 0 is returned
+  br i1 %tobool.not2, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %result.04 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ]
+  %n.addr.03 = phi i64 [ %shr, %while.body ], [ %n, %while.body.preheader ]
+  %shr = lshr i64 %n.addr.03, 1  ; n >>= 1
+  %inc = add i32 %result.04, 1  ; result++
+  %tobool.not = icmp ult i64 %n.addr.03, 4  ; exit test on the pre-shift value (n < 4)
+  br i1 %tobool.not, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  %inc.lcssa = phi i32 [ %result.04, %while.body ]  ; exit value is the phi, i.e. the count before this iteration's increment
+  br label %while.end
+
+while.end:
+  %result.0.lcssa = phi i32 [ 0, %entry ], [ %inc.lcssa, %while.end.loopexit ]
+  ret i32 %result.0.lcssa
+}
+
+; 'Shift until less-than' form with a decrementing counter.
+; unsigned floor_log2_dec(unsigned long n) {
+;   unsigned result = 0;
+;   while (n >>= 1) result--;
+;   return result;
+; }
+; Expected exit value of %inc: 0 - trunc((64 - ctlz(n)) - 1); the loop is kept with a new down-counter.
+define i32 @floor_log2_dec(i64 noundef %n) {
+; CHECK-LABEL: define i32 @floor_log2_dec(
+; CHECK-SAME: i64 noundef [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TOBOOL_NOT2:%.*]] = icmp ult i64 [[N]], 2
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT2]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK:       while.body.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[N]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 64, [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = sub i32 0, [[TMP3]]
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[TCPHI:%.*]] = phi i64 [ [[TMP2]], [[WHILE_BODY_PREHEADER]] ], [ [[TCDEC:%.*]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    [[RESULT_04:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[N_ADDR_03:%.*]] = phi i64 [ [[SHR:%.*]], [[WHILE_BODY]] ], [ [[N]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[SHR]] = lshr i64 [[N_ADDR_03]], 1
+; CHECK-NEXT:    [[INC]] = add i32 [[RESULT_04]], -1
+; CHECK-NEXT:    [[TCDEC]] = sub nsw i64 [[TCPHI]], 1
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i64 [[TCDEC]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.end.loopexit:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[TMP4]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    br label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
+;
+entry:
+  %tobool.not2 = icmp ult i64 %n, 2  ; n < 2: the loop is skipped and 0 is returned
+  br i1 %tobool.not2, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %result.04 = phi i32 [ %inc, %while.body ], [ 0, %while.body.preheader ]
+  %n.addr.03 = phi i64 [ %shr, %while.body ], [ %n, %while.body.preheader ]
+  %shr = lshr i64 %n.addr.03, 1  ; n >>= 1
+  %inc = add i32 %result.04, -1  ; result--
+  %tobool.not = icmp ult i64 %n.addr.03, 4  ; exit test on the pre-shift value (n < 4)
+  br i1 %tobool.not, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  %inc.lcssa = phi i32 [ %inc, %while.body ]  ; exit value is the decremented counter
+  br label %while.end
+
+while.end:
+  %result.0.lcssa = phi i32 [ 0, %entry ], [ %inc.lcssa, %while.end.loopexit ]
+  ret i32 %result.0.lcssa
+}
+
+; Recursive log2; the IR below is already in loop form and exits on icmp ult x, 2.
+; unsigned int_log2_rec(unsigned x) {
+;   return x == 0 ? 0 : int_log2_rec(x >> 1) + 1;
+; }
+; Expected: the trip count and the exit value of %add are both 32 - ctlz(x).
+define i32 @int_log2_rec(i32 noundef %x) {
+; CHECK-LABEL: define i32 @int_log2_rec(
+; CHECK-SAME: i32 noundef [[X:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT:    br i1 [[CMP2]], label [[COND_END:%.*]], label [[COND_FALSE_PREHEADER:%.*]]
+; CHECK:       cond.false.preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[X]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 32, [[TMP0]]
+; CHECK-NEXT:    br label [[COND_FALSE:%.*]]
+; CHECK:       cond.false:
+; CHECK-NEXT:    [[TCPHI:%.*]] = phi i32 [ [[TMP1]], [[COND_FALSE_PREHEADER]] ], [ [[TCDEC:%.*]], [[COND_FALSE]] ]
+; CHECK-NEXT:    [[X_TR4:%.*]] = phi i32 [ [[SHR:%.*]], [[COND_FALSE]] ], [ [[X]], [[COND_FALSE_PREHEADER]] ]
+; CHECK-NEXT:    [[ACCUMULATOR_TR3:%.*]] = phi i32 [ [[ADD:%.*]], [[COND_FALSE]] ], [ 0, [[COND_FALSE_PREHEADER]] ]
+; CHECK-NEXT:    [[SHR]] = lshr i32 [[X_TR4]], 1
+; CHECK-NEXT:    [[ADD]] = add i32 [[ACCUMULATOR_TR3]], 1
+; CHECK-NEXT:    [[TCDEC]] = sub nsw i32 [[TCPHI]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TCDEC]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[COND_END_LOOPEXIT:%.*]], label [[COND_FALSE]]
+; CHECK:       cond.end.loopexit:
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[TMP1]], [[COND_FALSE]] ]
+; CHECK-NEXT:    br label [[COND_END]]
+; CHECK:       cond.end:
+; CHECK-NEXT:    [[ACCUMULATOR_TR_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[COND_END_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i32 [[ACCUMULATOR_TR_LCSSA]]
+;
+entry:
+  %cmp2 = icmp eq i32 %x, 0  ; x == 0: the loop is skipped and 0 is returned
+  br i1 %cmp2, label %cond.end, label %cond.false.preheader
+
+cond.false.preheader:                             ; preds = %entry
+  br label %cond.false
+
+cond.false:                                       ; preds = %cond.false.preheader, %cond.false
+  %x.tr4 = phi i32 [ %shr, %cond.false ], [ %x, %cond.false.preheader ]
+  %accumulator.tr3 = phi i32 [ %add, %cond.false ], [ 0, %cond.false.preheader ]
+  %shr = lshr i32 %x.tr4, 1  ; x >> 1
+  %add = add i32 %accumulator.tr3, 1  ; one more level counted
+  %cmp = icmp ult i32 %x.tr4, 2  ; exit test on the pre-shift value (x < 2)
+  br i1 %cmp, label %cond.end.loopexit, label %cond.false
+
+cond.end.loopexit:                                ; preds = %cond.false
+  %add.lcssa = phi i32 [ %add, %cond.false ]  ; exit value is the incremented accumulator
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.end.loopexit, %entry
+  %accumulator.tr.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %cond.end.loopexit ]
+  ret i32 %accumulator.tr.lcssa
+}
+
+
+; This loop is expected to come out unchanged: it returns 1 for an input of
+; both 0 and 1, so it cannot easily be rewritten in terms of ctlz.
+; int ctlz_do_while_use_inc(unsigned n)
+; {
+;   int i = 0;
+;   do {
+;     i++;
+;     n >>= 1;
+;   } while(n != 0);
+;   return i;
+; }
+
+define i32 @ctlz_do_while_use_inc(i32 noundef %n) {
+; CHECK-LABEL: define i32 @ctlz_do_while_use_inc(
+; CHECK-SAME: i32 noundef [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[DO_BODY:%.*]]
+; CHECK:       do.body:
+; CHECK-NEXT:    [[N_ADDR_0:%.*]] = phi i32 [ [[N]], [[ENTRY:%.*]] ], [ [[SHR:%.*]], [[DO_BODY]] ]
+; CHECK-NEXT:    [[I_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[DO_BODY]] ]
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_0]], 1
+; CHECK-NEXT:    [[SHR]] = lshr i32 [[N_ADDR_0]], 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp ult i32 [[N_ADDR_0]], 2
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[DO_END:%.*]], label [[DO_BODY]]
+; CHECK:       do.end:
+; CHECK-NEXT:    [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[DO_BODY]] ]
+; CHECK-NEXT:    ret i32 [[INC_LCSSA]]
+;
+entry:
+  br label %do.body  ; no guard: the body always runs at least once
+
+do.body:                                          ; preds = %do.body, %entry
+  %n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %do.body ]
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %do.body ]
+  %inc = add nuw nsw i32 %i.0, 1  ; i++
+  %shr = lshr i32 %n.addr.0, 1  ; n >>= 1
+  %cmp.not = icmp ult i32 %n.addr.0, 2  ; exit when the pre-shift value is 0 or 1
+  br i1 %cmp.not, label %do.end, label %do.body
+
+do.end:                                           ; preds = %do.body
+  %inc.lcssa = phi i32 [ %inc, %do.body ]  ; exit value is the incremented counter
+  ret i32 %inc.lcssa
+}
+
+
+; Recognize the CTLZ builtin pattern in a do-while loop whose result is the
+; counter phi (the value before the increment): the exit value becomes
+; 32 - ctlz(n >> 1). The builtin is assumed to be always profitable.
+;
+; int ctlz_do_while_use_phi(unsigned n)
+; {
+;   int phi;
+;   int inc = 0;
+;   do {
+;     phi = inc;
+;     inc++;
+;     n >>= 1;
+;   } while(n != 0);
+;   return phi;
+; }
+
+define i32 @ctlz_do_while_use_phi(i32 noundef %n) {
+; CHECK-LABEL: define i32 @ctlz_do_while_use_phi(
+; CHECK-SAME: i32 noundef [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = lshr i32 [[N]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[TMP0]], i1 false)
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 32, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP2]], 1
+; CHECK-NEXT:    br label [[DO_BODY:%.*]]
+; CHECK:       do.body:
+; CHECK-NEXT:    [[TCPHI:%.*]] = phi i32 [ [[TMP3]], [[ENTRY:%.*]] ], [ [[TCDEC:%.*]], [[DO_BODY]] ]
+; CHECK-NEXT:    [[N_ADDR_0:%.*]] = phi i32 [ [[N]], [[ENTRY]] ], [ [[SHR:%.*]], [[DO_BODY]] ]
+; CHECK-NEXT:    [[INC_0:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC1:%.*]], [[DO_BODY]] ]
+; CHECK-NEXT:    [[INC1]] = add nuw nsw i32 [[INC_0]], 1
+; CHECK-NEXT:    [[SHR]] = lshr i32 [[N_ADDR_0]], 1
+; CHECK-NEXT:    [[TCDEC]] = sub nsw i32 [[TCPHI]], 1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[TCDEC]], 0
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[DO_END:%.*]], label [[DO_BODY]]
+; CHECK:       do.end:
+; CHECK-NEXT:    [[INC_0_LCSSA:%.*]] = phi i32 [ [[TMP2]], [[DO_BODY]] ]
+; CHECK-NEXT:    ret i32 [[INC_0_LCSSA]]
+;
+entry:
+  br label %do.body  ; no guard: the body always runs at least once
+
+do.body:                                          ; preds = %do.body, %entry
+  %n.addr.0 = phi i32 [ %n, %entry ], [ %shr, %do.body ]
+  %inc.0 = phi i32 [ 0, %entry ], [ %inc1, %do.body ]
+  %inc1 = add nuw nsw i32 %inc.0, 1  ; inc++
+  %shr = lshr i32 %n.addr.0, 1  ; n >>= 1
+  %cmp.not = icmp ult i32 %n.addr.0, 2  ; exit when the pre-shift value is 0 or 1
+  br i1 %cmp.not, label %do.end, label %do.body
+
+do.end:                                           ; preds = %do.body
+  ret i32 %inc.0  ; the result is the counter phi, not %inc1
+}
+
+
+declare i32 @llvm.abs.i32(i32, i1)
+declare i16 @llvm.abs.i16(i16, i1)


        


More information about the llvm-commits mailing list