[llvm] 5d10753 - [SimpleLoopUnswitch] Inject loop-invariant conditions and unswitch them when it's profitable

Max Kazantsev via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 9 21:49:00 PST 2023


Author: Max Kazantsev
Date: 2023-02-10T12:48:38+07:00
New Revision: 5d10753314ed58e1f55d41118c8f082c5fc7b2d7

URL: https://github.com/llvm/llvm-project/commit/5d10753314ed58e1f55d41118c8f082c5fc7b2d7
DIFF: https://github.com/llvm/llvm-project/commit/5d10753314ed58e1f55d41118c8f082c5fc7b2d7.diff

LOG: [SimpleLoopUnswitch] Inject loop-invariant conditions and unswitch them when it's profitable

Based on https://discourse.llvm.org/t/rfc-inject-invariant-conditions-to-loops-to-enable-unswitching-and-constraint-elimination

This transform attempts to handle the following loop:
```
for (...) {
  x = <some variant>
  if (x <u C1) {} else break;
  if (x <u C2) {} else break;
}
```
Here `x` is some loop-variant value, and `C1` and `C2` are loop invariants.
As we see, this loop has no invariant checks we can unswitch on. However, there is an
invariant condition that can make the second check redundant. Specifically, it is `C1 <=u C2`.
We can modify this code in the following way:
```
for (...) {
  x = <some variant>
  if (x <u C1) {} else break;
  if (C1 <=u C2) {
  /* no check is required */
  }
  else {
    // do the check normally
    if (x <u C2) {} else break;
  }
}
```
Now we have an invariant condition `C1 <=u C2` and can unswitch on it.

This patch introduces the basic version of this transform, with some limitations,
all of them seem liftable (but needs more work & testing):
- All checks are `ult` condition;
- All branches in question stay in loop if the said condition is true and leave it otherwise;
- All in-loop branches are hot enough;

There is also a room for improvement cost model. So far we evalutate the cost of
unswitching this newly injected invariant branch the same as if we would unswitch
on 2nd condition, which is not exactly precise (but also not grossly wrong).

Differential Revision: https://reviews.llvm.org/D136233
Reviewed By: skatkov

Added: 
    llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll

Modified: 
    llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 7e08120f923d5..2fda1ac0bee74 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -42,6 +42,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
 #include "llvm/IR/Use.h"
 #include "llvm/IR/Value.h"
 #include "llvm/InitializePasses.h"
@@ -78,6 +79,8 @@ STATISTIC(NumTrivial, "Number of unswitches that are trivial");
 STATISTIC(
     NumCostMultiplierSkipped,
     "Number of unswitch candidates that had their cost multiplier skipped");
+STATISTIC(NumInvariantConditionsInjected,
+          "Number of invariant conditions injected and unswitched");
 
 static cl::opt<bool> EnableNonTrivialUnswitch(
     "enable-nontrivial-unswitch", cl::init(false), cl::Hidden,
@@ -118,15 +121,53 @@ static cl::opt<bool> FreezeLoopUnswitchCond(
     cl::desc("If enabled, the freeze instruction will be added to condition "
              "of loop unswitch to prevent miscompilation."));
 
+static cl::opt<bool> InjectInvariantConditions(
+    "simple-loop-unswitch-inject-invariant-conditions", cl::Hidden,
+    cl::desc("Whether we should inject new invariants and unswitch them to "
+             "eliminate some existing (non-invariant) conditions."),
+    cl::init(true));
+
+static cl::opt<unsigned> InjectInvariantConditionHotnesThreshold(
+    "simple-loop-unswitch-inject-invariant-condition-hotness-threshold",
+    cl::Hidden, cl::desc("Only try to inject loop invariant conditions and "
+                         "unswitch on them to eliminate branches that are "
+                         "not-taken 1/<this option> times or less."),
+    cl::init(16));
+
 namespace {
+struct CompareDesc {
+  BranchInst *Term;
+  Value *Invariant;
+  BasicBlock *InLoopSucc;
+
+  CompareDesc(BranchInst *Term, Value *Invariant, BasicBlock *InLoopSucc)
+      : Term(Term), Invariant(Invariant), InLoopSucc(InLoopSucc) {}
+};
+
+struct InjectedInvariant {
+  ICmpInst::Predicate Pred;
+  Value *LHS;
+  Value *RHS;
+  BasicBlock *InLoopSucc;
+
+  InjectedInvariant(ICmpInst::Predicate Pred, Value *LHS, Value *RHS,
+                    BasicBlock *InLoopSucc)
+      : Pred(Pred), LHS(LHS), RHS(RHS), InLoopSucc(InLoopSucc) {}
+};
+
 struct NonTrivialUnswitchCandidate {
   Instruction *TI = nullptr;
   TinyPtrVector<Value *> Invariants;
   std::optional<InstructionCost> Cost;
+  std::optional<InjectedInvariant> PendingInjection;
   NonTrivialUnswitchCandidate(
       Instruction *TI, ArrayRef<Value *> Invariants,
-      std::optional<InstructionCost> Cost = std::nullopt)
-      : TI(TI), Invariants(Invariants), Cost(Cost){};
+      std::optional<InstructionCost> Cost = std::nullopt,
+      std::optional<InjectedInvariant> PendingInjection = std::nullopt)
+      : TI(TI), Invariants(Invariants), Cost(Cost),
+        PendingInjection(PendingInjection) {};
+
+  bool hasPendingInjection() const { return PendingInjection.has_value(); }
 };
 } // end anonymous namespace.
 
@@ -2844,6 +2885,252 @@ static bool collectUnswitchCandidates(
   return !UnswitchCandidates.empty();
 }
 
+/// Returns true, if predicate described by ( \p Pred, \p LHS, \p RHS )
+/// succeeding into blocks ( \p IfTrue, \p IfFalse) can be optimized by
+/// injecting a loop-invariant condition.
+static bool shouldTryInjectInvariantCondition(
+    const ICmpInst::Predicate Pred, const Value *LHS, const Value *RHS,
+    const BasicBlock *IfTrue, const BasicBlock *IfFalse, const Loop &L) {
+  if (L.isLoopInvariant(LHS) || !L.isLoopInvariant(RHS))
+    return false;
+  // TODO: Support other predicates.
+  if (Pred != ICmpInst::ICMP_ULT)
+    return false;
+  // TODO: Support non-loop-exiting branches?
+  if (!L.contains(IfTrue) || L.contains(IfFalse))
+    return false;
+  // FIXME: For some reason this causes problems with MSSA updates, need to
+  // investigate why. So far, just don't unswitch latch.
+  if (L.getHeader() == IfTrue)
+    return false;
+  return true;
+}
+
+/// Returns true, if metadata on \p BI allows us to optimize branching into \p
+/// TakenSucc via injection of invariant conditions. The branch should be not
+/// enough and not previously unswitched, the information about this comes from
+/// the metadata.
+bool shouldTryInjectBasingOnMetadata(const BranchInst *BI,
+                                     const BasicBlock *TakenSucc) {
+  // Skip branches that have already been unswithed this way. After successful
+  // unswitching of injected condition, we will still have a copy of this loop
+  // which looks exactly the same as original one. To prevent the 2nd attempt
+  // of unswitching it in the same pass, mark this branch as "nothing to do
+  // here".
+  if (BI->hasMetadata("llvm.invariant.condition.injection.disabled"))
+    return false;
+  SmallVector<uint32_t> Weights;
+  if (!extractBranchWeights(*BI, Weights))
+    return false;
+  unsigned T = InjectInvariantConditionHotnesThreshold;
+  BranchProbability LikelyTaken(T - 1, T);
+
+  assert(Weights.size() == 2 && "Unexpected profile data!");
+  size_t Idx = BI->getSuccessor(0) == TakenSucc ? 0 : 1;
+  auto Num = Weights[Idx];
+  auto Denom = Weights[0] + Weights[1];
+  // Degenerate metadata.
+  if (Denom == 0)
+    return false;
+  BranchProbability ActualTaken(Num, Denom);
+  if (LikelyTaken > ActualTaken)
+    return false;
+  return true;
+}
+
+/// Materialize pending invariant condition of the given candidate into IR. The
+/// injected loop-invariant condition implies the original loop-variant branch
+/// condition, so the materialization turns
+///
+/// loop_block:
+///   ...
+///   br i1 %variant_cond, label InLoopSucc, label OutOfLoopSucc
+///
+/// into
+///
+/// preheader:
+///   %invariant_cond = LHS pred RHS
+/// ...
+/// loop_block:
+///   br i1 %invariant_cond, label InLoopSucc, label OriginalCheck
+/// OriginalCheck:
+///   br i1 %variant_cond, label InLoopSucc, label OutOfLoopSucc
+/// ...
+static NonTrivialUnswitchCandidate
+injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L,
+                                 DominatorTree &DT, LoopInfo &LI,
+                                 AssumptionCache &AC, MemorySSAUpdater *MSSAU) {
+  assert(Candidate.hasPendingInjection() && "Nothing to inject!");
+  BasicBlock *Preheader = L.getLoopPreheader();
+  assert(Preheader && "Loop is not in simplified form?");
+
+  auto Pred = Candidate.PendingInjection->Pred;
+  auto *LHS = Candidate.PendingInjection->LHS;
+  auto *RHS = Candidate.PendingInjection->RHS;
+  auto *InLoopSucc = Candidate.PendingInjection->InLoopSucc;
+  auto *TI = cast<BranchInst>(Candidate.TI);
+  auto *BB = Candidate.TI->getParent();
+  assert(InLoopSucc == TI->getSuccessor(0));
+  auto *OutOfLoopSucc = TI->getSuccessor(1);
+  // FIXME: Remove this once limitation on successors is lifted.
+  assert(L.contains(InLoopSucc) && "Not supported yet!");
+  assert(!L.contains(OutOfLoopSucc) && "Not supported yet!");
+  auto &Ctx = BB->getContext();
+
+  assert(LHS->getType() == RHS->getType() && "Type mismatch!");
+  // Do not use builder here: CreateICmp may simplify this intro a constant and
+  // unswitching will break. Better optimize it away later.
+  auto *InjectedCond =
+      ICmpInst::Create(Instruction::ICmp, Pred, LHS, RHS, "injected.cond",
+                       Preheader->getTerminator());
+  auto *OldCond = TI->getCondition();
+
+  BasicBlock *CheckBlock = BasicBlock::Create(Ctx, BB->getName() + ".check",
+                                              BB->getParent(), InLoopSucc);
+  IRBuilder<> Builder(TI);
+  auto *InvariantBr =
+      Builder.CreateCondBr(InjectedCond, InLoopSucc, CheckBlock);
+
+  Builder.SetInsertPoint(CheckBlock);
+  auto *NewTerm = Builder.CreateCondBr(OldCond, InLoopSucc, OutOfLoopSucc);
+
+  TI->eraseFromParent();
+  // Prevent infinite unswitching.
+  NewTerm->setMetadata("llvm.invariant.condition.injection.disabled",
+                       MDNode::get(BB->getContext(), {}));
+
+  // Fixup phis.
+  for (auto &I : *InLoopSucc) {
+    auto *PN = dyn_cast<PHINode>(&I);
+    if (!PN)
+      break;
+    auto *Inc = PN->getIncomingValueForBlock(BB);
+    PN->addIncoming(Inc, CheckBlock);
+  }
+  OutOfLoopSucc->replacePhiUsesWith(BB, CheckBlock);
+
+  SmallVector<DominatorTree::UpdateType, 4> DTUpdates = {
+    { DominatorTree::Insert, BB, CheckBlock },
+    { DominatorTree::Insert, CheckBlock, InLoopSucc },
+    { DominatorTree::Insert, CheckBlock, OutOfLoopSucc },
+    { DominatorTree::Delete, BB, OutOfLoopSucc }
+  };
+
+  DT.applyUpdates(DTUpdates);
+  if (MSSAU)
+    MSSAU->applyUpdates(DTUpdates, DT);
+  L.addBasicBlockToLoop(CheckBlock, LI);
+
+#ifdef EXPENSIVE_CHECKS
+  DT.verify();
+  LI.verify(DT);
+  if (MSSAU && VerifyMemorySSA)
+    MSSAU->getMemorySSA()->verifyMemorySSA();
+#endif
+
+  // TODO: In fact, cost of unswitching a new invariant candidate is *slightly*
+  // higher because we have just inserted a new block. Need to think how to
+  // adjust the cost of injected candidates when it was first computed.
+  LLVM_DEBUG(dbgs() << "Injected a new loop-invariant branch " << *InvariantBr
+                    << " and considering it for unswitching.");
+  ++NumInvariantConditionsInjected;
+  return NonTrivialUnswitchCandidate(InvariantBr, { InjectedCond },
+                                     Candidate.Cost);
+}
+
+/// Given chain of loop branch conditions looking like:
+///   br (Variant < Invariant1)
+///   br (Variant < Invariant2)
+///   br (Variant < Invariant3)
+///   ...
+/// collect set of invariant conditions on which we want to unswitch, which
+/// look like:
+///   Invariant1 <= Invariant2
+///   Invariant2 <= Invariant3
+///   ...
+/// Though they might not immediately exist in the IR, we can still inject them.
+static bool insertCandidatesWithPendingInjections(
+    SmallVectorImpl<NonTrivialUnswitchCandidate> &UnswitchCandidates, Loop &L,
+    ICmpInst::Predicate Pred, ArrayRef<CompareDesc> Compares,
+    const DominatorTree &DT) {
+
+  assert(ICmpInst::isRelational(Pred));
+  assert(ICmpInst::isStrictPredicate(Pred));
+  if (Compares.size() < 2)
+    return false;
+  ICmpInst::Predicate NonStrictPred = ICmpInst::getNonStrictPredicate(Pred);
+  for (auto Prev = Compares.begin(), Next = Compares.begin() + 1;
+       Next != Compares.end(); ++Prev, ++Next) {
+    Value *LHS = Next->Invariant;
+    Value *RHS = Prev->Invariant;
+    BasicBlock *InLoopSucc = Prev->InLoopSucc;
+    InjectedInvariant ToInject(NonStrictPred, LHS, RHS, InLoopSucc);
+    NonTrivialUnswitchCandidate Candidate(Prev->Term, { LHS, RHS },
+                                          std::nullopt, std::move(ToInject));
+    UnswitchCandidates.push_back(std::move(Candidate));
+  }
+  return true;
+}
+
+/// Collect unswitch candidates by invariant conditions that are not immediately
+/// present in the loop. However, they can be injected into the code if we
+/// decide it's profitable.
+/// An example of such conditions is following:
+///
+///   for (...) {
+///     x = load ...
+///     if (! x <u C1) break;
+///     if (! x <u C2) break;
+///     <do something>
+///   }
+///
+/// We can unswitch by condition "C1 <=u C2". If that is true, then "x <u C1 <=
+/// C2" automatically implies "x <u C2", so we can get rid of one of
+/// loop-variant checks in unswitched loop version.
+static bool collectUnswitchCandidatesWithInjections(
+    SmallVectorImpl<NonTrivialUnswitchCandidate> &UnswitchCandidates,
+    IVConditionInfo &PartialIVInfo, Instruction *&PartialIVCondBranch, Loop &L,
+    const DominatorTree &DT, const LoopInfo &LI, AAResults &AA,
+    const MemorySSAUpdater *MSSAU) {
+  if (!InjectInvariantConditions)
+    return false;
+
+  if (!DT.isReachableFromEntry(L.getHeader()))
+    return false;
+  auto *Latch = L.getLoopLatch();
+  // Need to have a single latch and a preheader.
+  if (!Latch)
+    return false;
+  assert(L.getLoopPreheader() && "Must have a preheader!");
+
+  DenseMap<Value *, SmallVector<CompareDesc, 4> > CandidatesULT;
+  // Traverse the conditions that dominate latch (and therefore dominate each
+  // other).
+  for (auto *DTN = DT.getNode(Latch); L.contains(DTN->getBlock());
+       DTN = DTN->getIDom()) {
+    ICmpInst::Predicate Pred;
+    Value *LHS = nullptr, *RHS = nullptr;
+    BasicBlock *IfTrue = nullptr, *IfFalse = nullptr;
+    auto *BB = DTN->getBlock();
+    auto *Term = BB->getTerminator();
+    if (!match(Term, m_Br(m_ICmp(Pred, m_Value(LHS), m_Value(RHS)),
+                          m_BasicBlock(IfTrue), m_BasicBlock(IfFalse))))
+      continue;
+    if (!shouldTryInjectInvariantCondition(Pred, LHS, RHS, IfTrue, IfFalse, L))
+      continue;
+    if (!shouldTryInjectBasingOnMetadata(cast<BranchInst>(Term), IfTrue))
+      continue;
+    CompareDesc Desc(cast<BranchInst>(Term), RHS, IfTrue);
+    CandidatesULT[LHS].push_back(Desc);
+  }
+
+  bool Found = false;
+  for (auto &It : CandidatesULT)
+    Found |= insertCandidatesWithPendingInjections(
+        UnswitchCandidates, L, ICmpInst::ICMP_ULT, It.second, DT);
+  return Found;
+}
+
 static bool isSafeForNoNTrivialUnswitching(Loop &L, LoopInfo &LI) {
   if (!L.isSafeToClone())
     return false;
@@ -3003,10 +3290,11 @@ static NonTrivialUnswitchCandidate findBestNonTrivialUnswitchCandidate(
     Instruction &TI = *Candidate.TI;
     ArrayRef<Value *> Invariants = Candidate.Invariants;
     BranchInst *BI = dyn_cast<BranchInst>(&TI);
-    InstructionCost CandidateCost = ComputeUnswitchedCost(
-        TI, /*FullUnswitch*/ !BI ||
-                (Invariants.size() == 1 &&
-                 Invariants[0] == skipTrivialSelect(BI->getCondition())));
+    bool FullUnswitch =
+        !BI || Candidate.hasPendingInjection() ||
+        (Invariants.size() == 1 &&
+         Invariants[0] == skipTrivialSelect(BI->getCondition()));
+    InstructionCost CandidateCost = ComputeUnswitchedCost(TI, FullUnswitch);
     // Calculate cost multiplier which is a tool to limit potentially
     // exponential behavior of loop-unswitch.
     if (EnableUnswitchCostMultiplier) {
@@ -3044,9 +3332,13 @@ static bool unswitchBestCondition(
   SmallVector<NonTrivialUnswitchCandidate, 4> UnswitchCandidates;
   IVConditionInfo PartialIVInfo;
   Instruction *PartialIVCondBranch = nullptr;
+  collectUnswitchCandidates(UnswitchCandidates, PartialIVInfo,
+                            PartialIVCondBranch, L, LI, AA, MSSAU);
+  collectUnswitchCandidatesWithInjections(UnswitchCandidates, PartialIVInfo,
+                                          PartialIVCondBranch, L, DT, LI, AA,
+                                          MSSAU);
   // If we didn't find any candidates, we're done.
-  if (!collectUnswitchCandidates(UnswitchCandidates, PartialIVInfo,
-                                 PartialIVCondBranch, L, LI, AA, MSSAU))
+  if (UnswitchCandidates.empty())
     return false;
 
   LLVM_DEBUG(
@@ -3065,6 +3357,11 @@ static bool unswitchBestCondition(
     return false;
   }
 
+  if (Best.hasPendingInjection())
+    Best = injectPendingInvariantConditions(Best, L, DT, LI, AC, MSSAU);
+  assert(!Best.hasPendingInjection() &&
+         "All injections should have been done by now!");
+
   if (Best.TI != PartialIVCondBranch)
     PartialIVInfo.InstToDuplicate.clear();
 

diff  --git a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
new file mode 100644
index 0000000000000..87c43083f9239
--- /dev/null
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll
@@ -0,0 +1,481 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true -passes="loop(simple-loop-unswitch<nontrivial>),simplifycfg" | FileCheck %s
+; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true -passes="loop-mssa(simple-loop-unswitch<nontrivial>),simplifycfg" -verify-memoryssa | FileCheck %s
+
+define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %len_p) {
+; CHECK-LABEL: @test_01(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LEN:%.*]] = load i32, ptr [[LEN_P:%.*]], align 4, !noundef !0
+; CHECK-NEXT:    [[INJECTED_COND:%.*]] = icmp ule i32 [[LIMIT:%.*]], [[LEN]]
+; CHECK-NEXT:    br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]]
+; CHECK:       loop.us:
+; CHECK-NEXT:    [[IV_US:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]]
+; CHECK-NEXT:    [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4
+; CHECK-NEXT:    [[BOUND_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[LIMIT]]
+; CHECK-NEXT:    br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK:       guarded.us:
+; CHECK-NEXT:    [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[LEN]]
+; CHECK-NEXT:    [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]]
+; CHECK-NEXT:    store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4
+; CHECK-NEXT:    [[IV_NEXT_US]] = add i32 [[IV_US]], 1
+; CHECK-NEXT:    [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]]
+; CHECK-NEXT:    [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4
+; CHECK-NEXT:    [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT]]
+; CHECK-NEXT:    br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF1]]
+; CHECK:       guarded:
+; CHECK-NEXT:    [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[LEN]]
+; CHECK-NEXT:    br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !llvm.invariant.condition.injection.disabled !0
+; CHECK:       backedge:
+; CHECK-NEXT:    [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]]
+; CHECK-NEXT:    store i32 [[IV]], ptr [[ARR_PTR]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]]
+; CHECK:       common.ret:
+; CHECK-NEXT:    [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ]
+; CHECK-NEXT:    ret i32 [[COMMON_RET_OP]]
+;
+entry:
+  %len = load i32, ptr %len_p, align 4, !noundef !{}
+  br label %loop
+
+loop:                                             ; preds = %backedge, %entry
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %el.ptr = getelementptr i32, ptr %p, i32 %iv
+  %el = load i32, ptr %el.ptr, align 4
+  %bound_check = icmp ult i32 %el, %limit
+  br i1 %bound_check, label %guarded, label %bound_check_failed, !prof !{!"branch_weights", i32 100, i32 1}
+
+guarded:                                          ; preds = %loop
+  %range_check = icmp ult i32 %el, %len
+  br i1 %range_check, label %backedge, label %range_check_failed, !prof !{!"branch_weights", i32 100, i32 1}
+
+backedge:                                         ; preds = %guarded
+  %arr.ptr = getelementptr i32, ptr %arr, i32 %el
+  store i32 %iv, ptr %arr.ptr, align 4
+  %iv.next = add i32 %iv, 1
+  %loop_cond = icmp slt i32 %iv.next, %n
+  br i1 %loop_cond, label %loop, label %exit
+
+exit:                                             ; preds = %backedge
+  ret i32 0
+
+bound_check_failed:                               ; preds = %loop
+  ret i32 -1
+
+range_check_failed:                               ; preds = %guarded
+  ret i32 -2
+}
+
+; Should not optimize: profile metadata is missing.
+define i32 @test_01_neg_void_profile(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %len_p) {
+; CHECK-LABEL: @test_01_neg_void_profile(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LEN:%.*]] = load i32, ptr [[LEN_P:%.*]], align 4, !noundef !0
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]]
+; CHECK-NEXT:    [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4
+; CHECK-NEXT:    [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT:%.*]]
+; CHECK-NEXT:    br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]]
+; CHECK:       guarded:
+; CHECK-NEXT:    [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[LEN]]
+; CHECK-NEXT:    br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]]
+; CHECK-NEXT:    store i32 [[IV]], ptr [[ARR_PTR]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]]
+; CHECK:       common.ret:
+; CHECK-NEXT:    [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ -1, [[LOOP]] ], [ -2, [[GUARDED]] ]
+; CHECK-NEXT:    ret i32 [[COMMON_RET_OP]]
+;
+entry:
+  %len = load i32, ptr %len_p, align 4, !noundef !{}
+  br label %loop
+
+loop:                                             ; preds = %backedge, %entry
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %el.ptr = getelementptr i32, ptr %p, i32 %iv
+  %el = load i32, ptr %el.ptr, align 4
+  %bound_check = icmp ult i32 %el, %limit
+  br i1 %bound_check, label %guarded, label %bound_check_failed
+
+guarded:                                          ; preds = %loop
+  %range_check = icmp ult i32 %el, %len
+  br i1 %range_check, label %backedge, label %range_check_failed
+
+backedge:                                         ; preds = %guarded
+  %arr.ptr = getelementptr i32, ptr %arr, i32 %el
+  store i32 %iv, ptr %arr.ptr, align 4
+  %iv.next = add i32 %iv, 1
+  %loop_cond = icmp slt i32 %iv.next, %n
+  br i1 %loop_cond, label %loop, label %exit
+
+exit:                                             ; preds = %backedge
+  ret i32 0
+
+bound_check_failed:                               ; preds = %loop
+  ret i32 -1
+
+range_check_failed:                               ; preds = %guarded
+  ret i32 -2
+}
+
+; Same as test_01, but n and limit are constants.
+define i32 @test_01_constants(ptr noundef %p, ptr noundef %arr, ptr noundef %len_p) {
+; CHECK-LABEL: @test_01_constants(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LEN:%.*]] = load i32, ptr [[LEN_P:%.*]], align 4, !noundef !0
+; CHECK-NEXT:    [[INJECTED_COND:%.*]] = icmp ule i32 200, 300
+; CHECK-NEXT:    br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]]
+; CHECK:       loop.us:
+; CHECK-NEXT:    [[IV_US:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]]
+; CHECK-NEXT:    [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4
+; CHECK-NEXT:    [[BOUND_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], 200
+; CHECK-NEXT:    br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF1]]
+; CHECK:       guarded.us:
+; CHECK-NEXT:    [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], 300
+; CHECK-NEXT:    [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]]
+; CHECK-NEXT:    store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4
+; CHECK-NEXT:    [[IV_NEXT_US]] = add i32 [[IV_US]], 1
+; CHECK-NEXT:    [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], 1000
+; CHECK-NEXT:    br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]]
+; CHECK-NEXT:    [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4
+; CHECK-NEXT:    [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], 200
+; CHECK-NEXT:    br i1 [[BOUND_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF1]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]]
+; CHECK-NEXT:    store i32 [[IV]], ptr [[ARR_PTR]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]]
+; CHECK:       common.ret:
+; CHECK-NEXT:    [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ]
+; CHECK-NEXT:    ret i32 [[COMMON_RET_OP]]
+;
+entry:
+  %len = load i32, ptr %len_p, align 4, !noundef !{}
+  br label %loop
+
+loop:                                             ; preds = %backedge, %entry
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %el.ptr = getelementptr i32, ptr %p, i32 %iv
+  %el = load i32, ptr %el.ptr, align 4
+  %bound_check = icmp ult i32 %el, 200
+  br i1 %bound_check, label %guarded, label %bound_check_failed, !prof !{!"branch_weights", i32 100, i32 1}
+
+guarded:                                          ; preds = %loop
+  %range_check = icmp ult i32 %el, 300
+  br i1 %range_check, label %backedge, label %range_check_failed, !prof !{!"branch_weights", i32 100, i32 1}
+
+backedge:                                         ; preds = %guarded
+  %arr.ptr = getelementptr i32, ptr %arr, i32 %el
+  store i32 %iv, ptr %arr.ptr, align 4
+  %iv.next = add i32 %iv, 1
+  %loop_cond = icmp slt i32 %iv.next, 1000
+  br i1 %loop_cond, label %loop, label %exit
+
+exit:                                             ; preds = %backedge
+  ret i32 0
+
+bound_check_failed:                               ; preds = %loop
+  ret i32 -1
+
+range_check_failed:                               ; preds = %guarded
+  ret i32 -2
+}
+
+define i32 @test_01_neg_degenerate_profile(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %len_p) {
+; CHECK-LABEL: @test_01_neg_degenerate_profile(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LEN:%.*]] = load i32, ptr [[LEN_P:%.*]], align 4, !noundef !0
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]]
+; CHECK-NEXT:    [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4
+; CHECK-NEXT:    [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT:%.*]]
+; CHECK-NEXT:    br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF1]]
+; CHECK:       guarded:
+; CHECK-NEXT:    [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[LEN]]
+; CHECK-NEXT:    br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF2:![0-9]+]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]]
+; CHECK-NEXT:    store i32 [[IV]], ptr [[ARR_PTR]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]]
+; CHECK:       common.ret:
+; CHECK-NEXT:    [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ -1, [[LOOP]] ], [ -2, [[GUARDED]] ]
+; CHECK-NEXT:    ret i32 [[COMMON_RET_OP]]
+;
+entry:
+  %len = load i32, ptr %len_p, align 4, !noundef !{}
+  br label %loop
+
+loop:                                             ; preds = %backedge, %entry
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %el.ptr = getelementptr i32, ptr %p, i32 %iv
+  %el = load i32, ptr %el.ptr, align 4
+  %bound_check = icmp ult i32 %el, %limit
+  br i1 %bound_check, label %guarded, label %bound_check_failed, !prof !{!"branch_weights", i32 100, i32 1}
+
+guarded:                                          ; preds = %loop
+  %range_check = icmp ult i32 %el, %len
+  br i1 %range_check, label %backedge, label %range_check_failed, !prof !{!"branch_weights", i32 0, i32 0}
+
+backedge:                                         ; preds = %guarded
+  %arr.ptr = getelementptr i32, ptr %arr, i32 %el
+  store i32 %iv, ptr %arr.ptr, align 4
+  %iv.next = add i32 %iv, 1
+  %loop_cond = icmp slt i32 %iv.next, %n
+  br i1 %loop_cond, label %loop, label %exit
+
+exit:                                             ; preds = %backedge
+  ret i32 0
+
+bound_check_failed:                               ; preds = %loop
+  ret i32 -1
+
+range_check_failed:                               ; preds = %guarded
+  ret i32 -2
+}
+
+; Should not optimize: cold branch.
+define i32 @test_01_neg_cold(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %len_p) {
+; CHECK-LABEL: @test_01_neg_cold(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LEN:%.*]] = load i32, ptr [[LEN_P:%.*]], align 4, !noundef !0
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]]
+; CHECK-NEXT:    [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4
+; CHECK-NEXT:    [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT:%.*]]
+; CHECK-NEXT:    br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF1]]
+; CHECK:       guarded:
+; CHECK-NEXT:    [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[LEN]]
+; CHECK-NEXT:    br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3:![0-9]+]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]]
+; CHECK-NEXT:    store i32 [[IV]], ptr [[ARR_PTR]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]]
+; CHECK:       common.ret:
+; CHECK-NEXT:    [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ -1, [[LOOP]] ], [ -2, [[GUARDED]] ]
+; CHECK-NEXT:    ret i32 [[COMMON_RET_OP]]
+;
+entry:
+  %len = load i32, ptr %len_p, align 4, !noundef !{}
+  br label %loop
+
+loop:                                             ; preds = %backedge, %entry
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %el.ptr = getelementptr i32, ptr %p, i32 %iv
+  %el = load i32, ptr %el.ptr, align 4
+  %bound_check = icmp ult i32 %el, %limit
+  br i1 %bound_check, label %guarded, label %bound_check_failed, !prof !{!"branch_weights", i32 100, i32 1}
+
+guarded:                                          ; preds = %loop
+  %range_check = icmp ult i32 %el, %len
+  br i1 %range_check, label %backedge, label %range_check_failed, !prof !{!"branch_weights", i32 2, i32 3}
+
+backedge:                                         ; preds = %guarded
+  %arr.ptr = getelementptr i32, ptr %arr, i32 %el
+  store i32 %iv, ptr %arr.ptr, align 4
+  %iv.next = add i32 %iv, 1
+  %loop_cond = icmp slt i32 %iv.next, %n
+  br i1 %loop_cond, label %loop, label %exit
+
+exit:                                             ; preds = %backedge
+  ret i32 0
+
+bound_check_failed:                               ; preds = %loop
+  ret i32 -1
+
+range_check_failed:                               ; preds = %guarded
+  ret i32 -2
+}
+
+define i32 @test_02(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %len_p) {
+; CHECK-LABEL: @test_02(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LEN:%.*]] = load i32, ptr [[LEN_P:%.*]], align 4, !noundef !0
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]]
+; CHECK-NEXT:    [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4
+; CHECK-NEXT:    [[BOUND_CHECK:%.*]] = icmp sge i32 [[EL]], 0
+; CHECK-NEXT:    br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF1]]
+; CHECK:       guarded:
+; CHECK-NEXT:    [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[LEN]]
+; CHECK-NEXT:    br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF1]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]]
+; CHECK-NEXT:    store i32 [[IV]], ptr [[ARR_PTR]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]]
+; CHECK:       common.ret:
+; CHECK-NEXT:    [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ -1, [[LOOP]] ], [ -2, [[GUARDED]] ]
+; CHECK-NEXT:    ret i32 [[COMMON_RET_OP]]
+;
+entry:
+  %len = load i32, ptr %len_p, align 4, !noundef !{}
+  br label %loop
+
+loop:                                             ; preds = %backedge, %entry
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %el.ptr = getelementptr i32, ptr %p, i32 %iv
+  %el = load i32, ptr %el.ptr, align 4
+  %bound_check = icmp sge i32 %el, 0
+  br i1 %bound_check, label %guarded, label %bound_check_failed, !prof !{!"branch_weights", i32 100, i32 1}
+
+guarded:                                          ; preds = %loop
+  %range_check = icmp ult i32 %el, %len
+  br i1 %range_check, label %backedge, label %range_check_failed, !prof !{!"branch_weights", i32 100, i32 1}
+
+backedge:                                         ; preds = %guarded
+  %arr.ptr = getelementptr i32, ptr %arr, i32 %el
+  store i32 %iv, ptr %arr.ptr, align 4
+  %iv.next = add i32 %iv, 1
+  %loop_cond = icmp slt i32 %iv.next, %n
+  br i1 %loop_cond, label %loop, label %exit
+
+exit:                                             ; preds = %backedge
+  ret i32 0
+
+bound_check_failed:                               ; preds = %loop
+  ret i32 -1
+
+range_check_failed:                               ; preds = %guarded
+  ret i32 -2
+}
+
+define i32 @test_03(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %len_p) {
+; CHECK-LABEL: @test_03(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LEN:%.*]] = load i32, ptr [[LEN_P:%.*]], align 4, !noundef !0
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]]
+; CHECK-NEXT:    [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4
+; CHECK-NEXT:    [[BOUND_CHECK:%.*]] = icmp slt i32 [[EL]], 0
+; CHECK-NEXT:    br i1 [[BOUND_CHECK]], label [[COMMON_RET:%.*]], label [[GUARDED:%.*]], !prof [[PROF4:![0-9]+]]
+; CHECK:       guarded:
+; CHECK-NEXT:    [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[LEN]]
+; CHECK-NEXT:    br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF1]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]]
+; CHECK-NEXT:    store i32 [[IV]], ptr [[ARR_PTR]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]]
+; CHECK:       common.ret:
+; CHECK-NEXT:    [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ -1, [[LOOP]] ], [ -2, [[GUARDED]] ]
+; CHECK-NEXT:    ret i32 [[COMMON_RET_OP]]
+;
+entry:
+  %len = load i32, ptr %len_p, align 4, !noundef !{}
+  br label %loop
+
+loop:                                             ; preds = %backedge, %entry
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %el.ptr = getelementptr i32, ptr %p, i32 %iv
+  %el = load i32, ptr %el.ptr, align 4
+  %bound_check = icmp slt i32 %el, 0
+  br i1 %bound_check, label %bound_check_failed, label %guarded, !prof !{!"branch_weights", i32 1, i32 100}
+
+guarded:                                          ; preds = %loop
+  %range_check = icmp ult i32 %el, %len
+  br i1 %range_check, label %backedge, label %range_check_failed, !prof !{!"branch_weights", i32 100, i32 1}
+
+backedge:                                         ; preds = %guarded
+  %arr.ptr = getelementptr i32, ptr %arr, i32 %el
+  store i32 %iv, ptr %arr.ptr, align 4
+  %iv.next = add i32 %iv, 1
+  %loop_cond = icmp slt i32 %iv.next, %n
+  br i1 %loop_cond, label %loop, label %exit
+
+exit:                                             ; preds = %backedge
+  ret i32 0
+
+bound_check_failed:                               ; preds = %loop
+  ret i32 -1
+
+range_check_failed:                               ; preds = %guarded
+  ret i32 -2
+}
+
+define i32 @test_04(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %len_p) {
+; CHECK-LABEL: @test_04(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LEN:%.*]] = load i32, ptr [[LEN_P:%.*]], align 4, !noundef !0
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ]
+; CHECK-NEXT:    [[EL_PTR:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[IV]]
+; CHECK-NEXT:    [[EL:%.*]] = load i8, ptr [[EL_PTR]], align 4
+; CHECK-NEXT:    [[BOUND_CHECK:%.*]] = icmp slt i8 [[EL]], 0
+; CHECK-NEXT:    br i1 [[BOUND_CHECK]], label [[COMMON_RET:%.*]], label [[GUARDED:%.*]], !prof [[PROF4]]
+; CHECK:       guarded:
+; CHECK-NEXT:    [[EL_WIDE:%.*]] = zext i8 [[EL]] to i32
+; CHECK-NEXT:    [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL_WIDE]], [[LEN]]
+; CHECK-NEXT:    br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF1]]
+; CHECK:       backedge:
+; CHECK-NEXT:    [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_WIDE]]
+; CHECK-NEXT:    store i32 [[IV]], ptr [[ARR_PTR]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]]
+; CHECK:       common.ret:
+; CHECK-NEXT:    [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ -1, [[LOOP]] ], [ -2, [[GUARDED]] ]
+; CHECK-NEXT:    ret i32 [[COMMON_RET_OP]]
+;
+entry:
+  %len = load i32, ptr %len_p, align 4, !noundef !{}
+  br label %loop
+
+loop:                                             ; preds = %backedge, %entry
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+  %el.ptr = getelementptr i8, ptr %p, i32 %iv
+  %el = load i8, ptr %el.ptr, align 4
+  %bound_check = icmp slt i8 %el, 0
+  br i1 %bound_check, label %bound_check_failed, label %guarded, !prof !{!"branch_weights", i32 1, i32 100}
+
+guarded:                                          ; preds = %loop
+  %el.wide = zext i8 %el to i32
+  %range_check = icmp ult i32 %el.wide, %len
+  br i1 %range_check, label %backedge, label %range_check_failed, !prof !{!"branch_weights", i32 100, i32 1}
+
+backedge:                                         ; preds = %guarded
+  %arr.ptr = getelementptr i32, ptr %arr, i32 %el.wide
+  store i32 %iv, ptr %arr.ptr, align 4
+  %iv.next = add i32 %iv, 1
+  %loop_cond = icmp slt i32 %iv.next, %n
+  br i1 %loop_cond, label %loop, label %exit
+
+exit:                                             ; preds = %backedge
+  ret i32 0
+
+bound_check_failed:                               ; preds = %loop
+  ret i32 -1
+
+range_check_failed:                               ; preds = %guarded
+  ret i32 -2
+}


        


More information about the llvm-commits mailing list