[llvm-branch-commits] [llvm] [LoopUnroll] Fix freqs for unconditional latches: N>2, fast (PR #182404)

Thu Feb 26 15:54:58 PST 2026

https://github.com/jdenny-ornl updated https://github.com/llvm/llvm-project/pull/182404

>From 06c90370c6d76411be29c2a9dac35bd2f89a5f0f Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Thu, 19 Feb 2026 17:38:13 -0500
Subject: [PATCH] [LoopUnroll] Fix freqs for unconditional latches: N>2, fast

This patch extends PR #179520 to the N > 2 case, where N is the number
of remaining conditional latches.  Its strategy is to apply the
original loop's probability to all N latches and then, as needed,
adjust as few of them as possible.
---
 llvm/lib/Transforms/Utils/LoopUnroll.cpp      | 129 ++++-
 .../branch-weights-freq/unroll-complete.ll    | 480 ++++++++++++++++++
 .../branch-weights-freq/unroll-epilog.ll      |  68 +++
 .../unroll-partial-unconditional-latch.ll     |  66 +++
 4 files changed, 739 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 529cbd3f5b5da..404e254c8a66f 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -492,9 +492,10 @@ static bool canHaveUnrollRemainder(const Loop *L) {
 // original loop iterations.
 //
 // There are often many sets of latch probabilities that can produce the
-// original total loop body frequency.  For now, this function computes uniform
-// probabilities when the number of remaining conditional latches is <= 2 and
-// does not handle other cases.
+// original total loop body frequency.  If more than 2 conditional latches
+// remain, this function quickly adjusts as few of their probabilities as it
+// can to restore the original total loop body frequency.  Otherwise, it
+// computes a single uniform probability for all remaining conditional latches.
 static void fixProbContradiction(UnrollLoopOptions ULO,
                                  BranchProbability OriginalLoopProb,
                                  bool CompletelyUnroll,
@@ -557,6 +558,13 @@ static void fixProbContradiction(UnrollLoopOptions ULO,
   // FreqDesired is the frequency implied by the original loop probability.
   double FreqDesired = 1 / (1 - OriginalLoopProb.toDouble());
 
+  // Read back, as a double, the probability currently at CondLatches[I].
+  auto GetProb = [&](unsigned I) {
+    auto *Term = cast<BranchInst>(CondLatches[I]->getTerminator());
+    const bool NextIsFirstSucc = Term->getSuccessor(0) == CondLatchNexts[I];
+    return getBranchProbability(Term, NextIsFirstSucc).toDouble();
+  };
+
   // Set the probability at CondLatches[I] to Prob.
   auto SetProb = [&](unsigned I, double Prob) {
     BranchInst *B = cast<BranchInst>(CondLatches[I]->getTerminator());
@@ -597,6 +605,12 @@ static void fixProbContradiction(UnrollLoopOptions ULO,
   // - For n <= 2, we can use simple formulas to solve the above polynomial
   //   equations exactly for p without performing a search.
 
+  // When iterating for a solution, we stop early if we find probabilities
+  // that produce a Freq whose difference from FreqDesired is small
+  // (FreqPrec).  Otherwise, we expect to compute a solution at least that
+  // accurate (but surely far more accurate).
+  const double FreqPrec = 1e-6;
+
   // Compute the probability that, used at CondLaches[0] where
   // CondLatches.size() == 1, gets as close as possible to FreqDesired.
   auto ComputeProbForLinear = [&]() {
@@ -624,13 +638,120 @@ static void fixProbContradiction(UnrollLoopOptions ULO,
     return Prob;
   };
 
+  // Compute the probability required at CondLatches[ComputeIdx] to get as close
+  // as possible to FreqDesired without replacing probabilities elsewhere in
+  // CondLatches.  Return {Prob, Freq} where 0 <= Prob <= 1 and Freq is the new
+  // frequency.
+  auto ComputeProb = [&](unsigned ComputeIdx) -> std::pair<double, double> {
+    assert(ComputeIdx < CondLatches.size());
+
+    // Accumulate the frequency from before ComputeIdx into FreqBeforeCompute,
+    // and accumulate the rest in Freq without yet multiplying the latter by any
+    // probability for ComputeIdx (i.e., treat it as 1 for now).
+    double ProbReaching = 1;      // p^0
+    double Freq = IterCounts[0];  // c_0*p^0
+    double FreqBeforeCompute = 0; // Assigned below once I == ComputeIdx.
+    for (unsigned I = 0, E = CondLatches.size(); I < E; ++I) {
+      // Get the branch probability for CondLatches[I].
+      double Prob;
+      if (I == ComputeIdx) {
+        FreqBeforeCompute = Freq;
+        Freq = 0;
+        Prob = 1;
+      } else {
+        Prob = GetProb(I);
+      }
+      ProbReaching *= Prob;                     // p^(I+1)
+      Freq += IterCounts[I + 1] * ProbReaching; // c_(I+1)*p^(I+1)
+    }
+
+    // Compute the required probability, and limit it to a valid probability (0
+    // <= p <= 1).  See the Freq formula below for how to derive the ProbCompute
+    // formula.
+    double ProbReachingBackedge = CompletelyUnroll ? 0 : ProbReaching;
+    double ProbComputeNumerator = FreqDesired - FreqBeforeCompute;
+    double ProbComputeDenominator = Freq + FreqDesired * ProbReachingBackedge;
+    double ProbCompute;
+    if (ProbComputeNumerator <= 0) {
+      // FreqBeforeCompute has already reached or surpassed FreqDesired, so add
+      // no more frequency.  It is possible that ProbComputeDenominator == 0
+      // here because some latch probability (maybe the original) was set to
+      // zero, so this check avoids setting ProbCompute=1 (in the else if below)
+      // and division by zero where the numerator <= 0 (in the else below).
+      ProbCompute = 0;
+    } else if (ProbComputeDenominator == 0) {
+      // Analytically, this case seems impossible.  It would occur if either:
+      // - Both Freq and FreqDesired are zero.  But the latter would cause
+      //   ProbComputeNumerator < 0, which we catch above, and FreqDesired
+      //   should always be >= 1 anyway.
+      // - There are no iterations after CondLatches[ComputeIdx], not even via
+      //   a backedge, so that both Freq and ProbReachingBackedge are zero.
+      //   But iterations should exist after even the last conditional latch.
+      // - Some latch probability (maybe the original) was set to zero so that
+      //   both Freq and ProbReachingBackedge are zero.  But that should not
+      //   have happened because, according to the above ProbComputeNumerator
+      //   check, we have not yet reached FreqDesired (which, if the original
+      //   latch probability is zero, is just 1 and thus always reached or
+      //   surpassed).
+      //
+      // Numerically, perhaps this case is possible.  We interpret it to mean we
+      // need more frequency (ProbComputeNumerator > 0) but have no way to get
+      // any (ProbComputeDenominator is analytically too small to distinguish it
+      // from 0 in floating point), suggesting infinite probability is needed,
+      // but 1 is the maximum valid probability and thus the best we can do.
+      //
+      // TODO: Cover this case in the test suite if you can.
+      ProbCompute = 1;
+    } else {
+      ProbCompute = ProbComputeNumerator / ProbComputeDenominator;
+      ProbCompute = std::max(ProbCompute, 0.);
+      ProbCompute = std::min(ProbCompute, 1.);
+    }
+
+    // Compute the resulting total frequency.
+    if (ProbReachingBackedge * ProbCompute == 1) {
+      // Analytically, this case seems impossible.  It requires that there is a
+      // backedge and that FreqDesired == infinity so that every conditional
+      // latch's probability had to be set to 1.  But FreqDesired == infinity
+      // means OriginalLoopProb.isOne(), which we guarded against earlier.
+      //
+      // Numerically, perhaps this case is possible.  We interpret it to mean
+      // that analytically the probability has to be so near 1 that, in floating
+      // point, the frequency is computed as infinite.
+      //
+      // TODO: Cover this case in the test suite if you can.
+      Freq = std::numeric_limits<double>::infinity();
+    } else {
+      assert(FreqBeforeCompute > 0 &&
+             "Expected at least one iteration before first latch");
+      // In this equation, if we replace the left-hand side with FreqDesired and
+      // then solve for ProbCompute, we get the ProbCompute formula above.
+      Freq = (FreqBeforeCompute + Freq * ProbCompute) /
+             (1 - ProbReachingBackedge * ProbCompute);
+    }
+    return {ProbCompute, Freq};
+  };
+
   // Determine and set branch weights.
   if (CondLatches.size() == 1) {
     SetAllProbs(ComputeProbForLinear());
   } else if (CondLatches.size() == 2) {
     SetAllProbs(ComputeProbForQuadratic());
   } else {
-    // FIXME: Handle CondLatches.size() > 2.
+    // The polynomial is too complex for a simple formula, so the quick and
+    // dirty fix has been selected.  Adjust probabilities starting from the
+    // first latch, which has the most influence on the total frequency, so
+    // starting there should minimize the number of latches that have to be
+    // visited.  We do have to iterate because the first latch alone might not
+    // be enough.  For example, we might need to set all probabilities to 1 if
+    // the frequency is the unroll factor.
+    for (unsigned I = 0; I != CondLatches.size(); ++I) {
+      double Prob, Freq;
+      std::tie(Prob, Freq) = ComputeProb(I);
+      SetProb(I, Prob);
+      if (fabs(Freq - FreqDesired) < FreqPrec)
+        break;
+    }
   }
 
   // FIXME: We have not considered non-latch loop exits:
diff --git a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-complete.ll b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-complete.ll
index 3d87ee185b554..353e74be9fbd1 100644
--- a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-complete.ll
+++ b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-complete.ll
@@ -491,6 +491,486 @@
 ;     UR313x:     br label %do.end
 ;     UR313x:     !0 = !{!"branch_weights", i32 -2147483648, i32 0}
 
+; ------------------------------------------------------------------------------
+; Check 4 max iterations:
+; - Unroll count of >=4 should always produce complete unrolling.
+; - That produces <=3 unrolled iteration latches.  3 is the lowest number where
+;   the implementation cannot compute uniform weights using a simple formula.
+;
+; Original loop body frequency is 5 (loop weight 4), which is impossibly high.
+;
+;   First use a variable iteration count so that all non-final unrolled
+;   iterations' latches remain conditional.
+;
+;     RUN: sed -e s/@MAX@/4/ -e s/@W@/4/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG4510
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4510
+;     RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4510
+;
+;     The sum of the new do.body* cannot reach the old do.body, which is
+;     impossibly high.
+;     ORIG4510: - do.body: float = 5.0,
+;     UR4510:   - do.body: float = 1.0,
+;     UR4510:   - do.body.1: float = 1.0,
+;     UR4510:   - do.body.2: float = 1.0,
+;     UR4510:   - do.body.3: float = 1.0,
+;
+;     The probabilities are maximized to try to reach the original frequency.
+;     UR4510: call void @f
+;     UR4510: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;     UR4510: call void @f
+;     UR4510: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !0
+;     UR4510: call void @f
+;     UR4510: br i1 %{{.*}}, label %do.end, label %do.body.3, !prof !0
+;     UR4510: call void @f
+;     UR4510: br label %do.end
+;     UR4510: !0 = !{!"branch_weights", i32 0, i32 -2147483648}
+;
+;   Now use a constant iteration count so that all non-final unrolled
+;   iterations' latches unconditionally continue.
+;
+;     RUN: sed -e s/@MAX@/4/ -e s/@W@/4/ -e s/@MIN@/4/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG4540
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4540
+;     RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4540
+;
+;     The new do.body contains 4 of the original loop's iterations, so multiply
+;     it by 4, which is less than the old do.body, which is impossibly high.
+;     ORIG4540: - do.body: float = 5.0,
+;     UR4540:   - do.body: float = 1.0,
+;
+;     UR4540:     call void @f
+;     UR4540-NOT: br
+;     UR4540:     call void @f
+;     UR4540-NOT: br
+;     UR4540:     call void @f
+;     UR4540-NOT: br
+;     UR4540:     call void @f
+;     UR4540:     ret void
+;
+;   Use a constant iteration count but now the loop upper bound computation can
+;   overflow.  When it does, the loop induction variable is greater than it
+;   immediately, so the initial unrolled iteration's latch remains conditional.
+;
+;     RUN: sed -e s/@MAX@/4/ -e s/@W@/4/ -e s/@MIN@/4/ -e s/@I_0@/%x/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG454x
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR454x
+;     RUN: %{ur-bf} -unroll-count=5 | %{fc} UR454x
+;
+;     The new do.body.1 contains 3 of the original loop's iterations, so
+;     multiply it by 3, and add the new do.body, but that sum is less than the
+;     old do.body, which is impossibly high.
+;     ORIG454x: - do.body: float = 5.0,
+;     UR454x:   - do.body: float = 1.0,
+;     UR454x:   - do.body.1: float = 1.0,
+;
+;     The sole probability is maximized to try to reach the original frequency.
+;     UR454x:     call void @f
+;     UR454x:     br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;     UR454x:     call void @f
+;     UR454x-NOT: br
+;     UR454x:     call void @f
+;     UR454x-NOT: br
+;     UR454x:     call void @f
+;     UR454x:     br label %do.end
+;     UR454x:     !0 = !{!"branch_weights", i32 0, i32 -2147483648}
+;
+; Original loop body frequency is 4 (loop weight 3).
+;
+;   First use a variable iteration count so that all non-final unrolled
+;   iterations' latches remain conditional.
+;
+;     RUN: sed -e s/@MAX@/4/ -e s/@W@/3/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG4410
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4410
+;     RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4410
+;
+;     The sum of the new do.body* is the old do.body.
+;     ORIG4410: - do.body: float = 4.0,
+;     UR4410:   - do.body: float = 1.0,
+;     UR4410:   - do.body.1: float = 1.0,
+;     UR4410:   - do.body.2: float = 1.0,
+;     UR4410:   - do.body.3: float = 1.0,
+;
+;     UR4410: call void @f
+;     UR4410: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;     UR4410: call void @f
+;     UR4410: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !0
+;     UR4410: call void @f
+;     UR4410: br i1 %{{.*}}, label %do.end, label %do.body.3, !prof !0
+;     UR4410: call void @f
+;     UR4410: br label %do.end
+;     UR4410: !0 = !{!"branch_weights", i32 0, i32 -2147483648}
+;
+;   Now use a constant iteration count so that all non-final unrolled
+;   iterations' latches unconditionally continue.
+;
+;     RUN: sed -e s/@MAX@/4/ -e s/@W@/3/ -e s/@MIN@/4/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG4440
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4440
+;     RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4440
+;
+;     The new do.body contains 4 of the original loop's iterations, so multiply
+;     it by 4 to get the old do.body.
+;     ORIG4440: - do.body: float = 4.0,
+;     UR4440:   - do.body: float = 1.0,
+;
+;     UR4440:     call void @f
+;     UR4440-NOT: br
+;     UR4440:     call void @f
+;     UR4440-NOT: br
+;     UR4440:     call void @f
+;     UR4440-NOT: br
+;     UR4440:     call void @f
+;     UR4440:     ret void
+;
+;   Use a constant iteration count but now the loop upper bound computation can
+;   overflow.  When it does, the loop induction variable is greater than it
+;   immediately, so the initial unrolled iteration's latch remains conditional.
+;
+;     RUN: sed -e s/@MAX@/4/ -e s/@W@/3/ -e s/@MIN@/4/ -e s/@I_0@/%x/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG444x
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR444x
+;     RUN: %{ur-bf} -unroll-count=5 | %{fc} UR444x
+;
+;     The new do.body.1 contains 3 of the original loop's iterations, so
+;     multiply it by 3, and add the new do.body to get the old do.body.
+;     ORIG444x: - do.body: float = 4.0,
+;     UR444x:   - do.body: float = 1.0,
+;     UR444x:   - do.body.1: float = 1.0,
+;
+;     UR444x:     call void @f
+;     UR444x:     br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;     UR444x:     call void @f
+;     UR444x-NOT: br
+;     UR444x:     call void @f
+;     UR444x-NOT: br
+;     UR444x:     call void @f
+;     UR444x:     br label %do.end
+;     UR444x:     !0 = !{!"branch_weights", i32 0, i32 -2147483648}
+;
+; Original loop body frequency is 3 (loop weight 2).  This is our first case
+; where the new probabilities vary.
+;
+;   First use a variable iteration count so that all non-final unrolled
+;   iterations' latches remain conditional.
+;
+;     RUN: sed -e s/@MAX@/4/ -e s/@W@/2/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG4310
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4310
+;     RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4310
+;
+;     The sum of the new do.body* is always approximately the old do.body.
+;     ORIG4310: - do.body: float = 3.0,
+;     UR4310: - do.body: float = 1.0,
+;     UR4310: - do.body.1: float = 0.94737,
+;     UR4310: - do.body.2: float = 0.63158,
+;     UR4310: - do.body.3: float = 0.42105,
+;
+;     UR4310:  call void @f
+;     UR4310:  br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;     UR4310:  call void @f
+;     UR4310:  br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !1
+;     UR4310:  call void @f
+;     UR4310:  br i1 %{{.*}}, label %do.end, label %do.body.3, !prof !1
+;     UR4310:  call void @f
+;     UR4310:  br label %do.end
+;     UR4310:  !0 = !{!"branch_weights", i32 113025456, i32 2034458192}
+;     UR4310:  !1 = !{!"branch_weights", i32 1, i32 2}
+;
+;   Now use a constant iteration count so that all non-final unrolled
+;   iterations' latches unconditionally continue.
+;
+;     RUN: sed -e s/@MAX@/4/ -e s/@W@/2/ -e s/@MIN@/4/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG4340
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4340
+;     RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4340
+;
+;     The new do.body contains 4 of the original loop's iterations, so multiply
+;     it by 4, which is greater than the old do.body, which is impossibly low.
+;     ORIG4340: - do.body: float = 3.0,
+;     UR4340:   - do.body: float = 1.0,
+;
+;     UR4340:     call void @f
+;     UR4340-NOT: br
+;     UR4340:     call void @f
+;     UR4340-NOT: br
+;     UR4340:     call void @f
+;     UR4340-NOT: br
+;     UR4340:     call void @f
+;     UR4340:     ret void
+;
+;   Use a constant iteration count but now the loop upper bound computation can
+;   overflow.  When it does, the loop induction variable is greater than it
+;   immediately, so the initial unrolled iteration's latch remains conditional.
+;
+;     RUN: sed -e s/@MAX@/4/ -e s/@W@/2/ -e s/@MIN@/4/ -e s/@I_0@/%x/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG434x
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR434x
+;     RUN: %{ur-bf} -unroll-count=5 | %{fc} UR434x
+;
+;     The new do.body.1 contains 3 of the original loop's iterations, so
+;     multiply it by 3, and add the new do.body to get the old do.body.
+;     ORIG434x: - do.body: float = 3.0,
+;     UR434x:   - do.body: float = 1.0,
+;     UR434x:   - do.body.1: float = 0.66667,
+;
+;     UR434x:     call void @f
+;     UR434x:     br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;     UR434x:     call void @f
+;     UR434x-NOT: br
+;     UR434x:     call void @f
+;     UR434x-NOT: br
+;     UR434x:     call void @f
+;     UR434x:     br label %do.end
+;     UR434x:     !0 = !{!"branch_weights", i32 715827884, i32 1431655764}
+;
+; Original loop body frequency is 2 (loop weight 1).
+;
+;   First use a variable iteration count so that all non-final unrolled
+;   iterations' latches remain conditional.
+;
+;     RUN: sed -e s/@MAX@/4/ -e s/@W@/1/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG4210
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4210
+;     RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4210
+;
+;     The sum of the new do.body* is always the old do.body.
+;     ORIG4210: - do.body: float = 2.0,
+;     UR4210: - do.body: float = 1.0,
+;     UR4210: - do.body.1: float = 0.57143,
+;     UR4210: - do.body.2: float = 0.28571,
+;     UR4210: - do.body.3: float = 0.14286,
+;
+;     UR4210: call void @f
+;     UR4210: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;     UR4210: call void @f
+;     UR4210: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !1
+;     UR4210: call void @f
+;     UR4210: br i1 %{{.*}}, label %do.end, label %do.body.3, !prof !1
+;     UR4210: call void @f
+;     UR4210: br label %do.end
+;     UR4210: !0 = !{!"branch_weights", i32 920350135, i32 1227133513}
+;     UR4210: !1 = !{!"branch_weights", i32 1, i32 1}
+;
+;   Now use a constant iteration count so that all non-final unrolled
+;   iterations' latches unconditionally continue.
+;
+;     RUN: sed -e s/@MAX@/4/ -e s/@W@/1/ -e s/@MIN@/4/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG4240
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4240
+;     RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4240
+;
+;     The new do.body contains 4 of the original loop's iterations, so multiply
+;     it by 4, which is greater than the old do.body, which is impossibly low.
+;     ORIG4240: - do.body: float = 2.0,
+;     UR4240:   - do.body: float = 1.0,
+;
+;     UR4240:     call void @f
+;     UR4240-NOT: br
+;     UR4240:     call void @f
+;     UR4240-NOT: br
+;     UR4240:     call void @f
+;     UR4240-NOT: br
+;     UR4240:     call void @f
+;     UR4240:     ret void
+;
+;   Use a constant iteration count but now the loop upper bound computation can
+;   overflow.  When it does, the loop induction variable is greater than it
+;   immediately, so the initial unrolled iteration's latch remains conditional.
+;
+;     RUN: sed -e s/@MAX@/4/ -e s/@W@/1/ -e s/@MIN@/4/ -e s/@I_0@/%x/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG424x
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR424x
+;     RUN: %{ur-bf} -unroll-count=5 | %{fc} UR424x
+;
+;     The new do.body.1 contains 3 of the original loop's iterations, so
+;     multiply it by 3, and add the new do.body to get the old do.body.
+;     ORIG424x: - do.body: float = 2.0,
+;     UR424x:   - do.body: float = 1.0,
+;     UR424x:   - do.body.1: float = 0.33333,
+;
+;     UR424x:     call void @f
+;     UR424x:     br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;     UR424x:     call void @f
+;     UR424x-NOT: br
+;     UR424x:     call void @f
+;     UR424x-NOT: br
+;     UR424x:     call void @f
+;     UR424x:     br label %do.end
+;     UR424x:     !0 = !{!"branch_weights", i32 1431655765, i32 715827883}
+;
+; Original loop body frequency is 1 (loop weight 0).
+;
+;   First use a variable iteration count so that all non-final unrolled
+;   iterations' latches remain conditional.
+;
+;     RUN: sed -e s/@MAX@/4/ -e s/@W@/0/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG4110
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4110
+;     RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4110
+;
+;     The sum of the new do.body* is approximately the old do.body.
+;     ORIG4110: - do.body: float = 1.0,
+;     UR4110:   - do.body: float = 1.0,
+;     UR4110:   - do.body.1: float = 0.0{{(0000[0-9]*)?}},
+;     UR4110:   - do.body.2: float = 0.0{{(0000[0-9]*)?}},
+;     UR4110:   - do.body.3: float = 0.0{{(0000[0-9]*)?}},
+;
+;     UR4110: call void @f
+;     UR4110: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;     UR4110: call void @f
+;     UR4110: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !0
+;     UR4110: call void @f
+;     UR4110: br i1 %{{.*}}, label %do.end, label %do.body.3, !prof !0
+;     UR4110: call void @f
+;     UR4110: br label %do.end
+;     UR4110: !0 = !{!"branch_weights", i32 1, i32 0}
+;
+;   Now use a constant iteration count so that all non-final unrolled
+;   iterations' latches unconditionally continue.
+;
+;     RUN: sed -e s/@MAX@/4/ -e s/@W@/0/ -e s/@MIN@/4/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG4140
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4140
+;     RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4140
+;
+;     The new do.body contains 4 of the original loop's iterations, so multiply
+;     it by 4, which is greater than the old do.body, which is impossibly low.
+;     ORIG4140: - do.body: float = 1.0,
+;     UR4140:   - do.body: float = 1.0,
+;
+;     UR4140:     call void @f
+;     UR4140-NOT: br
+;     UR4140:     call void @f
+;     UR4140-NOT: br
+;     UR4140:     call void @f
+;     UR4140-NOT: br
+;     UR4140:     call void @f
+;     UR4140:     ret void
+;
+;   Use a constant iteration count but now the loop upper bound computation can
+;   overflow.  When it does, the loop induction variable is greater than it
+;   immediately, so the initial unrolled iteration's latch remains conditional.
+;
+;     RUN: sed -e s/@MAX@/4/ -e s/@W@/0/ -e s/@MIN@/4/ -e s/@I_0@/%x/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG414x
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR414x
+;     RUN: %{ur-bf} -unroll-count=5 | %{fc} UR414x
+;
+;     The new do.body.1 contains 3 of the original loop's iterations, so
+;     multiply it by 3, and add the new do.body to get approximately the old
+;     do.body.
+;     ORIG414x: - do.body: float = 1.0,
+;     UR414x:   - do.body: float = 1.0,
+;     UR414x:   - do.body.1: float = 0.0{{(0000[0-9]*)?}},
+;
+;     UR414x:     call void @f
+;     UR414x:     br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;     UR414x:     call void @f
+;     UR414x-NOT: br
+;     UR414x:     call void @f
+;     UR414x-NOT: br
+;     UR414x:     call void @f
+;     UR414x:     br label %do.end
+;     UR414x:     !0 = !{!"branch_weights", i32 -2147483648, i32 0}
+
+; ------------------------------------------------------------------------------
+; Check 5 max iterations:
+; - Unroll count of >=5 should always produce complete unrolling.
+; - That produces <=4 unrolled iteration latches.  When at least 3 remain
+;   conditional, the implementation cannot compute uniform weights using a
+;   simple formula.
+;
+; Original loop body frequency is 5 (loop weight 4).
+;
+;   RUN: sed -e s/@MAX@/5/ -e s/@W@/4/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+;   RUN: %{bf-fc} ORIG5510
+;   RUN: %{ur-bf} -unroll-count=5 | %{fc} UR5510
+;   RUN: %{ur-bf} -unroll-count=6 | %{fc} UR5510
+;
+;   The sum of the new do.body* is the old do.body.
+;   ORIG5510: - do.body: float = 5.0,
+;   UR5510:   - do.body: float = 1.0,
+;   UR5510:   - do.body.1: float = 1.0,
+;   UR5510:   - do.body.2: float = 1.0,
+;   UR5510:   - do.body.3: float = 1.0,
+;   UR5510:   - do.body.4: float = 1.0,
+;
+;   All continue probabilities are approximately 1, but somehow there is less
+;   precision in the calculation of the last case.
+;   UR5510: call void @f
+;   UR5510: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;   UR5510: call void @f
+;   UR5510: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !0
+;   UR5510: call void @f
+;   UR5510: br i1 %{{.*}}, label %do.end, label %do.body.3, !prof !0
+;   UR5510: call void @f
+;   UR5510: br i1 %{{.*}}, label %do.end, label %do.body.4, !prof !1
+;   UR5510: call void @f
+;   UR5510: br label %do.end
+;   UR5510: !0 = !{!"branch_weights", i32 0, i32 -2147483648}
+;   UR5510: !1 = !{!"branch_weights", i32 10, i32 2147483638}
+;
+; Original loop body frequency is 4 (loop weight 3).
+;
+;   RUN: sed -e s/@MAX@/5/ -e s/@W@/3/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+;   RUN: %{bf-fc} ORIG5410
+;   RUN: %{ur-bf} -unroll-count=5 | %{fc} UR5410
+;   RUN: %{ur-bf} -unroll-count=6 | %{fc} UR5410
+;
+;   The sum of the new do.body* is always the old do.body.
+;   ORIG5410: - do.body: float = 4.0,
+;   UR5410: - do.body: float = 1.0,
+;   UR5410: - do.body.1: float = 1.0,
+;   UR5410: - do.body.2: float = 0.86486,
+;   UR5410: - do.body.3: float = 0.64865,
+;   UR5410: - do.body.4: float = 0.48649,
+;
+;   This is our first case where the implementation must adjust multiple
+;   probabilities to something other than the original latch probability but
+;   does not just set all probabilities to the limit of 1 or 0.
+;   UR5410: call void @f
+;   UR5410: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;   UR5410: call void @f
+;   UR5410: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !1
+;   UR5410: call void @f
+;   UR5410: br i1 %{{.*}}, label %do.end, label %do.body.3, !prof !2
+;   UR5410: call void @f
+;   UR5410: br i1 %{{.*}}, label %do.end, label %do.body.4, !prof !2
+;   UR5410: call void @f
+;   UR5410: br label %do.end
+;   UR5410: !0 = !{!"branch_weights", i32 0, i32 -2147483648}
+;   UR5410: !1 = !{!"branch_weights", i32 290200493, i32 1857283155}
+;   UR5410: !2 = !{!"branch_weights", i32 1, i32 3}
+;
+; Original loop body frequency is 1 (loop weight 0).
+;
+;   RUN: sed -e s/@MAX@/5/ -e s/@W@/0/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+;   RUN: %{bf-fc} ORIG5110
+;   RUN: %{ur-bf} -unroll-count=5 | %{fc} UR5110
+;   RUN: %{ur-bf} -unroll-count=6 | %{fc} UR5110
+;
+;   The sum of the new do.body* is approximately the old do.body.
+;   ORIG5110: - do.body: float = 1.0,
+;   UR5110:   - do.body: float = 1.0,
+;   UR5110:   - do.body.1: float = 0.0{{(0000[0-9]*)?}},
+;   UR5110:   - do.body.2: float = 0.0{{(0000[0-9]*)?}},
+;   UR5110:   - do.body.3: float = 0.0{{(0000[0-9]*)?}},
+;   UR5110:   - do.body.4: float = 0.0{{(0000[0-9]*)?}},
+;
+;   UR5110: call void @f
+;   UR5110: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;   UR5110: call void @f
+;   UR5110: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !0
+;   UR5110: call void @f
+;   UR5110: br i1 %{{.*}}, label %do.end, label %do.body.3, !prof !0
+;   UR5110: call void @f
+;   UR5110: br i1 %{{.*}}, label %do.end, label %do.body.4, !prof !0
+;   UR5110: call void @f
+;   UR5110: br label %do.end
+;   UR5110: !0 = !{!"branch_weights", i32 1, i32 0}
+
 declare void @f(i32)
 
 define void @test(i32 %x, i32 %n) {
diff --git a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll
index 09ecaebcf1f45..62675162544f8 100644
--- a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll
+++ b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll
@@ -138,6 +138,74 @@
 ; - It has no llvm.loop.estimated_trip_count.
 ; UR4-EUR: !6 = !{!"branch_weights", i32 1265493781, i32 881989867}
 
+; ------------------------------------------------------------------------------
+; Check -unroll-count=8.
+;
+; RUN: %{ur-bf} -unroll-count=8 | %{fc} UR8,UR8-ELP
+; RUN: %{ur-bf} -unroll-count=8 -unroll-remainder | \
+; RUN:   %{fc} UR8,UR8-EUR
+;
+; Multiply do.body by 8 and add do.body.epil* for either ELP or EUR to get the
+; original loop body frequency, 11.
+; UR8:     - do.body: float = 0.96188,
+; UR8-ELP: - do.body.epil: float = 3.3049,
+; UR8-EUR: - do.body.epil: float = 0.91256,
+; UR8-EUR: - do.body.epil.1: float = 0.7716,
+; UR8-EUR: - do.body.epil.2: float = 0.55854,
+; UR8-EUR: - do.body.epil.3: float = 0.40432,
+; UR8-EUR: - do.body.epil.4: float = 0.29268,
+; UR8-EUR: - do.body.epil.5: float = 0.21186,
+; UR8-EUR: - do.body.epil.6: float = 0.15336,
+;
+; Unrolled loop guard, body, and latch.
+; UR8: br i1 %{{.*}}, label %do.body.epil.preheader, label %entry.new, !prof !0
+; UR8-COUNT-8: call void @f
+; UR8: br i1 %{{.*}}, label %do.end.unr-lcssa, label %do.body, !prof !1, !llvm.loop !2
+;
+; Epilogue guard.
+; UR8: br i1 %{{.*}}, label %do.body.epil.preheader, label %do.end, !prof !5
+;
+; Non-unrolled epilogue loop.
+; UR8-ELP: call void @f
+; UR8-ELP: br i1 %{{.*}}, label %do.body.epil, label %do.end.epilog-lcssa, !prof !6, !llvm.loop !7
+;
+; Completely unrolled epilogue loop.
+; UR8-EUR: call void @f
+; UR8-EUR: br i1 %{{.*}}, label %do.body.epil.1, label %do.end.epilog-lcssa, !prof !6
+; UR8-EUR: call void @f
+; UR8-EUR: br i1 %{{.*}}, label %do.body.epil.2, label %do.end.epilog-lcssa, !prof !7
+; UR8-EUR: call void @f
+; UR8-EUR: br i1 %{{.*}}, label %do.body.epil.3, label %do.end.epilog-lcssa, !prof !7
+; UR8-EUR: call void @f
+; UR8-EUR: br i1 %{{.*}}, label %do.body.epil.4, label %do.end.epilog-lcssa, !prof !7
+; UR8-EUR: call void @f
+; UR8-EUR: br i1 %{{.*}}, label %do.body.epil.5, label %do.end.epilog-lcssa, !prof !7
+; UR8-EUR: call void @f
+; UR8-EUR: br i1 %{{.*}}, label %do.body.epil.6, label %do.end.epilog-lcssa, !prof !7
+; UR8-EUR: call void @f
+;
+; Unrolled loop metadata.
+; UR8: !0 = !{!"branch_weights", i32 1045484980, i32 1101998668}
+; UR8: !1 = !{!"branch_weights", i32 1145666677, i32 1001816971}
+; UR8: !2 = distinct !{!2, !3, !4}
+; UR8: !3 = !{!"llvm.loop.estimated_trip_count", i32 1}
+; UR8: !4 = !{!"llvm.loop.unroll.disable"}
+; UR8: !5 = !{!"branch_weights", i32 1781544591, i32 365939057}
+;
+; Non-unrolled epilogue loop metadata.
+; UR8-ELP: !6 = !{!"branch_weights", i32 1554520665, i32 592962983}
+; UR8-ELP: !7 = distinct !{!7, !8, !4}
+; UR8-ELP: !8 = !{!"llvm.loop.estimated_trip_count", i32 3}
+;
+; Completely unrolled epilogue loop metadata.  Because it loses its backedge:
+; - The remaining conditional latches' branch weights must be adjusted relative
+;   to the non-unrolled case.  There are many, so the implementation does not
+;   compute uniform branch weights.  Adjusting the first latch is sufficient,
+;   so the rest (!7) keep the non-unrolled epilogue branch weights.
+; - It has no llvm.loop.estimated_trip_count.
+; UR8-EUR: !6 = !{!"branch_weights", i32 1815773828, i32 331709820}
+; UR8-EUR: !7 = !{!"branch_weights", i32 1554520665, i32 592962983}
+
 ; ------------------------------------------------------------------------------
 ; Check -unroll-count=10.
 ;
diff --git a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial-unconditional-latch.ll b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial-unconditional-latch.ll
index 09b2097d13582..a87c9390ee780 100644
--- a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial-unconditional-latch.ll
+++ b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial-unconditional-latch.ll
@@ -133,6 +133,38 @@
 ;     MULT4: !1 = distinct !{!1, !2, !3}
 ;     MULT4: !2 = !{!"llvm.loop.estimated_trip_count", i32 3}
 ;     MULT4: !3 = !{!"llvm.loop.unroll.disable"}
+;
+;   -unroll-count=6, so there are 3 remaining conditional latches, the lowest
+;   number where the implementation cannot compute uniform weights using a
+;   simple formula.
+;
+;     RUN: %{ur-bf} -unroll-count=6 | %{fc} MULT6
+;
+;     Multiply by 2 and sum to get the original loop body frequency, 10.
+;     MULT6: - do.body: float = 2.1956,
+;     MULT6: - do.body.2: float = 1.476,
+;     MULT6: - do.body.4: float = 1.3284,
+;
+;     MULT6:       call void @f
+;     MULT6-NOT:   br
+;     MULT6:       call void @f
+;     MULT6:       br i1 %{{.*}}, label %do.body.2, label %do.end, !prof !0
+;     MULT6:       call void @f
+;     MULT6-NOT:   br
+;     MULT6:       call void @f
+;     MULT6:       br i1 %{{.*}}, label %do.body.4, label %do.end, !prof !1
+;     MULT6:       call void @f
+;     MULT6-NOT:   br
+;     MULT6:       call void @f
+;     MULT6:       br i1 %{{.*}}, label %do.body, label %do.end, !prof !1, !llvm.loop !2
+;
+;     There are 3 conditional latches remaining, so it adjusts the first and
+;     leaves the other two with the original loop's branch weights.
+;     MULT6: !0 = !{!"branch_weights", i32 1443686486, i32 703797162}
+;     MULT6: !1 = !{!"branch_weights", i32 9, i32 1}
+;     MULT6: !2 = distinct !{!2, !3, !4}
+;     MULT6: !3 = !{!"llvm.loop.estimated_trip_count", i32 2}
+;     MULT6: !4 = !{!"llvm.loop.unroll.disable"}
 
 ; ------------------------------------------------------------------------------
 ; Check case when the original loop's number of iterations is a run-time
@@ -193,6 +225,40 @@
 ;   LOW4: !1 = distinct !{!1, !2, !3}
 ;   LOW4: !2 = !{!"llvm.loop.estimated_trip_count", i32 1}
 ;   LOW4: !3 = !{!"llvm.loop.unroll.disable"}
+;
+; -unroll-count=6, so there are 3 remaining conditional latches.  The
+; implementation cannot compute uniform weights using a simple formula, and
+; ultimately it must set all those latches' probabilities to zero.  The
+; implementation will face a new stumbling block starting at the second latch:
+; reaching the remaining iterations already has a zero probability due to the
+; zero probability set at the first latch, so the required probability could
+; accidentally be computed as negative infinity.
+;
+;   RUN: %{ur-bf} -unroll-count=6 | %{fc} LOW6
+;
+;   Multiply by 2 and sum, but the result is greater than the original loop body
+;   frequency, 1, which is impossibly low.
+;   LOW6: - do.body: float = 1.0,
+;   LOW6: - do.body.2: float = 0.0{{(0000[0-9]*)?}},
+;   LOW6: - do.body.4: float = 0.0{{(0000[0-9]*)?}},
+;
+;   LOW6:     call void @f
+;   LOW6-NOT: br
+;   LOW6:     call void @f
+;   LOW6:     br i1 %{{.*}}, label %do.body.2, label %do.end, !prof !0
+;   LOW6:     call void @f
+;   LOW6-NOT: br
+;   LOW6:     call void @f
+;   LOW6:     br i1 %{{.*}}, label %do.body.4, label %do.end, !prof !0
+;   LOW6:     call void @f
+;   LOW6-NOT: br
+;   LOW6:     call void @f
+;   LOW6:     br i1 %{{.*}}, label %do.body, label %do.end, !prof !0, !llvm.loop !1
+;
+;   LOW6: !0 = !{!"branch_weights", i32 0, i32 -2147483648}
+;   LOW6: !1 = distinct !{!1, !2, !3}
+;   LOW6: !2 = !{!"llvm.loop.estimated_trip_count", i32 1}
+;   LOW6: !3 = !{!"llvm.loop.unroll.disable"}
 
 ; ------------------------------------------------------------------------------
 ; Check cases when the original loop's number of iterations is a constant 10 and