[llvm-branch-commits] [llvm] [LoopUnroll] Fix freqs for unconditional latches: N>2, fast (PR #182404)

Thu Feb 19 15:52:39 PST 2026

llvmbot wrote:




@llvm/pr-subscribers-llvm-transforms

Author: Joel E. Denny (jdenny-ornl)

<details>
<summary>Changes</summary>

This patch extends PR #179520 to the N > 2 case, where N is the number of remaining conditional latches.  Its strategy is to apply the original loop's probability to all N latches and then, as needed, adjust as few of them as possible.

---

Patch is 36.05 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/182404.diff


4 Files Affected:

- (modified) llvm/lib/Transforms/Utils/LoopUnroll.cpp (+125-4) 
- (modified) llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-complete.ll (+480) 
- (modified) llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll (+68) 
- (modified) llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial-unconditional-latch.ll (+66) 


``````````diff

diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 529cbd3f5b5da..404e254c8a66f 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -492,9 +492,10 @@ static bool canHaveUnrollRemainder(const Loop *L) {
 // original loop iterations.
 //
 // There are often many sets of latch probabilities that can produce the
-// original total loop body frequency.  For now, this function computes uniform
-// probabilities when the number of remaining conditional latches is <= 2 and
-// does not handle other cases.
+// original total loop body frequency.  If there are many remaining conditional
+// latches, this function just quickly hacks a few of their probabilities to
+// restore the original total loop body frequency.  Otherwise, it determines
+// less arbitrary probabilities.
 static void fixProbContradiction(UnrollLoopOptions ULO,
                                  BranchProbability OriginalLoopProb,
                                  bool CompletelyUnroll,
@@ -557,6 +558,13 @@ static void fixProbContradiction(UnrollLoopOptions ULO,
   // FreqDesired is the frequency implied by the original loop probability.
   double FreqDesired = 1 / (1 - OriginalLoopProb.toDouble());
 
+  // Get the probability at CondLatches[I].
+  auto GetProb = [&](unsigned I) {
+    BranchInst *B = cast<BranchInst>(CondLatches[I]->getTerminator());
+    bool FirstTargetIsNext = B->getSuccessor(0) == CondLatchNexts[I];
+    return getBranchProbability(B, FirstTargetIsNext).toDouble();
+  };
+
   // Set the probability at CondLatches[I] to Prob.
   auto SetProb = [&](unsigned I, double Prob) {
     BranchInst *B = cast<BranchInst>(CondLatches[I]->getTerminator());
@@ -597,6 +605,12 @@ static void fixProbContradiction(UnrollLoopOptions ULO,
   // - For n <= 2, we can use simple formulas to solve the above polynomial
   //   equations exactly for p without performing a search.
 
+  // When iterating for a solution, we stop early if we find probabilities
+  // that produce a Freq whose difference from FreqDesired is small
+  // (FreqPrec).  Otherwise, we expect to compute a solution at least that
+  // accurate (but surely far more accurate).
+  const double FreqPrec = 1e-6;
+
   // Compute the probability that, used at CondLatches[0] where
   // CondLatches.size() == 1, gets as close as possible to FreqDesired.
   auto ComputeProbForLinear = [&]() {
@@ -624,13 +638,120 @@ static void fixProbContradiction(UnrollLoopOptions ULO,
     return Prob;
   };
 
+  // Compute the probability required at CondLatches[ComputeIdx] to get as close
+  // as possible to FreqDesired without replacing probabilities elsewhere in
+  // CondLatches.  Return {Prob, Freq} where 0 <= Prob <= 1 and Freq is the new
+  // frequency.
+  auto ComputeProb = [&](unsigned ComputeIdx) -> std::pair<double, double> {
+    assert(ComputeIdx < CondLatches.size());
+
+    // Accumulate the frequency from before ComputeIdx into FreqBeforeCompute,
+    // and accumulate the rest in Freq without yet multiplying the latter by any
+    // probability for ComputeIdx (i.e., treat it as 1 for now).
+    double ProbReaching = 1;     // p^0
+    double Freq = IterCounts[0]; // c_0*p^0
+    double FreqBeforeCompute;
+    for (unsigned I = 0, E = CondLatches.size(); I < E; ++I) {
+      // Get the branch probability for CondLatches[I].
+      double Prob;
+      if (I == ComputeIdx) {
+        FreqBeforeCompute = Freq;
+        Freq = 0;
+        Prob = 1;
+      } else {
+        Prob = GetProb(I);
+      }
+      ProbReaching *= Prob;                     // p^(I+1)
+      Freq += IterCounts[I + 1] * ProbReaching; // c_(I+1)*p^(I+1)
+    }
+
+    // Compute the required probability, and limit it to a valid probability (0
+    // <= p <= 1).  See the Freq formula below for how to derive the ProbCompute
+    // formula.
+    double ProbReachingBackedge = CompletelyUnroll ? 0 : ProbReaching;
+    double ProbComputeNumerator = FreqDesired - FreqBeforeCompute;
+    double ProbComputeDenominator = Freq + FreqDesired * ProbReachingBackedge;
+    double ProbCompute;
+    if (ProbComputeNumerator <= 0) {
+      // FreqBeforeCompute has already reached or surpassed FreqDesired, so add
+      // no more frequency.  It is possible that ProbComputeDenominator == 0
+      // here because some latch probability (maybe the original) was set to
+      // zero, so this check avoids setting ProbCompute=1 (in the else if below)
+      // and division by zero where the numerator <= 0 (in the else below).
+      ProbCompute = 0;
+    } else if (ProbComputeDenominator == 0) {
+      // Analytically, this case seems impossible.  It would occur if either:
+      // - Both Freq and FreqDesired are zero.  But the latter would cause
+      //   ProbComputeNumerator < 0, which we catch above, and FreqDesired
+      //   should always be >= 1 anyway.
+      // - There are no iterations after CondLatches[ComputeIdx], not even via
+      //   a backedge, so that both Freq and ProbReachingBackedge are zero.
+      //   But iterations should exist after even the last conditional latch.
+      // - Some latch probability (maybe the original) was set to zero so that
+      //   both Freq and ProbReachingBackedge are zero.  But that should not
+      //   have happened because, according to the above ProbComputeNumerator
+      //   check, we have not yet reached FreqDesired (which, if the original
+      //   latch probability is zero, is just 1 and thus always reached or
+      //   surpassed).
+      //
+      // Numerically, perhaps this case is possible.  We interpret it to mean we
+      // need more frequency (ProbComputeNumerator > 0) but have no way to get
+      // any (ProbComputeDenominator is analytically too small to distinguish it
+      // from 0 in floating point), suggesting infinite probability is needed,
+      // but 1 is the maximum valid probability and thus the best we can do.
+      //
+      // TODO: Cover this case in the test suite if you can.
+      ProbCompute = 1;
+    } else {
+      ProbCompute = ProbComputeNumerator / ProbComputeDenominator;
+      ProbCompute = std::max(ProbCompute, 0.);
+      ProbCompute = std::min(ProbCompute, 1.);
+    }
+
+    // Compute the resulting total frequency.
+    if (ProbReachingBackedge * ProbCompute == 1) {
+      // Analytically, this case seems impossible.  It requires that there is a
+      // backedge and that FreqDesired == infinity so that every conditional
+      // latch's probability had to be set to 1.  But FreqDesired == infinity
+      // means OriginalLoopProb.isOne(), which we guarded against earlier.
+      //
+      // Numerically, perhaps this case is possible.  We interpret it to mean
+      // that analytically the probability has to be so near 1 that, in floating
+      // point, the frequency is computed as infinite.
+      //
+      // TODO: Cover this case in the test suite if you can.
+      Freq = std::numeric_limits<double>::infinity();
+    } else {
+      assert(FreqBeforeCompute > 0 &&
+             "Expected at least one iteration before first latch");
+      // In this equation, if we replace the left-hand side with FreqDesired and
+      // then solve for ProbCompute, we get the ProbCompute formula above.
+      Freq = (FreqBeforeCompute + Freq * ProbCompute) /
+             (1 - ProbReachingBackedge * ProbCompute);
+    }
+    return {ProbCompute, Freq};
+  };
+
   // Determine and set branch weights.
   if (CondLatches.size() == 1) {
     SetAllProbs(ComputeProbForLinear());
   } else if (CondLatches.size() == 2) {
     SetAllProbs(ComputeProbForQuadratic());
   } else {
-    // FIXME: Handle CondLatches.size() > 2.
+    // The polynomial is too complex for a simple formula, so the quick and
+    // dirty fix has been selected.  Adjust probabilities starting from the
+    // first latch, which has the most influence on the total frequency, so
+    // starting there should minimize the number of latches that have to be
+    // visited.  We do have to iterate because the first latch alone might not
+    // be enough.  For example, we might need to set all probabilities to 1 if
+    // the frequency is the unroll factor.
+    for (unsigned I = 0; I != CondLatches.size(); ++I) {
+      double Prob, Freq;
+      std::tie(Prob, Freq) = ComputeProb(I);
+      SetProb(I, Prob);
+      if (fabs(Freq - FreqDesired) < FreqPrec)
+        break;
+    }
   }
 
   // FIXME: We have not considered non-latch loop exits:
diff --git a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-complete.ll b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-complete.ll
index 3d87ee185b554..353e74be9fbd1 100644
--- a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-complete.ll
+++ b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-complete.ll
@@ -491,6 +491,486 @@
 ;     UR313x:     br label %do.end
 ;     UR313x:     !0 = !{!"branch_weights", i32 -2147483648, i32 0}
 
+; ------------------------------------------------------------------------------
+; Check 4 max iterations:
+; - Unroll count of >=4 should always produce complete unrolling.
+; - That produces <=3 unrolled iteration latches.  3 is the lowest number where
+;   the implementation cannot compute uniform weights using a simple formula.
+;
+; Original loop body frequency is 5 (loop weight 4), which is impossibly high.
+;
+;   First use a variable iteration count so that all non-final unrolled
+;   iterations' latches remain conditional.
+;
+;     RUN: sed -e s/@MAX@/4/ -e s/@W@/4/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG4510
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4510
+;     RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4510
+;
+;     The sum of the new do.body* cannot reach the old do.body, which is
+;     impossibly high.
+;     ORIG4510: - do.body: float = 5.0,
+;     UR4510:   - do.body: float = 1.0,
+;     UR4510:   - do.body.1: float = 1.0,
+;     UR4510:   - do.body.2: float = 1.0,
+;     UR4510:   - do.body.3: float = 1.0,
+;
+;     The probabilities are maximized to try to reach the original frequency.
+;     UR4510: call void @f
+;     UR4510: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;     UR4510: call void @f
+;     UR4510: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !0
+;     UR4510: call void @f
+;     UR4510: br i1 %{{.*}}, label %do.end, label %do.body.3, !prof !0
+;     UR4510: call void @f
+;     UR4510: br label %do.end
+;     UR4510: !0 = !{!"branch_weights", i32 0, i32 -2147483648}
+;
+;   Now use a constant iteration count so that all non-final unrolled
+;   iterations' latches unconditionally continue.
+;
+;     RUN: sed -e s/@MAX@/4/ -e s/@W@/4/ -e s/@MIN@/4/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG4540
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4540
+;     RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4540
+;
+;     The new do.body contains 4 of the original loop's iterations, so multiply
+;     it by 4, which is less than the old do.body, which is impossibly high.
+;     ORIG4540: - do.body: float = 5.0,
+;     UR4540:   - do.body: float = 1.0,
+;
+;     UR4540:     call void @f
+;     UR4540-NOT: br
+;     UR4540:     call void @f
+;     UR4540-NOT: br
+;     UR4540:     call void @f
+;     UR4540-NOT: br
+;     UR4540:     call void @f
+;     UR4540:     ret void
+;
+;   Use a constant iteration count but now the loop upper bound computation can
+;   overflow.  When it does, the induction variable immediately exceeds that
+;   bound, so the initial unrolled iteration's latch remains conditional.
+;
+;     RUN: sed -e s/@MAX@/4/ -e s/@W@/4/ -e s/@MIN@/4/ -e s/@I_0@/%x/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG454x
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR454x
+;     RUN: %{ur-bf} -unroll-count=5 | %{fc} UR454x
+;
+;     The new do.body.1 contains 3 of the original loop's iterations, so
+;     multiply it by 3, and add the new do.body, but that sum is less than the
+;     old do.body, which is impossibly high.
+;     ORIG454x: - do.body: float = 5.0,
+;     UR454x:   - do.body: float = 1.0,
+;     UR454x:   - do.body.1: float = 1.0,
+;
+;     The sole probability is maximized to try to reach the original frequency.
+;     UR454x:     call void @f
+;     UR454x:     br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;     UR454x:     call void @f
+;     UR454x-NOT: br
+;     UR454x:     call void @f
+;     UR454x-NOT: br
+;     UR454x:     call void @f
+;     UR454x:     br label %do.end
+;     UR454x:     !0 = !{!"branch_weights", i32 0, i32 -2147483648}
+;
+; Original loop body frequency is 4 (loop weight 3).
+;
+;   First use a variable iteration count so that all non-final unrolled
+;   iterations' latches remain conditional.
+;
+;     RUN: sed -e s/@MAX@/4/ -e s/@W@/3/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG4410
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4410
+;     RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4410
+;
+;     The sum of the new do.body* is the old do.body.
+;     ORIG4410: - do.body: float = 4.0,
+;     UR4410:   - do.body: float = 1.0,
+;     UR4410:   - do.body.1: float = 1.0,
+;     UR4410:   - do.body.2: float = 1.0,
+;     UR4410:   - do.body.3: float = 1.0,
+;
+;     UR4410: call void @f
+;     UR4410: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;     UR4410: call void @f
+;     UR4410: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !0
+;     UR4410: call void @f
+;     UR4410: br i1 %{{.*}}, label %do.end, label %do.body.3, !prof !0
+;     UR4410: call void @f
+;     UR4410: br label %do.end
+;     UR4410: !0 = !{!"branch_weights", i32 0, i32 -2147483648}
+;
+;   Now use a constant iteration count so that all non-final unrolled
+;   iterations' latches unconditionally continue.
+;
+;     RUN: sed -e s/@MAX@/4/ -e s/@W@/3/ -e s/@MIN@/4/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG4440
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4440
+;     RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4440
+;
+;     The new do.body contains 4 of the original loop's iterations, so multiply
+;     it by 4 to get the old do.body.
+;     ORIG4440: - do.body: float = 4.0,
+;     UR4440:   - do.body: float = 1.0,
+;
+;     UR4440:     call void @f
+;     UR4440-NOT: br
+;     UR4440:     call void @f
+;     UR4440-NOT: br
+;     UR4440:     call void @f
+;     UR4440-NOT: br
+;     UR4440:     call void @f
+;     UR4440:     ret void
+;
+;   Use a constant iteration count but now the loop upper bound computation can
+;   overflow.  When it does, the induction variable immediately exceeds that
+;   bound, so the initial unrolled iteration's latch remains conditional.
+;
+;     RUN: sed -e s/@MAX@/4/ -e s/@W@/3/ -e s/@MIN@/4/ -e s/@I_0@/%x/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG444x
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR444x
+;     RUN: %{ur-bf} -unroll-count=5 | %{fc} UR444x
+;
+;     The new do.body.1 contains 3 of the original loop's iterations, so
+;     multiply it by 3, and add the new do.body to get the old do.body.
+;     ORIG444x: - do.body: float = 4.0,
+;     UR444x:   - do.body: float = 1.0,
+;     UR444x:   - do.body.1: float = 1.0,
+;
+;     UR444x:     call void @f
+;     UR444x:     br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;     UR444x:     call void @f
+;     UR444x-NOT: br
+;     UR444x:     call void @f
+;     UR444x-NOT: br
+;     UR444x:     call void @f
+;     UR444x:     br label %do.end
+;     UR444x:     !0 = !{!"branch_weights", i32 0, i32 -2147483648}
+;
+; Original loop body frequency is 3 (loop weight 2).  This is our first case
+; where the new probabilities vary.
+;
+;   First use a variable iteration count so that all non-final unrolled
+;   iterations' latches remain conditional.
+;
+;     RUN: sed -e s/@MAX@/4/ -e s/@W@/2/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG4310
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4310
+;     RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4310
+;
+;     The sum of the new do.body* is always approximately the old do.body.
+;     ORIG4310: - do.body: float = 3.0,
+;     UR4310: - do.body: float = 1.0,
+;     UR4310: - do.body.1: float = 0.94737,
+;     UR4310: - do.body.2: float = 0.63158,
+;     UR4310: - do.body.3: float = 0.42105,
+;
+;     UR4310:  call void @f
+;     UR4310:  br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;     UR4310:  call void @f
+;     UR4310:  br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !1
+;     UR4310:  call void @f
+;     UR4310:  br i1 %{{.*}}, label %do.end, label %do.body.3, !prof !1
+;     UR4310:  call void @f
+;     UR4310:  br label %do.end
+;     UR4310:  !0 = !{!"branch_weights", i32 113025456, i32 2034458192}
+;     UR4310:  !1 = !{!"branch_weights", i32 1, i32 2}
+;
+;   Now use a constant iteration count so that all non-final unrolled
+;   iterations' latches unconditionally continue.
+;
+;     RUN: sed -e s/@MAX@/4/ -e s/@W@/2/ -e s/@MIN@/4/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG4340
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4340
+;     RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4340
+;
+;     The new do.body contains 4 of the original loop's iterations, so multiply
+;     it by 4, which is greater than the old do.body, which is impossibly low.
+;     ORIG4340: - do.body: float = 3.0,
+;     UR4340:   - do.body: float = 1.0,
+;
+;     UR4340:     call void @f
+;     UR4340-NOT: br
+;     UR4340:     call void @f
+;     UR4340-NOT: br
+;     UR4340:     call void @f
+;     UR4340-NOT: br
+;     UR4340:     call void @f
+;     UR4340:     ret void
+;
+;   Use a constant iteration count but now the loop upper bound computation can
+;   overflow.  When it does, the induction variable immediately exceeds that
+;   bound, so the initial unrolled iteration's latch remains conditional.
+;
+;     RUN: sed -e s/@MAX@/4/ -e s/@W@/2/ -e s/@MIN@/4/ -e s/@I_0@/%x/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG434x
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR434x
+;     RUN: %{ur-bf} -unroll-count=5 | %{fc} UR434x
+;
+;     The new do.body.1 contains 3 of the original loop's iterations, so
+;     multiply it by 3, and add the new do.body to get the old do.body.
+;     ORIG434x: - do.body: float = 3.0,
+;     UR434x:   - do.body: float = 1.0,
+;     UR434x:   - do.body.1: float = 0.66667,
+;
+;     UR434x:     call void @f
+;     UR434x:     br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;     UR434x:     call void @f
+;     UR434x-NOT: br
+;     UR434x:     call void @f
+;     UR434x-NOT: br
+;     UR434x:     call void @f
+;     UR434x:     br label %do.end
+;     UR434x:     !0 = !{!"branch_weights", i32 715827884, i32 1431655764}
+;
+; Original loop body frequency is 2 (loop weight 1).
+;
+;   First use a variable iteration count so that all non-final unrolled
+;   iterations' latches remain conditional.
+;
+;     RUN: sed -e s/@MAX@/4/ -e s/@W@/1/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG4210
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4210
+;     RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4210
+;
+;     The sum of the new do.body* is always the old do.body.
+;     ORIG4210: - do.body: float = 2.0,
+;     UR4210: - do.body: float = 1.0,
+;     UR4210: - do.body.1: float = 0.57143,
+;     UR4210: - do.body.2: float = 0.28571,
+;     UR4210: - do.body.3: float = 0.14286,
+;
+;     UR4210: call void @f
+;     UR4210: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;     UR4210: call void @f
+;     UR4210: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !1
+;     UR4210: call void @f
+;     UR4210: br i1 %{{.*}}, label %do.end, label %do.body.3, !prof !1
+;     UR4210: call void @f
+;     UR4210: br label %do.end
+;     UR4210: !0 = !{!"branch_weights", i32 920350135, i32 1227133513}
+;     UR4210: !1 = !{!"branch_weights", i32 1, i32 1}
+;
+;   Now use a constant iteration count so that all non-final unrolled
+;   iterations' latches unconditionally continue.
+;
+;     RUN: sed -e s/@MAX@/4/ -e s/@W@/1/ -e s/@MIN@/4/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG4240
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4240
+;     RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4240
+;
+;     The new do.body contains 4 of the original loop's iterations, so multiply
+;     it by 4, which is greater than the old do.body, which is impossibly low.
+;     ORIG4240: - do.body: float = 2.0,
+;  ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/182404