[llvm-branch-commits] [llvm] [LoopUnroll] Fix freqs for unconditional latches: N>2, fast (PR #182404)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Feb 19 15:52:39 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Joel E. Denny (jdenny-ornl)
<details>
<summary>Changes</summary>
This patch extends PR #179520 to the N > 2 case, where N is the number of remaining conditional latches. Its strategy is to apply the original loop's probability to all N latches and then, as needed, adjust as few of them as possible.
---
Patch is 36.05 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/182404.diff
4 Files Affected:
- (modified) llvm/lib/Transforms/Utils/LoopUnroll.cpp (+125-4)
- (modified) llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-complete.ll (+480)
- (modified) llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll (+68)
- (modified) llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial-unconditional-latch.ll (+66)
``````````diff
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 529cbd3f5b5da..404e254c8a66f 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -492,9 +492,10 @@ static bool canHaveUnrollRemainder(const Loop *L) {
// original loop iterations.
//
// There are often many sets of latch probabilities that can produce the
-// original total loop body frequency. For now, this function computes uniform
-// probabilities when the number of remaining conditional latches is <= 2 and
-// does not handle other cases.
+// original total loop body frequency. If there are many remaining conditional
+// latches, this function just quickly hacks a few of their probabilities to
+// restore the original total loop body frequency. Otherwise, it determines
+// less arbitrary probabilities.
static void fixProbContradiction(UnrollLoopOptions ULO,
BranchProbability OriginalLoopProb,
bool CompletelyUnroll,
@@ -557,6 +558,13 @@ static void fixProbContradiction(UnrollLoopOptions ULO,
// FreqDesired is the frequency implied by the original loop probability.
double FreqDesired = 1 / (1 - OriginalLoopProb.toDouble());
+ // Get the probability at CondLatches[I].
+ auto GetProb = [&](unsigned I) {
+ BranchInst *B = cast<BranchInst>(CondLatches[I]->getTerminator());
+ bool FirstTargetIsNext = B->getSuccessor(0) == CondLatchNexts[I];
+ return getBranchProbability(B, FirstTargetIsNext).toDouble();
+ };
+
// Set the probability at CondLatches[I] to Prob.
auto SetProb = [&](unsigned I, double Prob) {
BranchInst *B = cast<BranchInst>(CondLatches[I]->getTerminator());
@@ -597,6 +605,12 @@ static void fixProbContradiction(UnrollLoopOptions ULO,
// - For n <= 2, we can use simple formulas to solve the above polynomial
// equations exactly for p without performing a search.
+ // When iterating for a solution, we stop early if we find probabilities
+ // that produce a Freq whose difference from FreqDesired is small
+ // (FreqPrec). Otherwise, we expect to compute a solution at least that
+ // accurate (but surely far more accurate).
+ const double FreqPrec = 1e-6;
+
// Compute the probability that, used at CondLatches[0] where
// CondLatches.size() == 1, gets as close as possible to FreqDesired.
auto ComputeProbForLinear = [&]() {
@@ -624,13 +638,120 @@ static void fixProbContradiction(UnrollLoopOptions ULO,
return Prob;
};
+ // Compute the probability required at CondLatches[ComputeIdx] to get as close
+ // as possible to FreqDesired without replacing probabilities elsewhere in
+ // CondLatches. Return {Prob, Freq} where 0 <= Prob <= 1 and Freq is the new
+ // frequency.
+ auto ComputeProb = [&](unsigned ComputeIdx) -> std::pair<double, double> {
+ assert(ComputeIdx < CondLatches.size());
+
+ // Accumulate the frequency from before ComputeIdx into FreqBeforeCompute,
+ // and accumulate the rest in Freq without yet multiplying the latter by any
+ // probability for ComputeIdx (i.e., treat it as 1 for now).
+ double ProbReaching = 1; // p^0
+ double Freq = IterCounts[0]; // c_0*p^0
+ double FreqBeforeCompute;
+ for (unsigned I = 0, E = CondLatches.size(); I < E; ++I) {
+ // Get the branch probability for CondLatches[I].
+ double Prob;
+ if (I == ComputeIdx) {
+ FreqBeforeCompute = Freq;
+ Freq = 0;
+ Prob = 1;
+ } else {
+ Prob = GetProb(I);
+ }
+ ProbReaching *= Prob; // p^(I+1)
+ Freq += IterCounts[I + 1] * ProbReaching; // c_(I+1)*p^(I+1)
+ }
+
+ // Compute the required probability, and limit it to a valid probability (0
+ // <= p <= 1). See the Freq formula below for how to derive the ProbCompute
+ // formula.
+ double ProbReachingBackedge = CompletelyUnroll ? 0 : ProbReaching;
+ double ProbComputeNumerator = FreqDesired - FreqBeforeCompute;
+ double ProbComputeDenominator = Freq + FreqDesired * ProbReachingBackedge;
+ double ProbCompute;
+ if (ProbComputeNumerator <= 0) {
+ // FreqBeforeCompute has already reached or surpassed FreqDesired, so add
+ // no more frequency. It is possible that ProbComputeDenominator == 0
+ // here because some latch probability (maybe the original) was set to
+ // zero, so this check avoids setting ProbCompute=1 (in the else if below)
+ // and division by zero where the numerator <= 0 (in the else below).
+ ProbCompute = 0;
+ } else if (ProbComputeDenominator == 0) {
+ // Analytically, this case seems impossible. It would occur if either:
+ // - Both Freq and FreqDesired are zero. But the latter would cause
+ // ProbComputeNumerator < 0, which we catch above, and FreqDesired
+ // should always be >= 1 anyway.
+ // - There are no iterations after CondLatches[ComputeIdx], not even via
+ // a backedge, so that both Freq and ProbReachingBackedge are zero.
+ // But iterations should exist after even the last conditional latch.
+ // - Some latch probability (maybe the original) was set to zero so that
+ // both Freq and ProbReachingBackedge are zero. But that should not
+ // have happened because, according to the above ProbComputeNumerator
+ // check, we have not yet reached FreqDesired (which, if the original
+ // latch probability is zero, is just 1 and thus always reached or
+ // surpassed).
+ //
+ // Numerically, perhaps this case is possible. We interpret it to mean we
+ // need more frequency (ProbComputeNumerator > 0) but have no way to get
+ // any (ProbComputeDenominator is analytically nonzero but computes as 0 in
+ // floating point), suggesting infinite probability is needed,
+ // but 1 is the maximum valid probability and thus the best we can do.
+ //
+ // TODO: Cover this case in the test suite if you can.
+ ProbCompute = 1;
+ } else {
+ ProbCompute = ProbComputeNumerator / ProbComputeDenominator;
+ ProbCompute = std::max(ProbCompute, 0.);
+ ProbCompute = std::min(ProbCompute, 1.);
+ }
+
+ // Compute the resulting total frequency.
+ if (ProbReachingBackedge * ProbCompute == 1) {
+ // Analytically, this case seems impossible. It requires that there is a
+ // backedge and that FreqDesired == infinity so that every conditional
+ // latch's probability had to be set to 1. But FreqDesired == infinity
+ // means OriginalLoopProb.isOne(), which we guarded against earlier.
+ //
+ // Numerically, perhaps this case is possible. We interpret it to mean
+ // that analytically the probability has to be so near 1 that, in floating
+ // point, the frequency is computed as infinite.
+ //
+ // TODO: Cover this case in the test suite if you can.
+ Freq = std::numeric_limits<double>::infinity();
+ } else {
+ assert(FreqBeforeCompute > 0 &&
+ "Expected at least one iteration before first latch");
+ // In this equation, if we replace the left-hand side with FreqDesired and
+ // then solve for ProbCompute, we get the ProbCompute formula above.
+ Freq = (FreqBeforeCompute + Freq * ProbCompute) /
+ (1 - ProbReachingBackedge * ProbCompute);
+ }
+ return {ProbCompute, Freq};
+ };
+
// Determine and set branch weights.
if (CondLatches.size() == 1) {
SetAllProbs(ComputeProbForLinear());
} else if (CondLatches.size() == 2) {
SetAllProbs(ComputeProbForQuadratic());
} else {
- // FIXME: Handle CondLatches.size() > 2.
+ // The polynomial is too complex for a simple formula, so the quick and
+ // dirty fix has been selected. Adjust probabilities starting from the
+ // first latch, which has the most influence on the total frequency, so
+ // starting there should minimize the number of latches that have to be
+ // visited. We do have to iterate because the first latch alone might not
+ // be enough. For example, we might need to set all probabilities to 1 if
+ // the frequency is the unroll factor.
+ for (unsigned I = 0; I != CondLatches.size(); ++I) {
+ double Prob, Freq;
+ std::tie(Prob, Freq) = ComputeProb(I);
+ SetProb(I, Prob);
+ if (fabs(Freq - FreqDesired) < FreqPrec)
+ break;
+ }
}
// FIXME: We have not considered non-latch loop exits:
diff --git a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-complete.ll b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-complete.ll
index 3d87ee185b554..353e74be9fbd1 100644
--- a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-complete.ll
+++ b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-complete.ll
@@ -491,6 +491,486 @@
; UR313x: br label %do.end
; UR313x: !0 = !{!"branch_weights", i32 -2147483648, i32 0}
+; ------------------------------------------------------------------------------
+; Check 4 max iterations:
+; - Unroll count of >=4 should always produce complete unrolling.
+; - That produces <=3 unrolled iteration latches. 3 is the lowest number where
+; the implementation cannot compute uniform weights using a simple formula.
+;
+; Original loop body frequency is 5 (loop weight 4), which is impossibly high.
+;
+; First use a variable iteration count so that all non-final unrolled
+; iterations' latches remain conditional.
+;
+; RUN: sed -e s/@MAX@/4/ -e s/@W@/4/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+; RUN: %{bf-fc} ORIG4510
+; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4510
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4510
+;
+; The sum of the new do.body* cannot reach the old do.body, which is
+; impossibly high.
+; ORIG4510: - do.body: float = 5.0,
+; UR4510: - do.body: float = 1.0,
+; UR4510: - do.body.1: float = 1.0,
+; UR4510: - do.body.2: float = 1.0,
+; UR4510: - do.body.3: float = 1.0,
+;
+; The probabilities are maximized to try to reach the original frequency.
+; UR4510: call void @f
+; UR4510: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+; UR4510: call void @f
+; UR4510: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !0
+; UR4510: call void @f
+; UR4510: br i1 %{{.*}}, label %do.end, label %do.body.3, !prof !0
+; UR4510: call void @f
+; UR4510: br label %do.end
+; UR4510: !0 = !{!"branch_weights", i32 0, i32 -2147483648}
+;
+; Now use a constant iteration count so that all non-final unrolled
+; iterations' latches unconditionally continue.
+;
+; RUN: sed -e s/@MAX@/4/ -e s/@W@/4/ -e s/@MIN@/4/ -e s/@I_0@/0/ %s > %t.ll
+; RUN: %{bf-fc} ORIG4540
+; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4540
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4540
+;
+; The new do.body contains 4 of the original loop's iterations, so multiply
+; it by 4, which is less than the old do.body, which is impossibly high.
+; ORIG4540: - do.body: float = 5.0,
+; UR4540: - do.body: float = 1.0,
+;
+; UR4540: call void @f
+; UR4540-NOT: br
+; UR4540: call void @f
+; UR4540-NOT: br
+; UR4540: call void @f
+; UR4540-NOT: br
+; UR4540: call void @f
+; UR4540: ret void
+;
+; Use a constant iteration count but now the loop upper bound computation can
+; overflow. When it does, the loop induction variable is greater than it
+; immediately, so the initial unrolled iteration's latch remains conditional.
+;
+; RUN: sed -e s/@MAX@/4/ -e s/@W@/4/ -e s/@MIN@/4/ -e s/@I_0@/%x/ %s > %t.ll
+; RUN: %{bf-fc} ORIG454x
+; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR454x
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR454x
+;
+; The new do.body.1 contains 3 of the original loop's iterations, so
+; multiply it by 3, and add the new do.body, but that sum is less than the
+; old do.body, which is impossibly high.
+; ORIG454x: - do.body: float = 5.0,
+; UR454x: - do.body: float = 1.0,
+; UR454x: - do.body.1: float = 1.0,
+;
+; The sole probability is maximized to try to reach the original frequency.
+; UR454x: call void @f
+; UR454x: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+; UR454x: call void @f
+; UR454x-NOT: br
+; UR454x: call void @f
+; UR454x-NOT: br
+; UR454x: call void @f
+; UR454x: br label %do.end
+; UR454x: !0 = !{!"branch_weights", i32 0, i32 -2147483648}
+;
+; Original loop body frequency is 4 (loop weight 3).
+;
+; First use a variable iteration count so that all non-final unrolled
+; iterations' latches remain conditional.
+;
+; RUN: sed -e s/@MAX@/4/ -e s/@W@/3/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+; RUN: %{bf-fc} ORIG4410
+; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4410
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4410
+;
+; The sum of the new do.body* is the old do.body.
+; ORIG4410: - do.body: float = 4.0,
+; UR4410: - do.body: float = 1.0,
+; UR4410: - do.body.1: float = 1.0,
+; UR4410: - do.body.2: float = 1.0,
+; UR4410: - do.body.3: float = 1.0,
+;
+; UR4410: call void @f
+; UR4410: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+; UR4410: call void @f
+; UR4410: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !0
+; UR4410: call void @f
+; UR4410: br i1 %{{.*}}, label %do.end, label %do.body.3, !prof !0
+; UR4410: call void @f
+; UR4410: br label %do.end
+; UR4410: !0 = !{!"branch_weights", i32 0, i32 -2147483648}
+;
+; Now use a constant iteration count so that all non-final unrolled
+; iterations' latches unconditionally continue.
+;
+; RUN: sed -e s/@MAX@/4/ -e s/@W@/3/ -e s/@MIN@/4/ -e s/@I_0@/0/ %s > %t.ll
+; RUN: %{bf-fc} ORIG4440
+; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4440
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4440
+;
+; The new do.body contains 4 of the original loop's iterations, so multiply
+; it by 4 to get the old do.body.
+; ORIG4440: - do.body: float = 4.0,
+; UR4440: - do.body: float = 1.0,
+;
+; UR4440: call void @f
+; UR4440-NOT: br
+; UR4440: call void @f
+; UR4440-NOT: br
+; UR4440: call void @f
+; UR4440-NOT: br
+; UR4440: call void @f
+; UR4440: ret void
+;
+; Use a constant iteration count but now the loop upper bound computation can
+; overflow. When it does, the loop induction variable is greater than it
+; immediately, so the initial unrolled iteration's latch remains conditional.
+;
+; RUN: sed -e s/@MAX@/4/ -e s/@W@/3/ -e s/@MIN@/4/ -e s/@I_0@/%x/ %s > %t.ll
+; RUN: %{bf-fc} ORIG444x
+; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR444x
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR444x
+;
+; The new do.body.1 contains 3 of the original loop's iterations, so
+; multiply it by 3, and add the new do.body to get the old do.body.
+; ORIG444x: - do.body: float = 4.0,
+; UR444x: - do.body: float = 1.0,
+; UR444x: - do.body.1: float = 1.0,
+;
+; UR444x: call void @f
+; UR444x: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+; UR444x: call void @f
+; UR444x-NOT: br
+; UR444x: call void @f
+; UR444x-NOT: br
+; UR444x: call void @f
+; UR444x: br label %do.end
+; UR444x: !0 = !{!"branch_weights", i32 0, i32 -2147483648}
+;
+; Original loop body frequency is 3 (loop weight 2). This is our first case
+; where the new probabilities vary.
+;
+; First use a variable iteration count so that all non-final unrolled
+; iterations' latches remain conditional.
+;
+; RUN: sed -e s/@MAX@/4/ -e s/@W@/2/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+; RUN: %{bf-fc} ORIG4310
+; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4310
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4310
+;
+; The sum of the new do.body* is always approximately the old do.body.
+; ORIG4310: - do.body: float = 3.0,
+; UR4310: - do.body: float = 1.0,
+; UR4310: - do.body.1: float = 0.94737,
+; UR4310: - do.body.2: float = 0.63158,
+; UR4310: - do.body.3: float = 0.42105,
+;
+; UR4310: call void @f
+; UR4310: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+; UR4310: call void @f
+; UR4310: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !1
+; UR4310: call void @f
+; UR4310: br i1 %{{.*}}, label %do.end, label %do.body.3, !prof !1
+; UR4310: call void @f
+; UR4310: br label %do.end
+; UR4310: !0 = !{!"branch_weights", i32 113025456, i32 2034458192}
+; UR4310: !1 = !{!"branch_weights", i32 1, i32 2}
+;
+; Now use a constant iteration count so that all non-final unrolled
+; iterations' latches unconditionally continue.
+;
+; RUN: sed -e s/@MAX@/4/ -e s/@W@/2/ -e s/@MIN@/4/ -e s/@I_0@/0/ %s > %t.ll
+; RUN: %{bf-fc} ORIG4340
+; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4340
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4340
+;
+; The new do.body contains 4 of the original loop's iterations, so multiply
+; it by 4, which is greater than the old do.body, which is impossibly low.
+; ORIG4340: - do.body: float = 3.0,
+; UR4340: - do.body: float = 1.0,
+;
+; UR4340: call void @f
+; UR4340-NOT: br
+; UR4340: call void @f
+; UR4340-NOT: br
+; UR4340: call void @f
+; UR4340-NOT: br
+; UR4340: call void @f
+; UR4340: ret void
+;
+; Use a constant iteration count but now the loop upper bound computation can
+; overflow. When it does, the loop induction variable is greater than it
+; immediately, so the initial unrolled iteration's latch remains conditional.
+;
+; RUN: sed -e s/@MAX@/4/ -e s/@W@/2/ -e s/@MIN@/4/ -e s/@I_0@/%x/ %s > %t.ll
+; RUN: %{bf-fc} ORIG434x
+; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR434x
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR434x
+;
+; The new do.body.1 contains 3 of the original loop's iterations, so
+; multiply it by 3, and add the new do.body to get the old do.body.
+; ORIG434x: - do.body: float = 3.0,
+; UR434x: - do.body: float = 1.0,
+; UR434x: - do.body.1: float = 0.66667,
+;
+; UR434x: call void @f
+; UR434x: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+; UR434x: call void @f
+; UR434x-NOT: br
+; UR434x: call void @f
+; UR434x-NOT: br
+; UR434x: call void @f
+; UR434x: br label %do.end
+; UR434x: !0 = !{!"branch_weights", i32 715827884, i32 1431655764}
+;
+; Original loop body frequency is 2 (loop weight 1).
+;
+; First use a variable iteration count so that all non-final unrolled
+; iterations' latches remain conditional.
+;
+; RUN: sed -e s/@MAX@/4/ -e s/@W@/1/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+; RUN: %{bf-fc} ORIG4210
+; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4210
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4210
+;
+; The sum of the new do.body* is always the old do.body.
+; ORIG4210: - do.body: float = 2.0,
+; UR4210: - do.body: float = 1.0,
+; UR4210: - do.body.1: float = 0.57143,
+; UR4210: - do.body.2: float = 0.28571,
+; UR4210: - do.body.3: float = 0.14286,
+;
+; UR4210: call void @f
+; UR4210: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+; UR4210: call void @f
+; UR4210: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !1
+; UR4210: call void @f
+; UR4210: br i1 %{{.*}}, label %do.end, label %do.body.3, !prof !1
+; UR4210: call void @f
+; UR4210: br label %do.end
+; UR4210: !0 = !{!"branch_weights", i32 920350135, i32 1227133513}
+; UR4210: !1 = !{!"branch_weights", i32 1, i32 1}
+;
+; Now use a constant iteration count so that all non-final unrolled
+; iterations' latches unconditionally continue.
+;
+; RUN: sed -e s/@MAX@/4/ -e s/@W@/1/ -e s/@MIN@/4/ -e s/@I_0@/0/ %s > %t.ll
+; RUN: %{bf-fc} ORIG4240
+; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4240
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4240
+;
+; The new do.body contains 4 of the original loop's iterations, so multiply
+; it by 4, which is greater than the old do.body, which is impossibly low.
+; ORIG4240: - do.body: float = 2.0,
+; ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/182404
More information about the llvm-branch-commits
mailing list