[llvm-branch-commits] [llvm] [LoopUnroll] Fix freqs for unconditional latches: N>2, fast (PR #182404)
Joel E. Denny via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Feb 26 15:54:58 PST 2026
https://github.com/jdenny-ornl updated https://github.com/llvm/llvm-project/pull/182404
From 06c90370c6d76411be29c2a9dac35bd2f89a5f0f Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Thu, 19 Feb 2026 17:38:13 -0500
Subject: [PATCH] [LoopUnroll] Fix freqs for unconditional latches: N>2, fast
This patch extends PR #179520 to the N > 2 case, where N is the number
of remaining conditional latches. Its strategy is to apply the
original loop's probability to all N latches and then, as needed,
adjust as few of them as possible.
---
llvm/lib/Transforms/Utils/LoopUnroll.cpp | 129 ++++-
.../branch-weights-freq/unroll-complete.ll | 480 ++++++++++++++++++
.../branch-weights-freq/unroll-epilog.ll | 68 +++
.../unroll-partial-unconditional-latch.ll | 66 +++
4 files changed, 739 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 529cbd3f5b5da..404e254c8a66f 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -492,9 +492,10 @@ static bool canHaveUnrollRemainder(const Loop *L) {
// original loop iterations.
//
// There are often many sets of latch probabilities that can produce the
-// original total loop body frequency. For now, this function computes uniform
-// probabilities when the number of remaining conditional latches is <= 2 and
-// does not handle other cases.
+// original total loop body frequency. If there are many remaining conditional
+// latches, this function just quickly hacks a few of their probabilities to
+// restore the original total loop body frequency. Otherwise, it determines
+// less arbitrary probabilities.
static void fixProbContradiction(UnrollLoopOptions ULO,
BranchProbability OriginalLoopProb,
bool CompletelyUnroll,
@@ -557,6 +558,13 @@ static void fixProbContradiction(UnrollLoopOptions ULO,
// FreqDesired is the frequency implied by the original loop probability.
double FreqDesired = 1 / (1 - OriginalLoopProb.toDouble());
+ // Get the probability at CondLatches[I].
+ auto GetProb = [&](unsigned I) {
+ BranchInst *B = cast<BranchInst>(CondLatches[I]->getTerminator());
+ bool FirstTargetIsNext = B->getSuccessor(0) == CondLatchNexts[I];
+ return getBranchProbability(B, FirstTargetIsNext).toDouble();
+ };
+
// Set the probability at CondLatches[I] to Prob.
auto SetProb = [&](unsigned I, double Prob) {
BranchInst *B = cast<BranchInst>(CondLatches[I]->getTerminator());
@@ -597,6 +605,12 @@ static void fixProbContradiction(UnrollLoopOptions ULO,
// - For n <= 2, we can use simple formulas to solve the above polynomial
// equations exactly for p without performing a search.
+ // When iterating for a solution, we stop early if we find probabilities
+ // that produce a Freq whose difference from FreqDesired is small
+ // (FreqPrec). Otherwise, we expect to compute a solution at least that
+ // accurate (but surely far more accurate).
+ const double FreqPrec = 1e-6;
+
// Compute the probability that, used at CondLatches[0] where
// CondLatches.size() == 1, gets as close as possible to FreqDesired.
auto ComputeProbForLinear = [&]() {
@@ -624,13 +638,120 @@ static void fixProbContradiction(UnrollLoopOptions ULO,
return Prob;
};
+ // Compute the probability required at CondLatches[ComputeIdx] to get as close
+ // as possible to FreqDesired without replacing probabilities elsewhere in
+ // CondLatches. Return {Prob, Freq} where 0 <= Prob <= 1 and Freq is the new
+ // frequency.
+ auto ComputeProb = [&](unsigned ComputeIdx) -> std::pair<double, double> {
+ assert(ComputeIdx < CondLatches.size());
+
+ // Accumulate the frequency from before ComputeIdx into FreqBeforeCompute,
+ // and accumulate the rest in Freq without yet multiplying the latter by any
+ // probability for ComputeIdx (i.e., treat it as 1 for now).
+ double ProbReaching = 1; // p^0
+ double Freq = IterCounts[0]; // c_0*p^0
+ double FreqBeforeCompute;
+ for (unsigned I = 0, E = CondLatches.size(); I < E; ++I) {
+ // Get the branch probability for CondLatches[I].
+ double Prob;
+ if (I == ComputeIdx) {
+ FreqBeforeCompute = Freq;
+ Freq = 0;
+ Prob = 1;
+ } else {
+ Prob = GetProb(I);
+ }
+ ProbReaching *= Prob; // p^(I+1)
+ Freq += IterCounts[I + 1] * ProbReaching; // c_(I+1)*p^(I+1)
+ }
+
+ // Compute the required probability, and limit it to a valid probability (0
+ // <= p <= 1). See the Freq formula below for how to derive the ProbCompute
+ // formula.
+ double ProbReachingBackedge = CompletelyUnroll ? 0 : ProbReaching;
+ double ProbComputeNumerator = FreqDesired - FreqBeforeCompute;
+ double ProbComputeDenominator = Freq + FreqDesired * ProbReachingBackedge;
+ double ProbCompute;
+ if (ProbComputeNumerator <= 0) {
+ // FreqBeforeCompute has already reached or surpassed FreqDesired, so add
+ // no more frequency. It is possible that ProbComputeDenominator == 0
+ // here because some latch probability (maybe the original) was set to
+ // zero, so this check avoids setting ProbCompute=1 (in the else if below)
+ // and division by zero where the numerator <= 0 (in the else below).
+ ProbCompute = 0;
+ } else if (ProbComputeDenominator == 0) {
+ // Analytically, this case seems impossible. It would occur if either:
+ // - Both Freq and FreqDesired are zero. But the latter would cause
+ // ProbComputeNumerator < 0, which we catch above, and FreqDesired
+ // should always be >= 1 anyway.
+ // - There are no iterations after CondLatches[ComputeIdx], not even via
+ // a backedge, so that both Freq and ProbReachingBackedge are zero.
+ // But iterations should exist after even the last conditional latch.
+ // - Some latch probability (maybe the original) was set to zero so that
+ // both Freq and ProbReachingBackedge are zero. But that should not
+ // have happened because, according to the above ProbComputeNumerator
+ // check, we have not yet reached FreqDesired (which, if the original
+ // latch probability is zero, is just 1 and thus always reached or
+ // surpassed).
+ //
+ // Numerically, perhaps this case is possible. We interpret it to mean we
+ // need more frequency (ProbComputeNumerator > 0) but have no way to get
+ // any (ProbComputeDenominator is analytically nonzero but indistinguishable
+ // from 0 in floating point), suggesting infinite probability is needed,
+ // but 1 is the maximum valid probability and thus the best we can do.
+ //
+ // TODO: Cover this case in the test suite if you can.
+ ProbCompute = 1;
+ } else {
+ ProbCompute = ProbComputeNumerator / ProbComputeDenominator;
+ ProbCompute = std::max(ProbCompute, 0.);
+ ProbCompute = std::min(ProbCompute, 1.);
+ }
+
+ // Compute the resulting total frequency.
+ if (ProbReachingBackedge * ProbCompute == 1) {
+ // Analytically, this case seems impossible. It requires that there is a
+ // backedge and that FreqDesired == infinity so that every conditional
+ // latch's probability had to be set to 1. But FreqDesired == infinity
+ // means OriginalLoopProb.isOne(), which we guarded against earlier.
+ //
+ // Numerically, perhaps this case is possible. We interpret it to mean
+ // that analytically the probability has to be so near 1 that, in floating
+ // point, the frequency is computed as infinite.
+ //
+ // TODO: Cover this case in the test suite if you can.
+ Freq = std::numeric_limits<double>::infinity();
+ } else {
+ assert(FreqBeforeCompute > 0 &&
+ "Expected at least one iteration before first latch");
+ // In this equation, if we replace the left-hand side with FreqDesired and
+ // then solve for ProbCompute, we get the ProbCompute formula above.
+ Freq = (FreqBeforeCompute + Freq * ProbCompute) /
+ (1 - ProbReachingBackedge * ProbCompute);
+ }
+ return {ProbCompute, Freq};
+ };
+
// Determine and set branch weights.
if (CondLatches.size() == 1) {
SetAllProbs(ComputeProbForLinear());
} else if (CondLatches.size() == 2) {
SetAllProbs(ComputeProbForQuadratic());
} else {
- // FIXME: Handle CondLatches.size() > 2.
+ // The polynomial is too complex for a simple formula, so the quick and
+ // dirty fix has been selected. Adjust probabilities starting from the
+ // first latch, which has the most influence on the total frequency, so
+ // starting there should minimize the number of latches that have to be
+ // visited. We do have to iterate because the first latch alone might not
+ // be enough. For example, we might need to set all probabilities to 1 if
+ // the frequency is the unroll factor.
+ for (unsigned I = 0; I != CondLatches.size(); ++I) {
+ double Prob, Freq;
+ std::tie(Prob, Freq) = ComputeProb(I);
+ SetProb(I, Prob);
+ if (fabs(Freq - FreqDesired) < FreqPrec)
+ break;
+ }
}
// FIXME: We have not considered non-latch loop exits:
diff --git a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-complete.ll b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-complete.ll
index 3d87ee185b554..353e74be9fbd1 100644
--- a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-complete.ll
+++ b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-complete.ll
@@ -491,6 +491,486 @@
; UR313x: br label %do.end
; UR313x: !0 = !{!"branch_weights", i32 -2147483648, i32 0}
+; ------------------------------------------------------------------------------
+; Check 4 max iterations:
+; - Unroll count of >=4 should always produce complete unrolling.
+; - That produces <=3 unrolled iteration latches. 3 is the lowest number where
+; the implementation cannot compute uniform weights using a simple formula.
+;
+; Original loop body frequency is 5 (loop weight 4), which is impossibly high.
+;
+; First use a variable iteration count so that all non-final unrolled
+; iterations' latches remain conditional.
+;
+; RUN: sed -e s/@MAX@/4/ -e s/@W@/4/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+; RUN: %{bf-fc} ORIG4510
+; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4510
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4510
+;
+; The sum of the new do.body* cannot reach the old do.body, which is
+; impossibly high.
+; ORIG4510: - do.body: float = 5.0,
+; UR4510: - do.body: float = 1.0,
+; UR4510: - do.body.1: float = 1.0,
+; UR4510: - do.body.2: float = 1.0,
+; UR4510: - do.body.3: float = 1.0,
+;
+; The probabilities are maximized to try to reach the original frequency.
+; UR4510: call void @f
+; UR4510: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+; UR4510: call void @f
+; UR4510: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !0
+; UR4510: call void @f
+; UR4510: br i1 %{{.*}}, label %do.end, label %do.body.3, !prof !0
+; UR4510: call void @f
+; UR4510: br label %do.end
+; UR4510: !0 = !{!"branch_weights", i32 0, i32 -2147483648}
+;
+; Now use a constant iteration count so that all non-final unrolled
+; iterations' latches unconditionally continue.
+;
+; RUN: sed -e s/@MAX@/4/ -e s/@W@/4/ -e s/@MIN@/4/ -e s/@I_0@/0/ %s > %t.ll
+; RUN: %{bf-fc} ORIG4540
+; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4540
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4540
+;
+; The new do.body contains 4 of the original loop's iterations, so multiply
+; it by 4, which is less than the old do.body, which is impossibly high.
+; ORIG4540: - do.body: float = 5.0,
+; UR4540: - do.body: float = 1.0,
+;
+; UR4540: call void @f
+; UR4540-NOT: br
+; UR4540: call void @f
+; UR4540-NOT: br
+; UR4540: call void @f
+; UR4540-NOT: br
+; UR4540: call void @f
+; UR4540: ret void
+;
+; Use a constant iteration count but now the loop upper bound computation can
+; overflow. When it does, the loop induction variable is greater than it
+; immediately, so the initial unrolled iteration's latch remains conditional.
+;
+; RUN: sed -e s/@MAX@/4/ -e s/@W@/4/ -e s/@MIN@/4/ -e s/@I_0@/%x/ %s > %t.ll
+; RUN: %{bf-fc} ORIG454x
+; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR454x
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR454x
+;
+; The new do.body.1 contains 3 of the original loop's iterations, so
+; multiply it by 3, and add the new do.body, but that sum is less than the
+; old do.body, which is impossibly high.
+; ORIG454x: - do.body: float = 5.0,
+; UR454x: - do.body: float = 1.0,
+; UR454x: - do.body.1: float = 1.0,
+;
+; The sole probability is maximized to try to reach the original frequency.
+; UR454x: call void @f
+; UR454x: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+; UR454x: call void @f
+; UR454x-NOT: br
+; UR454x: call void @f
+; UR454x-NOT: br
+; UR454x: call void @f
+; UR454x: br label %do.end
+; UR454x: !0 = !{!"branch_weights", i32 0, i32 -2147483648}
+;
+; Original loop body frequency is 4 (loop weight 3).
+;
+; First use a variable iteration count so that all non-final unrolled
+; iterations' latches remain conditional.
+;
+; RUN: sed -e s/@MAX@/4/ -e s/@W@/3/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+; RUN: %{bf-fc} ORIG4410
+; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4410
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4410
+;
+; The sum of the new do.body* is the old do.body.
+; ORIG4410: - do.body: float = 4.0,
+; UR4410: - do.body: float = 1.0,
+; UR4410: - do.body.1: float = 1.0,
+; UR4410: - do.body.2: float = 1.0,
+; UR4410: - do.body.3: float = 1.0,
+;
+; UR4410: call void @f
+; UR4410: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+; UR4410: call void @f
+; UR4410: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !0
+; UR4410: call void @f
+; UR4410: br i1 %{{.*}}, label %do.end, label %do.body.3, !prof !0
+; UR4410: call void @f
+; UR4410: br label %do.end
+; UR4410: !0 = !{!"branch_weights", i32 0, i32 -2147483648}
+;
+; Now use a constant iteration count so that all non-final unrolled
+; iterations' latches unconditionally continue.
+;
+; RUN: sed -e s/@MAX@/4/ -e s/@W@/3/ -e s/@MIN@/4/ -e s/@I_0@/0/ %s > %t.ll
+; RUN: %{bf-fc} ORIG4440
+; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4440
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4440
+;
+; The new do.body contains 4 of the original loop's iterations, so multiply
+; it by 4 to get the old do.body.
+; ORIG4440: - do.body: float = 4.0,
+; UR4440: - do.body: float = 1.0,
+;
+; UR4440: call void @f
+; UR4440-NOT: br
+; UR4440: call void @f
+; UR4440-NOT: br
+; UR4440: call void @f
+; UR4440-NOT: br
+; UR4440: call void @f
+; UR4440: ret void
+;
+; Use a constant iteration count but now the loop upper bound computation can
+; overflow. When it does, the loop induction variable is greater than it
+; immediately, so the initial unrolled iteration's latch remains conditional.
+;
+; RUN: sed -e s/@MAX@/4/ -e s/@W@/3/ -e s/@MIN@/4/ -e s/@I_0@/%x/ %s > %t.ll
+; RUN: %{bf-fc} ORIG444x
+; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR444x
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR444x
+;
+; The new do.body.1 contains 3 of the original loop's iterations, so
+; multiply it by 3, and add the new do.body to get the old do.body.
+; ORIG444x: - do.body: float = 4.0,
+; UR444x: - do.body: float = 1.0,
+; UR444x: - do.body.1: float = 1.0,
+;
+; UR444x: call void @f
+; UR444x: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+; UR444x: call void @f
+; UR444x-NOT: br
+; UR444x: call void @f
+; UR444x-NOT: br
+; UR444x: call void @f
+; UR444x: br label %do.end
+; UR444x: !0 = !{!"branch_weights", i32 0, i32 -2147483648}
+;
+; Original loop body frequency is 3 (loop weight 2). This is our first case
+; where the new probabilities vary.
+;
+; First use a variable iteration count so that all non-final unrolled
+; iterations' latches remain conditional.
+;
+; RUN: sed -e s/@MAX@/4/ -e s/@W@/2/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+; RUN: %{bf-fc} ORIG4310
+; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4310
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4310
+;
+; The sum of the new do.body* is always approximately the old do.body.
+; ORIG4310: - do.body: float = 3.0,
+; UR4310: - do.body: float = 1.0,
+; UR4310: - do.body.1: float = 0.94737,
+; UR4310: - do.body.2: float = 0.63158,
+; UR4310: - do.body.3: float = 0.42105,
+;
+; UR4310: call void @f
+; UR4310: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+; UR4310: call void @f
+; UR4310: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !1
+; UR4310: call void @f
+; UR4310: br i1 %{{.*}}, label %do.end, label %do.body.3, !prof !1
+; UR4310: call void @f
+; UR4310: br label %do.end
+; UR4310: !0 = !{!"branch_weights", i32 113025456, i32 2034458192}
+; UR4310: !1 = !{!"branch_weights", i32 1, i32 2}
+;
+; Now use a constant iteration count so that all non-final unrolled
+; iterations' latches unconditionally continue.
+;
+; RUN: sed -e s/@MAX@/4/ -e s/@W@/2/ -e s/@MIN@/4/ -e s/@I_0@/0/ %s > %t.ll
+; RUN: %{bf-fc} ORIG4340
+; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4340
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4340
+;
+; The new do.body contains 4 of the original loop's iterations, so multiply
+; it by 4, which is greater than the old do.body, which is impossibly low.
+; ORIG4340: - do.body: float = 3.0,
+; UR4340: - do.body: float = 1.0,
+;
+; UR4340: call void @f
+; UR4340-NOT: br
+; UR4340: call void @f
+; UR4340-NOT: br
+; UR4340: call void @f
+; UR4340-NOT: br
+; UR4340: call void @f
+; UR4340: ret void
+;
+; Use a constant iteration count but now the loop upper bound computation can
+; overflow. When it does, the loop induction variable is greater than it
+; immediately, so the initial unrolled iteration's latch remains conditional.
+;
+; RUN: sed -e s/@MAX@/4/ -e s/@W@/2/ -e s/@MIN@/4/ -e s/@I_0@/%x/ %s > %t.ll
+; RUN: %{bf-fc} ORIG434x
+; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR434x
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR434x
+;
+; The new do.body.1 contains 3 of the original loop's iterations, so
+; multiply it by 3, and add the new do.body to get the old do.body.
+; ORIG434x: - do.body: float = 3.0,
+; UR434x: - do.body: float = 1.0,
+; UR434x: - do.body.1: float = 0.66667,
+;
+; UR434x: call void @f
+; UR434x: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+; UR434x: call void @f
+; UR434x-NOT: br
+; UR434x: call void @f
+; UR434x-NOT: br
+; UR434x: call void @f
+; UR434x: br label %do.end
+; UR434x: !0 = !{!"branch_weights", i32 715827884, i32 1431655764}
+;
+; Original loop body frequency is 2 (loop weight 1).
+;
+; First use a variable iteration count so that all non-final unrolled
+; iterations' latches remain conditional.
+;
+; RUN: sed -e s/@MAX@/4/ -e s/@W@/1/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+; RUN: %{bf-fc} ORIG4210
+; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4210
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4210
+;
+; The sum of the new do.body* is always the old do.body.
+; ORIG4210: - do.body: float = 2.0,
+; UR4210: - do.body: float = 1.0,
+; UR4210: - do.body.1: float = 0.57143,
+; UR4210: - do.body.2: float = 0.28571,
+; UR4210: - do.body.3: float = 0.14286,
+;
+; UR4210: call void @f
+; UR4210: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+; UR4210: call void @f
+; UR4210: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !1
+; UR4210: call void @f
+; UR4210: br i1 %{{.*}}, label %do.end, label %do.body.3, !prof !1
+; UR4210: call void @f
+; UR4210: br label %do.end
+; UR4210: !0 = !{!"branch_weights", i32 920350135, i32 1227133513}
+; UR4210: !1 = !{!"branch_weights", i32 1, i32 1}
+;
+; Now use a constant iteration count so that all non-final unrolled
+; iterations' latches unconditionally continue.
+;
+; RUN: sed -e s/@MAX@/4/ -e s/@W@/1/ -e s/@MIN@/4/ -e s/@I_0@/0/ %s > %t.ll
+; RUN: %{bf-fc} ORIG4240
+; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4240
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4240
+;
+; The new do.body contains 4 of the original loop's iterations, so multiply
+; it by 4, which is greater than the old do.body, which is impossibly low.
+; ORIG4240: - do.body: float = 2.0,
+; UR4240: - do.body: float = 1.0,
+;
+; UR4240: call void @f
+; UR4240-NOT: br
+; UR4240: call void @f
+; UR4240-NOT: br
+; UR4240: call void @f
+; UR4240-NOT: br
+; UR4240: call void @f
+; UR4240: ret void
+;
+; Use a constant iteration count but now the loop upper bound computation can
+; overflow. When it does, the loop induction variable is greater than it
+; immediately, so the initial unrolled iteration's latch remains conditional.
+;
+; RUN: sed -e s/@MAX@/4/ -e s/@W@/1/ -e s/@MIN@/4/ -e s/@I_0@/%x/ %s > %t.ll
+; RUN: %{bf-fc} ORIG424x
+; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR424x
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR424x
+;
+; The new do.body.1 contains 3 of the original loop's iterations, so
+; multiply it by 3, and add the new do.body to get the old do.body.
+; ORIG424x: - do.body: float = 2.0,
+; UR424x: - do.body: float = 1.0,
+; UR424x: - do.body.1: float = 0.33333,
+;
+; UR424x: call void @f
+; UR424x: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+; UR424x: call void @f
+; UR424x-NOT: br
+; UR424x: call void @f
+; UR424x-NOT: br
+; UR424x: call void @f
+; UR424x: br label %do.end
+; UR424x: !0 = !{!"branch_weights", i32 1431655765, i32 715827883}
+;
+; Original loop body frequency is 1 (loop weight 0).
+;
+; First use a variable iteration count so that all non-final unrolled
+; iterations' latches remain conditional.
+;
+; RUN: sed -e s/@MAX@/4/ -e s/@W@/0/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+; RUN: %{bf-fc} ORIG4110
+; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4110
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4110
+;
+; The sum of the new do.body* is approximately the old do.body.
+; ORIG4110: - do.body: float = 1.0,
+; UR4110: - do.body: float = 1.0,
+; UR4110: - do.body.1: float = 0.0{{(0000[0-9]*)?}},
+; UR4110: - do.body.2: float = 0.0{{(0000[0-9]*)?}},
+; UR4110: - do.body.3: float = 0.0{{(0000[0-9]*)?}},
+;
+; UR4110: call void @f
+; UR4110: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+; UR4110: call void @f
+; UR4110: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !0
+; UR4110: call void @f
+; UR4110: br i1 %{{.*}}, label %do.end, label %do.body.3, !prof !0
+; UR4110: call void @f
+; UR4110: br label %do.end
+; UR4110: !0 = !{!"branch_weights", i32 1, i32 0}
+;
+; Now use a constant iteration count so that all non-final unrolled
+; iterations' latches unconditionally continue.
+;
+; RUN: sed -e s/@MAX@/4/ -e s/@W@/0/ -e s/@MIN@/4/ -e s/@I_0@/0/ %s > %t.ll
+; RUN: %{bf-fc} ORIG4140
+; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4140
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR4140
+;
+; The new do.body contains 4 of the original loop's iterations, so multiply
+; it by 4, which is greater than the old do.body, which is impossibly low.
+; ORIG4140: - do.body: float = 1.0,
+; UR4140: - do.body: float = 1.0,
+;
+; UR4140: call void @f
+; UR4140-NOT: br
+; UR4140: call void @f
+; UR4140-NOT: br
+; UR4140: call void @f
+; UR4140-NOT: br
+; UR4140: call void @f
+; UR4140: ret void
+;
+; Use a constant iteration count but now the loop upper bound computation can
+; overflow. When it does, the loop induction variable is greater than it
+; immediately, so the initial unrolled iteration's latch remains conditional.
+;
+; RUN: sed -e s/@MAX@/4/ -e s/@W@/0/ -e s/@MIN@/4/ -e s/@I_0@/%x/ %s > %t.ll
+; RUN: %{bf-fc} ORIG414x
+; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR414x
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR414x
+;
+; The new do.body.1 contains 3 of the original loop's iterations, so
+; multiply it by 3, and add the new do.body to get approximately the old
+; do.body.
+; ORIG414x: - do.body: float = 1.0,
+; UR414x: - do.body: float = 1.0,
+; UR414x: - do.body.1: float = 0.0{{(0000[0-9]*)?}},
+;
+; UR414x: call void @f
+; UR414x: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+; UR414x: call void @f
+; UR414x-NOT: br
+; UR414x: call void @f
+; UR414x-NOT: br
+; UR414x: call void @f
+; UR414x: br label %do.end
+; UR414x: !0 = !{!"branch_weights", i32 -2147483648, i32 0}
+
+; ------------------------------------------------------------------------------
+; Check 5 max iterations:
+; - Unroll count of >=5 should always produce complete unrolling.
+; - That produces <=4 unrolled iteration latches. When at least 3 remain
+; conditional, the implementation cannot compute uniform weights using a
+; simple formula.
+;
+; Original loop body frequency is 5 (loop weight 4).
+;
+; RUN: sed -e s/@MAX@/5/ -e s/@W@/4/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+; RUN: %{bf-fc} ORIG5510
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR5510
+; RUN: %{ur-bf} -unroll-count=6 | %{fc} UR5510
+;
+; The sum of the new do.body* is the old do.body.
+; ORIG5510: - do.body: float = 5.0,
+; UR5510: - do.body: float = 1.0,
+; UR5510: - do.body.1: float = 1.0,
+; UR5510: - do.body.2: float = 1.0,
+; UR5510: - do.body.3: float = 1.0,
+; UR5510: - do.body.4: float = 1.0,
+;
+; All continue probabilities are approximately 1, but somehow there is less
+; precision in the calculation of the last case.
+; UR5510: call void @f
+; UR5510: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+; UR5510: call void @f
+; UR5510: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !0
+; UR5510: call void @f
+; UR5510: br i1 %{{.*}}, label %do.end, label %do.body.3, !prof !0
+; UR5510: call void @f
+; UR5510: br i1 %{{.*}}, label %do.end, label %do.body.4, !prof !1
+; UR5510: call void @f
+; UR5510: br label %do.end
+; UR5510: !0 = !{!"branch_weights", i32 0, i32 -2147483648}
+; UR5510: !1 = !{!"branch_weights", i32 10, i32 2147483638}
+;
+; Original loop body frequency is 4 (loop weight 3).
+;
+; RUN: sed -e s/@MAX@/5/ -e s/@W@/3/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+; RUN: %{bf-fc} ORIG5410
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR5410
+; RUN: %{ur-bf} -unroll-count=6 | %{fc} UR5410
+;
+; The sum of the new do.body* is always the old do.body.
+; ORIG5410: - do.body: float = 4.0,
+; UR5410: - do.body: float = 1.0,
+; UR5410: - do.body.1: float = 1.0,
+; UR5410: - do.body.2: float = 0.86486,
+; UR5410: - do.body.3: float = 0.64865,
+; UR5410: - do.body.4: float = 0.48649,
+;
+; This is our first case where the implementation must adjust multiple
+; probabilities to something other than the original latch probability but
+; does not just set all probabilities to the limit of 1 or 0.
+; UR5410: call void @f
+; UR5410: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+; UR5410: call void @f
+; UR5410: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !1
+; UR5410: call void @f
+; UR5410: br i1 %{{.*}}, label %do.end, label %do.body.3, !prof !2
+; UR5410: call void @f
+; UR5410: br i1 %{{.*}}, label %do.end, label %do.body.4, !prof !2
+; UR5410: call void @f
+; UR5410: br label %do.end
+; UR5410: !0 = !{!"branch_weights", i32 0, i32 -2147483648}
+; UR5410: !1 = !{!"branch_weights", i32 290200493, i32 1857283155}
+; UR5410: !2 = !{!"branch_weights", i32 1, i32 3}
+;
+; Original loop body frequency is 1 (loop weight 0).
+;
+; RUN: sed -e s/@MAX@/5/ -e s/@W@/0/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+; RUN: %{bf-fc} ORIG5110
+; RUN: %{ur-bf} -unroll-count=5 | %{fc} UR5110
+; RUN: %{ur-bf} -unroll-count=6 | %{fc} UR5110
+;
+; The sum of the new do.body* is approximately the old do.body.
+; ORIG5110: - do.body: float = 1.0,
+; UR5110: - do.body: float = 1.0,
+; UR5110: - do.body.1: float = 0.0{{(0000[0-9]*)?}},
+; UR5110: - do.body.2: float = 0.0{{(0000[0-9]*)?}},
+; UR5110: - do.body.3: float = 0.0{{(0000[0-9]*)?}},
+; UR5110: - do.body.4: float = 0.0{{(0000[0-9]*)?}},
+;
+; UR5110: call void @f
+; UR5110: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+; UR5110: call void @f
+; UR5110: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !0
+; UR5110: call void @f
+; UR5110: br i1 %{{.*}}, label %do.end, label %do.body.3, !prof !0
+; UR5110: call void @f
+; UR5110: br i1 %{{.*}}, label %do.end, label %do.body.4, !prof !0
+; UR5110: call void @f
+; UR5110: br label %do.end
+; UR5110: !0 = !{!"branch_weights", i32 1, i32 0}
+
declare void @f(i32)
define void @test(i32 %x, i32 %n) {
diff --git a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll
index 09ecaebcf1f45..62675162544f8 100644
--- a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll
+++ b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll
@@ -138,6 +138,74 @@
; - It has no llvm.loop.estimated_trip_count.
; UR4-EUR: !6 = !{!"branch_weights", i32 1265493781, i32 881989867}
+; ------------------------------------------------------------------------------
+; Check -unroll-count=8.
+;
+; RUN: %{ur-bf} -unroll-count=8 | %{fc} UR8,UR8-ELP
+; RUN: %{ur-bf} -unroll-count=8 -unroll-remainder | \
+; RUN: %{fc} UR8,UR8-EUR
+;
+; Multiply do.body by 8 and add do.body.epil* for either ELP or EUR to get the
+; original loop body frequency, 11.
+; UR8: - do.body: float = 0.96188,
+; UR8-ELP: - do.body.epil: float = 3.3049,
+; UR8-EUR: - do.body.epil: float = 0.91256,
+; UR8-EUR: - do.body.epil.1: float = 0.7716,
+; UR8-EUR: - do.body.epil.2: float = 0.55854,
+; UR8-EUR: - do.body.epil.3: float = 0.40432,
+; UR8-EUR: - do.body.epil.4: float = 0.29268,
+; UR8-EUR: - do.body.epil.5: float = 0.21186,
+; UR8-EUR: - do.body.epil.6: float = 0.15336,
+;
+; Unrolled loop guard, body, and latch.
+; UR8: br i1 %{{.*}}, label %do.body.epil.preheader, label %entry.new, !prof !0
+; UR8-COUNT-8: call void @f
+; UR8: br i1 %{{.*}}, label %do.end.unr-lcssa, label %do.body, !prof !1, !llvm.loop !2
+;
+; Epilogue guard.
+; UR8: br i1 %{{.*}}, label %do.body.epil.preheader, label %do.end, !prof !5
+;
+; Non-unrolled epilogue loop.
+; UR8-ELP: call void @f
+; UR8-ELP: br i1 %{{.*}}, label %do.body.epil, label %do.end.epilog-lcssa, !prof !6, !llvm.loop !7
+;
+; Completely unrolled epilogue loop.
+; UR8-EUR: call void @f
+; UR8-EUR: br i1 %{{.*}}, label %do.body.epil.1, label %do.end.epilog-lcssa, !prof !6
+; UR8-EUR: call void @f
+; UR8-EUR: br i1 %{{.*}}, label %do.body.epil.2, label %do.end.epilog-lcssa, !prof !7
+; UR8-EUR: call void @f
+; UR8-EUR: br i1 %{{.*}}, label %do.body.epil.3, label %do.end.epilog-lcssa, !prof !7
+; UR8-EUR: call void @f
+; UR8-EUR: br i1 %{{.*}}, label %do.body.epil.4, label %do.end.epilog-lcssa, !prof !7
+; UR8-EUR: call void @f
+; UR8-EUR: br i1 %{{.*}}, label %do.body.epil.5, label %do.end.epilog-lcssa, !prof !7
+; UR8-EUR: call void @f
+; UR8-EUR: br i1 %{{.*}}, label %do.body.epil.6, label %do.end.epilog-lcssa, !prof !7
+; UR8-EUR: call void @f
+;
+; Unrolled loop metadata.
+; UR8: !0 = !{!"branch_weights", i32 1045484980, i32 1101998668}
+; UR8: !1 = !{!"branch_weights", i32 1145666677, i32 1001816971}
+; UR8: !2 = distinct !{!2, !3, !4}
+; UR8: !3 = !{!"llvm.loop.estimated_trip_count", i32 1}
+; UR8: !4 = !{!"llvm.loop.unroll.disable"}
+; UR8: !5 = !{!"branch_weights", i32 1781544591, i32 365939057}
+;
+; Non-unrolled epilogue loop metadata.
+; UR8-ELP: !6 = !{!"branch_weights", i32 1554520665, i32 592962983}
+; UR8-ELP: !7 = distinct !{!7, !8, !4}
+; UR8-ELP: !8 = !{!"llvm.loop.estimated_trip_count", i32 3}
+;
+; Completely unrolled epilogue loop metadata. Because it loses its backedge:
+; - The remaining conditional latches' branch weights must be adjusted relative
+; to the non-unrolled case. There are many, so the implementation does not
+; compute uniform branch weights. Adjusting the first latch is sufficient,
+; so every later latch keeps the non-unrolled epilogue branch weights.
+; - It has no llvm.loop.estimated_trip_count.
+; UR8-EUR: !6 = !{!"branch_weights", i32 1815773828, i32 331709820}
+; UR8-EUR: !7 = !{!"branch_weights", i32 1554520665, i32 592962983}
+
; ------------------------------------------------------------------------------
; Check -unroll-count=10.
;
diff --git a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial-unconditional-latch.ll b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial-unconditional-latch.ll
index 09b2097d13582..a87c9390ee780 100644
--- a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial-unconditional-latch.ll
+++ b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial-unconditional-latch.ll
@@ -133,6 +133,38 @@
; MULT4: !1 = distinct !{!1, !2, !3}
; MULT4: !2 = !{!"llvm.loop.estimated_trip_count", i32 3}
; MULT4: !3 = !{!"llvm.loop.unroll.disable"}
+;
+; -unroll-count=6, so there are 3 remaining conditional latches, the lowest
+; number where the implementation cannot compute uniform weights using a
+; simple formula.
+;
+; RUN: %{ur-bf} -unroll-count=6 | %{fc} MULT6
+;
+; Multiply by 2 and sum to get the original loop body frequency, 10.
+; MULT6: - do.body: float = 2.1956,
+; MULT6: - do.body.2: float = 1.476,
+; MULT6: - do.body.4: float = 1.3284,
+;
+; MULT6: call void @f
+; MULT6-NOT: br
+; MULT6: call void @f
+; MULT6: br i1 %{{.*}}, label %do.body.2, label %do.end, !prof !0
+; MULT6: call void @f
+; MULT6-NOT: br
+; MULT6: call void @f
+; MULT6: br i1 %{{.*}}, label %do.body.4, label %do.end, !prof !1
+; MULT6: call void @f
+; MULT6-NOT: br
+; MULT6: call void @f
+; MULT6: br i1 %{{.*}}, label %do.body, label %do.end, !prof !1, !llvm.loop !2
+;
+; There are 3 conditional latches remaining, so the implementation adjusts
+; the first and leaves the other two with the original loop's branch weights.
+; MULT6: !0 = !{!"branch_weights", i32 1443686486, i32 703797162}
+; MULT6: !1 = !{!"branch_weights", i32 9, i32 1}
+; MULT6: !2 = distinct !{!2, !3, !4}
+; MULT6: !3 = !{!"llvm.loop.estimated_trip_count", i32 2}
+; MULT6: !4 = !{!"llvm.loop.unroll.disable"}
; ------------------------------------------------------------------------------
; Check case when the original loop's number of iterations is a run-time
@@ -193,6 +225,40 @@
; LOW4: !1 = distinct !{!1, !2, !3}
; LOW4: !2 = !{!"llvm.loop.estimated_trip_count", i32 1}
; LOW4: !3 = !{!"llvm.loop.unroll.disable"}
+;
+; -unroll-count=6, so there are 3 remaining conditional latches. The
+; implementation cannot compute uniform weights using a simple formula, and
+; ultimately it must set all those latches' probabilities to zero. The
+; implementation will face a new stumbling block starting at the second latch:
+; reaching the remaining iterations already has a zero probability due to the
+; zero probability set at the first latch, so the required probability could
+; accidentally be computed as negative infinity.
+;
+; RUN: %{ur-bf} -unroll-count=6 | %{fc} LOW6
+;
+; Multiply by 2 and sum, but the result is greater than the original loop body
+; frequency, 1, which is impossibly low.
+; LOW6: - do.body: float = 1.0,
+; LOW6: - do.body.2: float = 0.0{{(0000[0-9]*)?}},
+; LOW6: - do.body.4: float = 0.0{{(0000[0-9]*)?}},
+;
+; LOW6: call void @f
+; LOW6-NOT: br
+; LOW6: call void @f
+; LOW6: br i1 %{{.*}}, label %do.body.2, label %do.end, !prof !0
+; LOW6: call void @f
+; LOW6-NOT: br
+; LOW6: call void @f
+; LOW6: br i1 %{{.*}}, label %do.body.4, label %do.end, !prof !0
+; LOW6: call void @f
+; LOW6-NOT: br
+; LOW6: call void @f
+; LOW6: br i1 %{{.*}}, label %do.body, label %do.end, !prof !0, !llvm.loop !1
+;
+; LOW6: !0 = !{!"branch_weights", i32 0, i32 -2147483648}
+; LOW6: !1 = distinct !{!1, !2, !3}
+; LOW6: !2 = !{!"llvm.loop.estimated_trip_count", i32 1}
+; LOW6: !3 = !{!"llvm.loop.unroll.disable"}
; ------------------------------------------------------------------------------
; Check cases when the original loop's number of iterations is a constant 10 and
More information about the llvm-branch-commits
mailing list