[llvm-branch-commits] [llvm] [LoopUnroll] Fix freqs for unconditional latches: introduce tests (PR #191008)

Wed Apr 8 09:30:14 PDT 2026

https://github.com/jdenny-ornl created https://github.com/llvm/llvm-project/pull/191008

This patch introduces all tests for PR #179520 but with current results so that it is easier to see which results PR #179520 improves. This patch should not land without PR #179520.

>From 1415b28d9801773fe8c40ed37fc5fa10a2455313 Mon Sep 17 00:00:00 2001
From: "Joel E. Denny" <jdenny.ornl at gmail.com>
Date: Wed, 8 Apr 2026 11:54:04 -0400
Subject: [PATCH] [LoopUnroll] Fix freqs for unconditional latches: introduce
 tests

This patch introduces all tests for PR #179520 but with current
results so that it is easier to see which results PR #179520 improves.
This patch should not land without PR #179520.
---
 .../branch-weights-freq/unroll-complete.ll    | 530 ++++++++++++++++++
 .../branch-weights-freq/unroll-epilog.ll      | 270 +++++++--
 .../unroll-partial-unconditional-latch.ll     | 280 +++++++++
 .../branch-weights-freq/unroll-partial.ll     |   3 +-
 .../LoopUnroll/loop-probability-one.ll        | 201 ++++---
 5 files changed, 1145 insertions(+), 139 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-complete.ll
 create mode 100644 llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial-unconditional-latch.ll

diff --git a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-complete.ll b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-complete.ll
new file mode 100644
index 0000000000000..fd7df00515e25
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-complete.ll
@@ -0,0 +1,530 @@
+; Test branch weight metadata, estimated trip count metadata, and block
+; frequencies after complete loop unrolling.  The final unrolled iteration
+; unconditionally exits (backedge removed), and other unrolled iterations'
+; latches might unconditionally continue.  Either contradicts the original
+; branch weights.
+;
+; (unroll-partial-unconditional-latch.ll tests partial unrolling cases,
+; including cases where the latch of any iteration, including the final, might
+; unconditionally continue.)
+;
+; For each case, we check:
+; - Iteration frequencies
+;   - When each is multiplied by the number of original loop bodies that execute
+;     within it, they should sum to almost exactly the original loop body
+;     frequency.
+;   - The only exception is an impossibly high or low original frequency (e.g.,
+;     due to bad profile data), for which there exist no new branch weights that
+;     can yield that frequency sum.  In those cases, we expect the maximum or
+;     minimum possible frequency.
+; - CFGs
+;   - We verify which branch weights go with which branches and that we did not
+;     overlook any other branch weights (no extra !prof or branch_weights).
+;   - We also check the number of original loop bodies (represented by a call to
+;     @f) that appear within each unrolled iteration.
+; - Branch weight metadata
+;   - Checking frequencies already checks whether the branch weights have the
+;     expected effect, but we also want to check that we get uniform
+;     probabilities/weights (same !prof) across the unrolled iteration latches
+;     when expected.
+; - llvm.loop.estimated_trip_count:
+;   - There should be none because loops are completely unrolled.
+
+; ------------------------------------------------------------------------------
+; Define LIT substitutions.
+;
+; Before using the following lit substitutions, sed should be called to replace
+; these parameters in %s to produce %t.ll:
+; - @I_0@ is the starting value for the original loop's induction variable.
+; - @MIN@ and @MAX@ are the compile-time known minimum and maximum for the
+;   number of original loop iterations, regardless of @I_0@.
+; - @W@ is the branch weight for the original loop's backedge.  That value plus
+;   1 is the original loop body frequency because the exit branch weight is 1.
+;
+; For verifying that the test code produces the original loop body frequency we
+; expect.
+; DEFINE: %{bf-fc} = opt %t.ll -S -passes='print<block-freq>' 2>&1 | \
+; DEFINE:   FileCheck %s -check-prefixes
+;
+; For checking the unrolled loop.
+; DEFINE: %{ur-bf} = opt %t.ll -S -passes='loop-unroll,print<block-freq>' 2>&1
+; DEFINE: %{fc} = FileCheck %s \
+; DEFINE:     -implicit-check-not='llvm.loop.estimated_trip_count' \
+; DEFINE:     -implicit-check-not='!prof' \
+; DEFINE:     -implicit-check-not='branch_weights' \
+; DEFINE:     -implicit-check-not='call void @f' -check-prefixes
+
+; ------------------------------------------------------------------------------
+; Check 1 max iteration:
+; - Unroll count of >=1 should always produce complete unrolling.
+; - That produces 0 unrolled iteration latches, so there are no branch weights
+;   to compute.
+;
+; Original loop body frequency is 2 (loop weight 1), which is impossibly high.
+;
+;   RUN: sed -e s/@MAX@/1/ -e s/@W@/1/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+;   RUN: %{bf-fc} ORIG1210
+;   RUN: %{ur-bf} -unroll-count=1 | %{fc} UR1210
+;   RUN: %{ur-bf} -unroll-count=2 | %{fc} UR1210
+;
+;   The new do.body is less than the old do.body, which is impossibly high.
+;   ORIG1210: - do.body: float = 2.0,
+;   UR1210:   - do.body: float = 1.0,
+;
+;   UR1210: call void @f
+;
+; Original loop body frequency is 1 (loop weight 0).
+;
+;   RUN: sed -e s/@MAX@/1/ -e s/@W@/0/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+;   RUN: %{bf-fc} ORIG1110
+;   RUN: %{ur-bf} -unroll-count=1 | %{fc} UR1110
+;   RUN: %{ur-bf} -unroll-count=2 | %{fc} UR1110
+;
+;   The new do.body equals the old do.body.
+;   ORIG1110: - do.body: float = 1.0,
+;   UR1110:   - do.body: float = 1.0,
+;
+;   UR1110: call void @f
+
+; ------------------------------------------------------------------------------
+; Check 2 max iterations:
+; - Unroll count of >=2 should always produce complete unrolling.
+; - That produces <=1 unrolled iteration latch, so the implementation can
+;   compute uniform weights by solving, at worst, a linear equation.
+;
+; Original loop body frequency is 3 (loop weight 2), which is impossibly high.
+;
+;   First use a variable iteration count so that the sole non-final unrolled
+;   iteration's latch remains conditional.
+;
+;     RUN: sed -e s/@MAX@/2/ -e s/@W@/2/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG2310
+;     RUN: %{ur-bf} -unroll-count=2 | %{fc} UR2310
+;     RUN: %{ur-bf} -unroll-count=3 | %{fc} UR2310
+;
+;     The sum of the new do.body* cannot reach the old do.body, which is
+;     impossibly high.
+;     ORIG2310: - do.body: float = 3.0,
+;     UR2310:   - do.body: float = 1.0,
+;     FIXME: Should be 1.0:
+;     UR2310:   - do.body.1: float = 0.66667
+;
+;     The sole probability is maximized to try to reach the original frequency.
+;     UR2310: call void @f
+;     UR2310: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;     UR2310: call void @f
+;     UR2310: br label %do.end
+;     FIXME: Should be (0, non-zero):
+;     UR2310: !0 = !{!"branch_weights", i32 1, i32 2}
+;
+;   Now use a constant iteration count so that the sole non-final unrolled
+;   iteration's latch unconditionally continues.
+;
+;     RUN: sed -e s/@MAX@/2/ -e s/@W@/2/ -e s/@MIN@/2/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG2320
+;     RUN: %{ur-bf} -unroll-count=2 | %{fc} UR2320
+;     RUN: %{ur-bf} -unroll-count=3 | %{fc} UR2320
+;
+;     The new do.body contains 2 of the original loop's iterations, so multiply
+;     it by 2, which is less than the old do.body, which is impossibly high.
+;     ORIG2320: - do.body: float = 3.0,
+;     UR2320:   - do.body: float = 1.0,
+;
+;     UR2320:     call void @f
+;     UR2320-NOT: br
+;     UR2320:     call void @f
+;     UR2320:     ret void
+;
+; Original loop body frequency is 2 (loop weight 1).
+;
+;   First use a variable iteration count so that the sole non-final unrolled
+;   iteration's latch remains conditional.
+;
+;     RUN: sed -e s/@MAX@/2/ -e s/@W@/1/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG2210
+;     RUN: %{ur-bf} -unroll-count=2 | %{fc} UR2210
+;     RUN: %{ur-bf} -unroll-count=3 | %{fc} UR2210
+;
+;     The sum of the new do.body* is the old do.body.
+;     ORIG2210: - do.body: float = 2.0,
+;     UR2210:   - do.body: float = 1.0,
+;     FIXME: Should be 1.0:
+;     UR2210:   - do.body.1: float = 0.5,
+;
+;     UR2210: call void @f
+;     UR2210: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;     UR2210: call void @f
+;     UR2210: br label %do.end
+;     FIXME: Should be (0, non-zero):
+;     UR2210: !0 = !{!"branch_weights", i32 1, i32 1}
+;
+;   Now use a constant iteration count so that the sole non-final unrolled
+;   iteration's latch unconditionally continues.
+;
+;     RUN: sed -e s/@MAX@/2/ -e s/@W@/1/ -e s/@MIN@/2/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG2220
+;     RUN: %{ur-bf} -unroll-count=2 | %{fc} UR2220
+;     RUN: %{ur-bf} -unroll-count=3 | %{fc} UR2220
+;
+;     The new do.body contains 2 of the original loop's iterations, so multiply
+;     it by 2 to get the old do.body.
+;     ORIG2220: - do.body: float = 2.0,
+;     UR2220:   - do.body: float = 1.0,
+;
+;     UR2220:     call void @f
+;     UR2220-NOT: br
+;     UR2220:     call void @f
+;     UR2220:     ret void
+;
+; Original loop body frequency is 1 (loop weight 0).
+;
+;   First use a variable iteration count so that the sole non-final unrolled
+;   iteration's latch remains conditional.
+;
+;     RUN: sed -e s/@MAX@/2/ -e s/@W@/0/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG2110
+;     RUN: %{ur-bf} -unroll-count=2 | %{fc} UR2110
+;     RUN: %{ur-bf} -unroll-count=3 | %{fc} UR2110
+;
+;     The sum of the new do.body* is approximately the old do.body.
+;     ORIG2110: - do.body: float = 1.0,
+;     UR2110:   - do.body: float = 1.0,
+;     UR2110:   - do.body.1: float = 0.0{{(0000[0-9]*)?}},
+;
+;     UR2110: call void @f
+;     UR2110: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;     UR2110: call void @f
+;     UR2110: br label %do.end
+;     UR2110: !0 = !{!"branch_weights", i32 1, i32 0}
+;
+;   Now use a constant iteration count so that the sole non-final unrolled
+;   iteration's latch unconditionally continues.
+;
+;     RUN: sed -e s/@MAX@/2/ -e s/@W@/0/ -e s/@MIN@/2/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG2120
+;     RUN: %{ur-bf} -unroll-count=2 | %{fc} UR2120
+;     RUN: %{ur-bf} -unroll-count=3 | %{fc} UR2120
+;
+;     The new do.body contains 2 of the original loop's iterations, so multiply
+;     it by 2, which is greater than the old do.body, which is impossibly low.
+;     ORIG2120: - do.body: float = 1.0,
+;     UR2120:   - do.body: float = 1.0,
+;
+;     UR2120:     call void @f
+;     UR2120-NOT: br
+;     UR2120:     call void @f
+;     UR2120:     ret void
+
+; ------------------------------------------------------------------------------
+; Check 3 max iterations:
+; - Unroll count of >=3 should always produce complete unrolling.
+; - That produces <=2 unrolled iteration latches, so the implementation can
+;   compute uniform weights by solving, at worst, a quadratic equation.
+;
+; Original loop body frequency is 4 (loop weight 3), which is impossibly high.
+;
+;   First use a variable iteration count so that all non-final unrolled
+;   iterations' latches remain conditional.
+;
+;     RUN: sed -e s/@MAX@/3/ -e s/@W@/3/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG3410
+;     RUN: %{ur-bf} -unroll-count=3 | %{fc} UR3410
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR3410
+;
+;     The sum of the new do.body* cannot reach the old do.body, which is
+;     impossibly high.
+;     ORIG3410: - do.body: float = 4.0,
+;     UR3410:   - do.body: float = 1.0,
+;     FIXME: Should be 1.0:
+;     UR3410:   - do.body.1: float = 0.75,
+;     FIXME: Should be 1.0:
+;     UR3410:   - do.body.2: float = 0.5625,
+;
+;     The probabilities are maximized to try to reach the original frequency.
+;     UR3410: call void @f
+;     UR3410: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;     UR3410: call void @f
+;     UR3410: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !0
+;     UR3410: call void @f
+;     UR3410: br label %do.end
+;     FIXME: Should be (0, non-zero):
+;     UR3410: !0 = !{!"branch_weights", i32 1, i32 3}
+;
+;   Now use a constant iteration count so that all non-final unrolled
+;   iterations' latches unconditionally continue.
+;
+;     RUN: sed -e s/@MAX@/3/ -e s/@W@/3/ -e s/@MIN@/3/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG3430
+;     RUN: %{ur-bf} -unroll-count=3 | %{fc} UR3430
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR3430
+;
+;     The new do.body contains 3 of the original loop's iterations, so multiply
+;     it by 3, which is less than the old do.body, which is impossibly high.
+;     ORIG3430: - do.body: float = 4.0,
+;     UR3430:   - do.body: float = 1.0,
+;
+;     UR3430:     call void @f
+;     UR3430-NOT: br
+;     UR3430:     call void @f
+;     UR3430-NOT: br
+;     UR3430:     call void @f
+;     UR3430:     ret void
+;
+;   Use a constant iteration count but now the loop upper bound computation can
+;   overflow.  When it does, the loop induction variable is greater than it
+;   immediately, so the initial unrolled iteration's latch remains conditional.
+;
+;     RUN: sed -e s/@MAX@/3/ -e s/@W@/3/ -e s/@MIN@/3/ -e s/@I_0@/%x/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG343x
+;     RUN: %{ur-bf} -unroll-count=3 | %{fc} UR343x
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR343x
+;
+;     The new do.body.1 contains 2 of the original loop's iterations, so
+;     multiply it by 2, and add the new do.body, but that sum is less than the
+;     old do.body, which is impossibly high.
+;     ORIG343x: - do.body: float = 4.0,
+;     UR343x:   - do.body: float = 1.0,
+;     FIXME: Should be 1.0:
+;     UR343x:   - do.body.1: float = 0.75,
+;
+;     The sole probability is maximized to try to reach the original frequency.
+;     UR343x:     call void @f
+;     UR343x:     br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;     UR343x:     call void @f
+;     UR343x-NOT: br
+;     UR343x:     call void @f
+;     UR343x:     ret void
+;     FIXME: Should be (0, non-zero):
+;     UR343x:     !0 = !{!"branch_weights", i32 1, i32 3}
+;
+; Original loop body frequency is 3 (loop weight 2).
+;
+;   First use a variable iteration count so that all non-final unrolled
+;   iterations' latches remain conditional.
+;
+;     RUN: sed -e s/@MAX@/3/ -e s/@W@/2/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG3310
+;     RUN: %{ur-bf} -unroll-count=3 | %{fc} UR3310
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR3310
+;
+;     The sum of the new do.body* is the old do.body.
+;     ORIG3310: - do.body: float = 3.0,
+;     UR3310:   - do.body: float = 1.0,
+;     FIXME: Should be 1.0:
+;     UR3310:   - do.body.1: float = 0.66667,
+;     FIXME: Should be 1.0:
+;     UR3310:   - do.body.2: float = 0.44444,
+;
+;     UR3310: call void @f
+;     UR3310: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;     UR3310: call void @f
+;     UR3310: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !0
+;     UR3310: call void @f
+;     UR3310: br label %do.end
+;     FIXME: Should be (very small, very large):
+;     UR3310: !0 = !{!"branch_weights", i32 1, i32 2}
+;
+;   Now use a constant iteration count so that all non-final unrolled
+;   iterations' latches unconditionally continue.
+;
+;     RUN: sed -e s/@MAX@/3/ -e s/@W@/2/ -e s/@MIN@/3/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG3330
+;     RUN: %{ur-bf} -unroll-count=3 | %{fc} UR3330
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR3330
+;
+;     The new do.body contains 3 of the original loop's iterations, so multiply
+;     it by 3 to get the old do.body.
+;     ORIG3330: - do.body: float = 3.0,
+;     UR3330:   - do.body: float = 1.0,
+;
+;     UR3330:     call void @f
+;     UR3330-NOT: br
+;     UR3330:     call void @f
+;     UR3330-NOT: br
+;     UR3330:     call void @f
+;     UR3330:     ret void
+;
+;   Use a constant iteration count but now the loop upper bound computation can
+;   overflow.  When it does, the loop induction variable is greater than it
+;   immediately, so the initial unrolled iteration's latch remains conditional.
+;
+;     RUN: sed -e s/@MAX@/3/ -e s/@W@/2/ -e s/@MIN@/3/ -e s/@I_0@/%x/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG333x
+;     RUN: %{ur-bf} -unroll-count=3 | %{fc} UR333x
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR333x
+;
+;     The new do.body.1 contains 2 of the original loop's iterations, so
+;     multiply it by 2, and add the new do.body to get the old do.body.
+;     ORIG333x: - do.body: float = 3.0,
+;     UR333x:   - do.body: float = 1.0,
+;     FIXME: Should be 1.0:
+;     UR333x:   - do.body.1: float = 0.66667,
+;
+;     UR333x:     call void @f
+;     UR333x:     br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;     UR333x:     call void @f
+;     UR333x-NOT: br
+;     UR333x:     call void @f
+;     UR333x:     br label %do.end
+;     FIXME: Should be (very small, very large):
+;     UR333x:     !0 = !{!"branch_weights", i32 1, i32 2}
+;
+; Original loop body frequency is 2 (loop weight 1).  This is our first case
+; where new frequencies and probabilities are not all approximately 1 or 0.
+;
+;   First use a variable iteration count so that all non-final unrolled
+;   iterations' latches remain conditional.
+;
+;     RUN: sed -e s/@MAX@/3/ -e s/@W@/1/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG3210
+;     RUN: %{ur-bf} -unroll-count=3 | %{fc} UR3210
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR3210
+;
+;     The sum of the new do.body* is the old do.body.
+;     ORIG3210: - do.body: float = 2.0,
+;     UR3210:   - do.body: float = 1.0,
+;     FIXME: Should sum to 1.0:
+;     UR3210:   - do.body.1: float = 0.5,
+;     UR3210:   - do.body.2: float = 0.25,
+;
+;     UR3210: call void @f
+;     UR3210: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;     UR3210: call void @f
+;     UR3210: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !0
+;     UR3210: call void @f
+;     UR3210: br label %do.end
+;     UR3210: !0 = !{!"branch_weights", i32 1, i32 1}
+;
+;   Now use a constant iteration count so that all non-final unrolled
+;   iterations' latches unconditionally continue.
+;
+;     RUN: sed -e s/@MAX@/3/ -e s/@W@/1/ -e s/@MIN@/3/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG3230
+;     RUN: %{ur-bf} -unroll-count=3 | %{fc} UR3230
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR3230
+;
+;     The new do.body contains 3 of the original loop's iterations, so multiply
+;     it by 3, which is greater than the old do.body, which is impossibly low.
+;     ORIG3230: - do.body: float = 2.0,
+;     UR3230:   - do.body: float = 1.0,
+;
+;     UR3230:     call void @f
+;     UR3230-NOT: br
+;     UR3230:     call void @f
+;     UR3230-NOT: br
+;     UR3230:     call void @f
+;     UR3230:     ret void
+;
+;   Use a constant iteration count but now the loop upper bound computation can
+;   overflow.  When it does, the loop induction variable is greater than it
+;   immediately, so the initial unrolled iteration's latch remains conditional.
+;
+;     RUN: sed -e s/@MAX@/3/ -e s/@W@/1/ -e s/@MIN@/3/ -e s/@I_0@/%x/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG323x
+;     RUN: %{ur-bf} -unroll-count=3 | %{fc} UR323x
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR323x
+;
+;     The new do.body.1 contains 2 of the original loop's iterations, so
+;     multiply it by 2, and add the new do.body to get the old do.body.
+;     ORIG323x: - do.body: float = 2.0,
+;     UR323x:   - do.body: float = 1.0,
+;     UR323x:   - do.body.1: float = 0.5,
+;
+;     UR323x:     call void @f
+;     UR323x:     br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;     UR323x:     call void @f
+;     UR323x-NOT: br
+;     UR323x:     call void @f
+;     UR323x:     br label %do.end
+;     UR323x:     !0 = !{!"branch_weights", i32 1, i32 1}
+;
+; Original loop body frequency is 1 (loop weight 0).
+;
+;   First use a variable iteration count so that all non-final unrolled
+;   iterations' latches remain conditional.
+;
+;     RUN: sed -e s/@MAX@/3/ -e s/@W@/0/ -e s/@MIN@/1/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG3110
+;     RUN: %{ur-bf} -unroll-count=3 | %{fc} UR3110
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR3110
+;
+;     The sum of the new do.body* is approximately the old do.body.
+;     ORIG3110: - do.body: float = 1.0,
+;     UR3110:   - do.body: float = 1.0,
+;     UR3110:   - do.body.1: float = 0.0{{(0000[0-9]*)?}},
+;     UR3110:   - do.body.2: float = 0.0{{(0000[0-9]*)?}},
+;
+;     UR3110: call void @f
+;     UR3110: br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;     UR3110: call void @f
+;     UR3110: br i1 %{{.*}}, label %do.end, label %do.body.2, !prof !0
+;     UR3110: call void @f
+;     UR3110: br label %do.end
+;     UR3110: !0 = !{!"branch_weights", i32 1, i32 0}
+;
+;   Now use a constant iteration count so that all non-final unrolled
+;   iterations' latches unconditionally continue.
+;
+;     RUN: sed -e s/@MAX@/3/ -e s/@W@/0/ -e s/@MIN@/3/ -e s/@I_0@/0/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG3130
+;     RUN: %{ur-bf} -unroll-count=3 | %{fc} UR3130
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR3130
+;
+;     The new do.body contains 3 of the original loop's iterations, so multiply
+;     it by 3, which is greater than the old do.body, which is impossibly low.
+;     ORIG3130: - do.body: float = 1.0,
+;     UR3130:   - do.body: float = 1.0,
+;
+;     UR3130:     call void @f
+;     UR3130-NOT: br
+;     UR3130:     call void @f
+;     UR3130-NOT: br
+;     UR3130:     call void @f
+;     UR3130:     ret void
+;
+;   Use a constant iteration count but now the loop upper bound computation can
+;   overflow.  When it does, the loop induction variable is greater than it
+;   immediately, so the initial unrolled iteration's latch remains conditional.
+;
+;     RUN: sed -e s/@MAX@/3/ -e s/@W@/0/ -e s/@MIN@/3/ -e s/@I_0@/%x/ %s > %t.ll
+;     RUN: %{bf-fc} ORIG313x
+;     RUN: %{ur-bf} -unroll-count=3 | %{fc} UR313x
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} UR313x
+;
+;     The new do.body.1 contains 2 of the original loop's iterations, so
+;     multiply it by 2, and add the new do.body to get approximately the old
+;     do.body.
+;     ORIG313x: - do.body: float = 1.0,
+;     UR313x:   - do.body: float = 1.0,
+;     UR313x:   - do.body.1: float = 0.0{{(0000[0-9]*)?}},
+;
+;     UR313x:     call void @f
+;     UR313x:     br i1 %{{.*}}, label %do.end, label %do.body.1, !prof !0
+;     UR313x:     call void @f
+;     UR313x-NOT: br
+;     UR313x:     call void @f
+;     UR313x:     br label %do.end
+;     UR313x:     !0 = !{!"branch_weights", i32 1, i32 0}
+
+declare void @f(i32) ; each call to it stands for one original loop body
+
+define void @test(i32 %x, i32 %n) {
+entry:
+  %n.min = call i32 @llvm.umax.i32(i32 %n, i32 @MIN@) ; raise %n to at least the MIN parameter
+  %n.minmax = call i32 @llvm.umin.i32(i32 %n.min, i32 @MAX@) ; then cap it at the MAX parameter
+  %i_n = add i32 @I_0@, %n.minmax ; loop bound = start + clamped count (can overflow when the start is %x)
+  br label %do.body
+
+do.body:
+  %i = phi i32 [ @I_0@, %entry ], [ %inc, %do.body ] ; induction variable
+  %inc = add i32 %i, 1
+  call void @f(i32 %i)
+  %c = icmp uge i32 %inc, %i_n ; exit once the incremented value reaches the bound (unsigned)
+  br i1 %c, label %do.end, label %do.body, !prof !0 ; single-block loop: this branch is the latch
+
+do.end:
+  ret void
+}
+
+; Loop body frequency is @W@ + 1.
+!0 = !{!"branch_weights", i32 1, i32 @W@} ; exit weight 1, backedge weight W
diff --git a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll
index f5d05e666cabb..8b5a88bd6e8cd 100644
--- a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll
+++ b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-epilog.ll
@@ -2,7 +2,14 @@
 ; frequencies after loop unrolling with an epilogue.
 ;
 ; We check various interesting unroll count values relative to the original
-; loop's body frequency of 11 (e.g., minimum and boundary values).
+; loop's body frequency of 11, and we check when the epilogue loop itself is and
+; is not unrolled.
+;
+; Without -unroll-remainder, the epilogue is unrolled only at -unroll-count=2
+; because there it has only 1 iteration and so is always completely unrolled.
+; With -unroll-remainder, for some reason related to computing the remainder in
+; two's complement, the epilogue is completely unrolled only when -unroll-count
+; is a power of 2.
 ;
 ; For each case, we check:
 ; - Iteration frequencies
@@ -14,13 +21,21 @@
 ;     overlook any other branch weights (no extra !prof or branch_weights).
 ;   - We also check the number of original loop bodies (represented by a call to
 ;     @f) that appear within each unrolled iteration.
+; - Branch weight metadata
+;   - Checking frequencies already checks whether the branch weights have the
+;     expected effect, but we also want to check the following.
+;   - Whether the epilogue loop is unrolled should not affect the unrolled
+;     loop's estimated trip count or the branch weights on the unrolled loop
+;     guard, unrolled loop latch, or epilogue loop guard.
+;   - We get uniform probabilities/weights (same !prof) across the epilogue
+;     iteration latches when expected.
 ; - llvm.loop.estimated_trip_count
-;   - For the unrolled and epilogue loops, must be the number of iterations
+;   - For the unrolled and epilogue loops, it must be the number of iterations
 ;     required for the original loop body to reach its original estimated trip
 ;     count, which is its original frequency, 11, because there is no prior
 ;     llvm.loop.estimated_trip_count.
-;   - Must not be blindly duplicated between the unrolled and epilogue loops.
-;   - Must not be blindly computed from any new latch branch weights.
+;   - It must not be blindly duplicated between the unrolled and epilogue loops.
+;   - It must not be blindly computed from any new latch branch weights.
 
 ; ------------------------------------------------------------------------------
 ; Verify that the test code produces the original loop body frequency we expect.
@@ -45,6 +60,7 @@
 ; Check -unroll-count=2.
 ;
 ; RUN: %{ur-bf} -unroll-count=2 | %{fc} UR2
+; RUN: %{ur-bf} -unroll-count=2 -unroll-remainder | %{fc} UR2
 ;
 ; Multiply do.body by 2 and add do.body.epil to get the original loop body
 ; frequency, 11.
@@ -72,22 +88,36 @@
 ; ------------------------------------------------------------------------------
 ; Check -unroll-count=4.
 ;
-; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4
+; RUN: %{ur-bf} -unroll-count=4 | %{fc} UR4,UR4-ELP
+; RUN: %{ur-bf} -unroll-count=4 -unroll-remainder | %{fc} UR4,UR4-EUR
 ;
-; Multiply do.body by 4 and add do.body.epil* to get the original loop body
-; frequency, 11.
-; UR4: - do.body: float = 2.3702,
-; UR4: - do.body.epil: float = 1.5193,
+; Multiply do.body by 4 and add do.body.epil* for either ELP or EUR to get the
+; original loop body frequency, 11.
+; UR4:     - do.body: float = 2.3702,
+; UR4-ELP: - do.body.epil: float = 1.5193,
+; FIXME: Should sum to 1.5193:
+; UR4-EUR: - do.body.epil: float = 0.78453,
+; UR4-EUR: - do.body.epil.1: float = 0.37941,
+; UR4-EUR: - do.body.epil.2: float = 0.18349,
 ;
 ; Unrolled loop guard, body, and latch.
 ; UR4: br i1 %{{.*}}, label %do.body.epil.preheader, label %entry.new, !prof !0
 ; UR4-COUNT-4: call void @f
 ; UR4: br i1 %{{.*}}, label %do.end.unr-lcssa, label %do.body, !prof !1, !llvm.loop !2
 ;
-; Epilogue guard and loop.
+; Epilogue guard.
 ; UR4: br i1 %{{.*}}, label %do.body.epil.preheader, label %do.end, !prof !5
-; UR4: call void @f
-; UR4: br i1 %{{.*}}, label %do.body.epil, label %do.end.epilog-lcssa, !prof !6, !llvm.loop !7
+;
+; Non-unrolled epilogue loop.
+; UR4-ELP: call void @f
+; UR4-ELP: br i1 %{{.*}}, label %do.body.epil, label %do.end.epilog-lcssa, !prof !6, !llvm.loop !7
+;
+; Completely unrolled epilogue loop.
+; UR4-EUR: call void @f
+; UR4-EUR: br i1 %{{.*}}, label %do.body.epil.1, label %do.end.epilog-lcssa, !prof !6
+; UR4-EUR: call void @f
+; UR4-EUR: br i1 %{{.*}}, label %do.body.epil.2, label %do.end.epilog-lcssa, !prof !6
+; UR4-EUR: call void @f
 ;
 ; Unrolled loop metadata.
 ; UR4: !0 = !{!"branch_weights", i32 534047398, i32 1613436250}
@@ -97,30 +127,69 @@
 ; UR4: !4 = !{!"llvm.loop.unroll.disable"}
 ; UR4: !5 = !{!"branch_weights", i32 1531603292, i32 615880356}
 ;
-; Epilogue loop metadata.
-; UR4: !6 = !{!"branch_weights", i32 1038564635, i32 1108919013}
-; UR4: !7 = distinct !{!7, !8, !4}
-; UR4: !8 = !{!"llvm.loop.estimated_trip_count", i32 3}
+; Non-unrolled epilogue loop metadata.
+; UR4-ELP: !6 = !{!"branch_weights", i32 1038564635, i32 1108919013}
+; UR4-ELP: !7 = distinct !{!7, !8, !4}
+; UR4-ELP: !8 = !{!"llvm.loop.estimated_trip_count", i32 3}
+;
+; Completely unrolled epilogue loop metadata.  Because it loses its backedge:
+; - The remaining conditional latches' branch weights must be adjusted relative
+;   to the non-unrolled case.  There are only two, so the implementation can
+;   compute uniform branch weights using the quadratic formula.
+; - It has no llvm.loop.estimated_trip_count.
+; UR4-EUR: !6 = !{!"branch_weights", i32 1038564635, i32 1108919013}
 
 ; ------------------------------------------------------------------------------
 ; Check -unroll-count=10.
 ;
-; RUN: %{ur-bf} -unroll-count=10 | %{fc} UR10
-;
-; Multiply do.body by 8 and add do.body.epil* to get the original loop body
-; frequency, 11.
-; UR10: - do.body: float = 0.6902,
-; UR10: - do.body.epil: float = 4.098,
+; RUN: %{ur-bf} -unroll-count=10 | %{fc} UR10,UR10-ELP
+; RUN: %{ur-bf} -unroll-count=10 -unroll-remainder | %{fc} UR10,UR10-EUR
+;
+; Multiply do.body by 10 and add do.body.epil* for either ELP or EUR to get the
+; original loop body frequency, 11.
+; UR10:     - do.body: float = 0.6902,
+; UR10-ELP: - do.body.epil: float = 4.098,
+; UR10-EUR: - do.body.epil: float = 1.0375,
+; UR10-EUR: - do.body.epil.1: float = 0.80019,
+; UR10-EUR: - do.body.epil.2: float = 0.61718,
+; UR10-EUR: - do.body.epil.3: float = 0.47602,
+; UR10-EUR: - do.body.epil.4: float = 0.36715,
+; UR10-EUR: - do.body.epil.5: float = 0.28318,
+; UR10-EUR: - do.body.epil.6: float = 0.21841,
+; UR10-EUR: - do.body.epil.7: float = 0.16846,
+; UR10-EUR: - do.body.epil.8: float = 0.12993,
 ;
 ; Unrolled loop guard, body, and latch.
 ; UR10: br i1 %{{.*}}, label %do.body.epil.preheader, label %entry.new, !prof !0
 ; UR10-COUNT-10: call void @f
 ; UR10: br i1 %{{.*}}, label %do.end.unr-lcssa, label %do.body, !prof !1, !llvm.loop !2
 ;
-; Epilogue guard and loop.
+; Epilogue guard.
 ; UR10: br i1 %{{.*}}, label %do.body.epil.preheader, label %do.end, !prof !5
-; UR10: call void @f
-; UR10: br i1 %{{.*}}, label %do.body.epil, label %do.end.epilog-lcssa, !prof !6, !llvm.loop !7
+;
+; Non-unrolled epilogue loop.
+; UR10-ELP: call void @f
+; UR10-ELP: br i1 %{{.*}}, label %do.body.epil, label %do.end.epilog-lcssa, !prof !6, !llvm.loop !7
+;
+; Partially unrolled epilogue loop.
+; UR10-EUR: call void @f
+; UR10-EUR: br i1 %{{.*}}, label %do.body.epil.1, label %do.end.epilog-lcssa, !prof !6
+; UR10-EUR: call void @f
+; UR10-EUR: br i1 %{{.*}}, label %do.body.epil.2, label %do.end.epilog-lcssa, !prof !6
+; UR10-EUR: call void @f
+; UR10-EUR: br i1 %{{.*}}, label %do.body.epil.3, label %do.end.epilog-lcssa, !prof !6
+; UR10-EUR: call void @f
+; UR10-EUR: br i1 %{{.*}}, label %do.body.epil.4, label %do.end.epilog-lcssa, !prof !6
+; UR10-EUR: call void @f
+; UR10-EUR: br i1 %{{.*}}, label %do.body.epil.5, label %do.end.epilog-lcssa, !prof !6
+; UR10-EUR: call void @f
+; UR10-EUR: br i1 %{{.*}}, label %do.body.epil.6, label %do.end.epilog-lcssa, !prof !6
+; UR10-EUR: call void @f
+; UR10-EUR: br i1 %{{.*}}, label %do.body.epil.7, label %do.end.epilog-lcssa, !prof !6
+; UR10-EUR: call void @f
+; UR10-EUR: br i1 %{{.*}}, label %do.body.epil.8, label %do.end.epilog-lcssa, !prof !6
+; UR10-EUR: call void @f
+; UR10-EUR: br i1 %{{.*}}, label %do.body.epil, label %do.end.epilog-lcssa, !prof !6, !llvm.loop !7
 ;
 ; Unrolled loop metadata.
 ; UR10: !0 = !{!"branch_weights", i32 1236740947, i32 910742701}
@@ -130,30 +199,69 @@
 ; UR10: !4 = !{!"llvm.loop.unroll.disable"}
 ; UR10: !5 = !{!"branch_weights", i32 1829762672, i32 317720976}
 ;
-; Epilogue loop metadata.  Its llvm.loop.estimated_trip_count happens to be the
-; same as the unrolled loop's, so there's no new metadata node.
-; UR10: !6 = !{!"branch_weights", i32 1656332913, i32 491150735}
-; UR10: !7 = distinct !{!7, ![[#LOOP_UR_TC:]], ![[#DISABLE:]]}
+; The unrolled epilogue loop does not lose any conditional branches, so:
+; - The non-unrolled epilogue branch weights are shared across them.
+; - This is our first case where the unrolled epilogue loop has an
+;   llvm.loop.estimated_trip_count.  However, it happens to be the same as the
+;   unrolled loop's, so there's no new metadata node.
+; UR10:     !6 = !{!"branch_weights", i32 1656332913, i32 491150735}
+; UR10-ELP: !7 = distinct !{!7, !3, !4}
+; UR10-EUR: !7 = distinct !{!7, !3}
 
 ; ------------------------------------------------------------------------------
 ; Check -unroll-count=11.
 ;
-; RUN: %{ur-bf} -unroll-count=11 | %{fc} UR11
-;
-; Multiply do.body by 11 and add do.body.epil* to get the original loop body
-; frequency, 11.
-; UR11: - do.body: float = 0.59359,
-; UR11: - do.body.epil: float = 4.4705,
+; RUN: %{ur-bf} -unroll-count=11 | %{fc} UR11,UR11-ELP
+; RUN: %{ur-bf} -unroll-count=11 -unroll-remainder | %{fc} UR11,UR11-EUR
+;
+; Multiply do.body by 11 and add do.body.epil* for either ELP or EUR to get the
+; original loop body frequency, 11.
+; UR11:     - do.body: float = 0.59359,
+; UR11-ELP: - do.body.epil: float = 4.4705,
+; UR11-EUR: - do.body.epil: float =   1.0428,
+; UR11-EUR: - do.body.epil.1: float = 0.82209,
+; UR11-EUR: - do.body.epil.2: float = 0.64812,
+; UR11-EUR: - do.body.epil.3: float = 0.51097,
+; UR11-EUR: - do.body.epil.4: float = 0.40284,
+; UR11-EUR: - do.body.epil.5: float = 0.31759,
+; UR11-EUR: - do.body.epil.6: float = 0.25038,
+; UR11-EUR: - do.body.epil.7: float = 0.1974,
+; UR11-EUR: - do.body.epil.8: float = 0.15562,
+; UR11-EUR: - do.body.epil.9: float = 0.12269,
 ;
 ; Unrolled loop guard, body, and latch.
 ; UR11: br i1 %{{.*}}, label %do.body.epil.preheader, label %entry.new, !prof !0
 ; UR11-COUNT-11: call void @f
 ; UR11: br i1 %{{.*}}, label %do.end.unr-lcssa, label %do.body, !prof !1, !llvm.loop !2
-
-; Epilogue guard and loop.
+;
+; Epilogue guard.
 ; UR11: br i1 %{{.*}}, label %do.body.epil.preheader, label %do.end, !prof !5
-; UR11: call void @f
-; UR11: br i1 %{{.*}}, label %do.body.epil, label %do.end.epilog-lcssa, !prof !6, !llvm.loop !7
+;
+; Non-unrolled epilogue loop.
+; UR11-ELP: call void @f
+; UR11-ELP: br i1 %{{.*}}, label %do.body.epil, label %do.end.epilog-lcssa, !prof !6, !llvm.loop !7
+;
+; Partially unrolled epilogue loop.
+; UR11-EUR: call void @f
+; UR11-EUR: br i1 %{{.*}}, label %do.body.epil.1, label %do.end.epilog-lcssa, !prof !6
+; UR11-EUR: call void @f
+; UR11-EUR: br i1 %{{.*}}, label %do.body.epil.2, label %do.end.epilog-lcssa, !prof !6
+; UR11-EUR: call void @f
+; UR11-EUR: br i1 %{{.*}}, label %do.body.epil.3, label %do.end.epilog-lcssa, !prof !6
+; UR11-EUR: call void @f
+; UR11-EUR: br i1 %{{.*}}, label %do.body.epil.4, label %do.end.epilog-lcssa, !prof !6
+; UR11-EUR: call void @f
+; UR11-EUR: br i1 %{{.*}}, label %do.body.epil.5, label %do.end.epilog-lcssa, !prof !6
+; UR11-EUR: call void @f
+; UR11-EUR: br i1 %{{.*}}, label %do.body.epil.6, label %do.end.epilog-lcssa, !prof !6
+; UR11-EUR: call void @f
+; UR11-EUR: br i1 %{{.*}}, label %do.body.epil.7, label %do.end.epilog-lcssa, !prof !6
+; UR11-EUR: call void @f
+; UR11-EUR: br i1 %{{.*}}, label %do.body.epil.8, label %do.end.epilog-lcssa, !prof !6
+; UR11-EUR: call void @f
+; UR11-EUR: br i1 %{{.*}}, label %do.body.epil.9, label %do.end.epilog-lcssa, !prof !6
+; UR11-EUR: call void @f
+; UR11-EUR: br i1 %{{.*}}, label %do.body.epil, label %do.end.epilog-lcssa, !prof !6, !llvm.loop !7
 ;
 ; Unrolled loop metadata.
 ; UR11: !0 = !{!"branch_weights", i32 1319535738, i32 827947910}
@@ -163,30 +271,74 @@
 ; UR11: !4 = !{!"llvm.loop.unroll.disable"}
 ; UR11: !5 = !{!"branch_weights", i32 1846907894, i32 300575754}
 ;
-; Epilogue loop metadata.
-; UR11: !6 = !{!"branch_weights", i32 1693034047, i32 454449601}
-; UR11: !7 = distinct !{!7, !8, !4}
-; UR11: !8 = !{!"llvm.loop.estimated_trip_count", i32 0}
+; The unrolled epilogue loop does not lose any conditional branches, so:
+; - The non-unrolled epilogue branch weights are shared across them.
+; - The unrolled epilogue loop has an llvm.loop.estimated_trip_count.  This is
+;   our first case where it is different than the unrolled loop's, so it has its
+;   own metadata node.  But it happens to be the same as the non-unrolled
+;   epilogue loop's.
+; UR11:     !6 = !{!"branch_weights", i32 1693034047, i32 454449601}
+; UR11-ELP: !7 = distinct !{!7, !8, !4}
+; UR11-EUR: !7 = distinct !{!7, !8}
+; UR11:     !8 = !{!"llvm.loop.estimated_trip_count", i32 0}
 
 ; ------------------------------------------------------------------------------
 ; Check -unroll-count=12.
 ;
-; RUN: %{ur-bf} -unroll-count=12 | %{fc} UR12
-;
-; Multiply do.body by 12 and add do.body.epil* to get the original loop body
-; frequency, 11.
-; UR12: - do.body: float = 0.5144,
-; UR12: - do.body.epil: float = 4.8272,
+; RUN: %{ur-bf} -unroll-count=12 | %{fc} UR12,UR12-ELP
+; RUN: %{ur-bf} -unroll-count=12 -unroll-remainder | %{fc} UR12,UR12-EUR
+;
+; Multiply do.body by 12 and add do.body.epil* for either ELP or EUR to get the
+; original loop body frequency, 11.
+; UR12:     - do.body: float = 0.5144,
+; UR12-ELP: - do.body.epil: float = 4.8272,
+; UR12-EUR: - do.body.epil: float = 1.0463,
+; UR12-EUR: - do.body.epil.1: float = 0.83968,
+; UR12-EUR: - do.body.epil.2: float = 0.67387,
+; UR12-EUR: - do.body.epil.3: float = 0.5408,
+; UR12-EUR: - do.body.epil.4: float = 0.43401,
+; UR12-EUR: - do.body.epil.5: float = 0.3483,
+; UR12-EUR: - do.body.epil.6: float = 0.27952,
+; UR12-EUR: - do.body.epil.7: float = 0.22433,
+; UR12-EUR: - do.body.epil.8: float = 0.18003,
+; UR12-EUR: - do.body.epil.9: float = 0.14448,
+; UR12-EUR: - do.body.epil.10: float = 0.11595,
 ;
 ; Unrolled loop guard, body, and latch.
 ; UR12: br i1 %{{.*}}, label %do.body.epil.preheader, label %entry.new, !prof !0
 ; UR12-COUNT-12: call void @f
 ; UR12: br i1 %{{.*}}, label %do.end.unr-lcssa, label %do.body, !prof !1, !llvm.loop !2
 ;
-; Epilogue guard and loop.
+; Epilogue guard.
 ; UR12: br i1 %{{.*}}, label %do.body.epil.preheader, label %do.end, !prof !5
-; UR12: call void @f
-; UR12: br i1 %{{.*}}, label %do.body.epil, label %do.end.epilog-lcssa, !prof !6, !llvm.loop !7
+;
+; Non-unrolled epilogue loop.
+; UR12-ELP: call void @f
+; UR12-ELP: br i1 %{{.*}}, label %do.body.epil, label %do.end.epilog-lcssa, !prof !6, !llvm.loop !7
+;
+; Partially unrolled epilogue loop.
+; UR12-EUR: call void @f
+; UR12-EUR: br i1 %{{.*}}, label %do.body.epil.1, label %do.end.epilog-lcssa, !prof !6
+; UR12-EUR: call void @f
+; UR12-EUR: br i1 %{{.*}}, label %do.body.epil.2, label %do.end.epilog-lcssa, !prof !6
+; UR12-EUR: call void @f
+; UR12-EUR: br i1 %{{.*}}, label %do.body.epil.3, label %do.end.epilog-lcssa, !prof !6
+; UR12-EUR: call void @f
+; UR12-EUR: br i1 %{{.*}}, label %do.body.epil.4, label %do.end.epilog-lcssa, !prof !6
+; UR12-EUR: call void @f
+; UR12-EUR: br i1 %{{.*}}, label %do.body.epil.5, label %do.end.epilog-lcssa, !prof !6
+; UR12-EUR: call void @f
+; UR12-EUR: br i1 %{{.*}}, label %do.body.epil.6, label %do.end.epilog-lcssa, !prof !6
+; UR12-EUR: call void @f
+; UR12-EUR: br i1 %{{.*}}, label %do.body.epil.7, label %do.end.epilog-lcssa, !prof !6
+; UR12-EUR: call void @f
+; UR12-EUR: br i1 %{{.*}}, label %do.body.epil.8, label %do.end.epilog-lcssa, !prof !6
+; UR12-EUR: call void @f
+; UR12-EUR: br i1 %{{.*}}, label %do.body.epil.9, label %do.end.epilog-lcssa, !prof !6
+; UR12-EUR: call void @f
+; UR12-EUR: br i1 %{{.*}}, label %do.body.epil.10, label %do.end.epilog-lcssa, !prof !6
+; UR12-EUR: call void @f
+; UR12-EUR: br i1 %{{.*}}, label %do.body.epil, label %do.end.epilog-lcssa, !prof !6, !llvm.loop !7
 ;
 ; Unrolled loop metadata.
 ; UR12: !0 = !{!"branch_weights", i32 1394803730, i32 752679918}
@@ -196,10 +348,16 @@
 ; UR12: !4 = !{!"llvm.loop.unroll.disable"}
 ; UR12: !5 = !{!"branch_weights", i32 1860963812, i32 286519836}
 ;
-; Epilogue loop metadata.
-; UR12: !6 = !{!"branch_weights", i32 1723419551, i32 424064097}
-; UR12: !7 = distinct !{!7, !8, !4}
-; UR12: !8 = !{!"llvm.loop.estimated_trip_count", i32 11}
+; The unrolled epilogue loop does not lose any conditional branches, so:
+; - The non-unrolled epilogue branch weights are shared across them.
+; - The unrolled epilogue loop has an llvm.loop.estimated_trip_count.  This is
+;   our first case where it is different than both the unrolled loop's and the
+;   non-unrolled epilogue loop's, so they all have distinct metadata nodes.
+; UR12:     !6 = !{!"branch_weights", i32 1723419551, i32 424064097}
+; UR12-ELP: !7 = distinct !{!7, !8, !4}
+; UR12-ELP: !8 = !{!"llvm.loop.estimated_trip_count", i32 11}
+; UR12-EUR: !7 = distinct !{!7, !8}
+; UR12-EUR: !8 = !{!"llvm.loop.estimated_trip_count", i32 1}
 
 declare void @f(i32)
 
diff --git a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial-unconditional-latch.ll b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial-unconditional-latch.ll
new file mode 100644
index 0000000000000..dafb2a3ca4ed9
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial-unconditional-latch.ll
@@ -0,0 +1,280 @@
+; Test branch weight metadata, estimated trip count metadata, and block
+; frequencies after partial loop unrolling without -unroll-runtime such that
+; some iterations' latches become unconditional, which often contradicts the
+; original branch weights.
+;
+; (unroll-complete.ll tests complete loop unrolling, in which the final unrolled
+; iteration unconditionally exits (backedge removed).  Here, we include cases
+; where the final iteration's latch unconditionally continues instead.)
+;
+; For each case, we check:
+; - Iteration frequencies
+;   - When each is multiplied by the number of original loop bodies that execute
+;     within it, they should sum to almost exactly the original loop body
+;     frequency.
+;   - The only exception is an impossibly high or low original frequency (e.g.,
+;     due to bad profile data), for which there exist no new branch weights that
+;     can yield that frequency sum.  In those cases, we expect the maximum or
+;     minimum possible frequency.
+; - CFGs
+;   - We verify which branch weights go with which branches and that we did not
+;     overlook any other branch weights (no extra !prof or branch_weights).
+;   - We also check the number of original loop bodies (represented by a call to
+;     @f) that appear within each unrolled iteration.
+; - Branch weight metadata
+;   - Checking frequencies already checks whether the branch weights have the
+;     expected effect, but we also want to check that we get uniform
+;     probabilities/weights (same !prof) across the unrolled iteration latches
+;     when expected.
+; - llvm.loop.estimated_trip_count
+;   - It must be the number of iterations of the unrolled loop required for the
+;     original loop body to reach its original frequency.
+;   - It must not be blindly computed from any new latch branch weights.
+
+; ------------------------------------------------------------------------------
+; Define LIT substitutions.
+;
+; For verifying that the test code produces the original loop body frequency we
+; expect.
+; DEFINE: %{bf-fc} = opt %t.ll -S -passes='print<block-freq>' 2>&1 | \
+; DEFINE:   FileCheck %s -check-prefixes
+;
+; For checking the unrolled loop:
+; DEFINE: %{ur-bf} = opt %t.ll -S -passes='loop-unroll,print<block-freq>' 2>&1
+; DEFINE: %{fc} = FileCheck %s \
+; DEFINE:     -implicit-check-not='llvm.loop.estimated_trip_count' \
+; DEFINE:     -implicit-check-not='!prof' \
+; DEFINE:     -implicit-check-not='branch_weights' \
+; DEFINE:     -implicit-check-not='call void @f' -check-prefixes
+
+; ------------------------------------------------------------------------------
+; Check cases when the original loop's number of iterations is a run-time
+; determined multiple of 10 and the original loop body frequency is 10.
+;
+;   RUN: sed -e s/@N@/%mul10/ -e s/@W@/9/ %s > %t.ll
+;
+; At compile time, the possibilities for that value always include unroll count
+; x 10 x N for any integer N >= 1, so the unrolled loop's backedge always
+; remains conditional.  Cases where it becomes unconditional are checked later
+; in this test file with the CONST4 config.
+;
+; Check the original loop body frequency.
+;
+;   RUN: %{bf-fc} MULT-ORIG
+;   MULT-ORIG: - do.body: float = 10.0,
+;
+; When the unroll count is odd, every iteration's latch remains conditional, so
+; their original probabilities are not contradicted.  That is, the original loop
+; latch's branch weights remain on all unrolled iterations' latches.
+;
+;   RUN: %{ur-bf} -unroll-count=3 | %{fc} MULT3
+;
+;   Sums to approximately the original loop body frequency, 10.
+;   MULT3: - do.body: float = 3.69,
+;   MULT3: - do.body.1: float = 3.321,
+;   MULT3: - do.body.2: float = 2.9889,
+;
+;   MULT3: call void @f
+;   MULT3: br i1 %{{.*}}, label %do.body.1, label %do.end, !prof !0
+;   MULT3: call void @f
+;   MULT3: br i1 %{{.*}}, label %do.body.2, label %do.end, !prof !0
+;   MULT3: call void @f
+;   MULT3: br i1 %{{.*}}, label %do.body, label %do.end, !prof !0, !llvm.loop !1
+;
+;   MULT3: !0 = !{!"branch_weights", i32 9, i32 1}
+;   MULT3: !1 = distinct !{!1, !2, !3}
+;   MULT3: !2 = !{!"llvm.loop.estimated_trip_count", i32 4}
+;   MULT3: !3 = !{!"llvm.loop.unroll.disable"}
+;
+; When the unroll count is even, odd-numbered unrolled iterations become
+; unconditional, so branch weights must be adjusted.
+;
+;   -unroll-count=2, so there is 1 remaining conditional latch, so the
+;   implementation can compute uniform weights by solving a linear equation.
+;
+;     RUN: %{ur-bf} -unroll-count=2 | %{fc} MULT2
+;
+;     Multiply by 2 to get the original loop body frequency, 10.
+;     FIXME: Should be 5.0:
+;     MULT2: - do.body: float = 10.0,
+;
+;     MULT2:     call void @f
+;     MULT2-NOT: br
+;     MULT2:     call void @f
+;     MULT2:     br i1 %{{.*}}, label %do.body, label %do.end, !prof !0, !llvm.loop !1{{$}}
+;
+;     The branch weights imply the estimated trip count is
+;     (1717986918+429496730)/429496730 = approximately (8+2)/2 = 5.
+;     FIXME: Or at least they should.
+;     MULT2: !0 = !{!"branch_weights", i32 9, i32 1}
+;     MULT2: !1 = distinct !{!1, !2, !3}
+;     MULT2: !2 = !{!"llvm.loop.estimated_trip_count", i32 5}
+;     MULT2: !3 = !{!"llvm.loop.unroll.disable"}
+;
+;   -unroll-count=4, so there are 2 remaining conditional latches, so the
+;   implementation can compute uniform weights using the quadratic formula.
+;
+;     RUN: %{ur-bf} -unroll-count=4 | %{fc} MULT4
+;
+;     Multiply by 2 and sum to get the original loop body frequency, 10.
+;     FIXME: Should sum to 5.0:
+;     MULT4: - do.body: float = 5.2632,
+;     MULT4: - do.body.2: float = 4.7368,
+;
+;     MULT4:     call void @f
+;     MULT4-NOT: br
+;     MULT4:     call void @f
+;     MULT4:     br i1 %{{.*}}, label %do.body.2, label %do.end, !prof !0
+;     MULT4:     call void @f
+;     MULT4-NOT: br
+;     MULT4:     call void @f
+;     MULT4:     br i1 %{{.*}}, label %do.body, label %do.end, !prof !0, !llvm.loop !1
+;
+;     MULT4 is like applying -unroll-count=2 to MULT2 without converting any
+;     more conditional latches to unconditional, so MULT2's branch weights work.
+;     MULT4: !0 = !{!"branch_weights", i32 9, i32 1}
+;     MULT4: !1 = distinct !{!1, !2, !3}
+;     MULT4: !2 = !{!"llvm.loop.estimated_trip_count", i32 3}
+;     MULT4: !3 = !{!"llvm.loop.unroll.disable"}
+
+; ------------------------------------------------------------------------------
+; Check case when the original loop's number of iterations is a run-time
+; determined multiple of 10, the unroll count is even so that odd-numbered
+; unrolled iterations become unconditional, and the original loop body frequency
+; is 1, which is impossibly low.  This case is important to ensure the
+; implementation does not malfunction by trying to use negative and possibly
+; infinite probabilities to reach the original loop body frequency.
+;
+;   RUN: sed -e s/@N@/%mul10/ -e s/@W@/0/ %s > %t.ll
+;
+; Check the original loop body frequency.
+;
+;   RUN: %{bf-fc} LOW-ORIG
+;   LOW-ORIG: - do.body: float = 1.0,
+;
+; -unroll-count=2, so there is 1 remaining conditional latch.  The
+; implementation tries to compute uniform weights by solving a linear equation
+; but ultimately sets the latch's probability to zero.
+;
+;   RUN: %{ur-bf} -unroll-count=2 | %{fc} LOW2
+;
+;   Multiply by 2, but the result is greater than the original loop body
+;   frequency, 1, which is impossibly low.
+;   LOW2: - do.body: float = 1.0,
+;
+;   LOW2:     call void @f
+;   LOW2-NOT: br
+;   LOW2:     call void @f
+;   LOW2:     br i1 %{{.*}}, label %do.body, label %do.end, !prof !0, !llvm.loop !1{{$}}
+;
+;   LOW2: !0 = !{!"branch_weights", i32 0, i32 1}
+;   LOW2: !1 = distinct !{!1, !2, !3}
+;   LOW2: !2 = !{!"llvm.loop.estimated_trip_count", i32 1}
+;   LOW2: !3 = !{!"llvm.loop.unroll.disable"}
+;
+; -unroll-count=4, so there are 2 remaining conditional latches.  The
+; implementation tries to compute uniform weights using the quadratic formula
+; but ultimately sets both latches' probabilities to zero.
+;
+;   RUN: %{ur-bf} -unroll-count=4 | %{fc} LOW4
+;
+;   Multiply by 2 and sum, but the result is greater than the original loop body
+;   frequency, 1, which is impossibly low.
+;   LOW4: - do.body: float = 1.0,
+;   LOW4: - do.body.2: float = 0.0{{(0000[0-9]*)?}},
+;
+;   LOW4:     call void @f
+;   LOW4-NOT: br
+;   LOW4:     call void @f
+;   LOW4:     br i1 %{{.*}}, label %do.body.2, label %do.end, !prof !0
+;   LOW4:     call void @f
+;   LOW4-NOT: br
+;   LOW4:     call void @f
+;   LOW4:     br i1 %{{.*}}, label %do.body, label %do.end, !prof !0, !llvm.loop !1
+;
+;   LOW4: !0 = !{!"branch_weights", i32 0, i32 1}
+;   LOW4: !1 = distinct !{!1, !2, !3}
+;   LOW4: !2 = !{!"llvm.loop.estimated_trip_count", i32 1}
+;   LOW4: !3 = !{!"llvm.loop.unroll.disable"}
+
+; ------------------------------------------------------------------------------
+; Check cases when the original loop's number of iterations is a constant 10 and
+; the original loop body frequency is 10.
+;
+;   RUN: sed -e s/@N@/10/g -e s/@W@/9/ %s > %t.ll
+;
+; Because we test only partial unrolling, there is always exactly one unrolled
+; iteration that can possibly exit, so only its latch can remain conditional.
+; Because there is only one, its branch weights can be computed with a simple
+; formula.
+;
+; Check the original loop body frequency.
+;
+;   RUN: %{bf-fc} CONST-ORIG
+;   CONST-ORIG: - do.body: float = 10.0,
+;
+; Check when the unrolled loop's backedge remains conditional.
+;
+;   RUN: %{ur-bf} -unroll-count=2 | %{fc} CONST2
+;
+;   Multiply by 2 to get the original loop body frequency, 10.
+;   FIXME: Should be 5.0:
+;   CONST2: - do.body: float = 10.0,
+;
+;   CONST2:     call void @f
+;   CONST2-NOT: br
+;   CONST2:     call void @f
+;   CONST2:     br i1 %{{.*}}, label %do.body, label %do.end, !prof !0, !llvm.loop !1
+;
+;   Like MULT2.
+;   CONST2: !0 = !{!"branch_weights", i32 9, i32 1}
+;   CONST2: !1 = distinct !{!1, !2, !3}
+;   CONST2: !2 = !{!"llvm.loop.estimated_trip_count", i32 5}
+;   CONST2: !3 = !{!"llvm.loop.unroll.disable"}
+;
+; Check when the unrolled loop's backedge unconditionally continues.
+;
+;   RUN: %{ur-bf} -unroll-count=4 | %{fc} CONST4
+;
+;   Multiply by 2 and sum to get the original loop body frequency, 10.
+;   FIXME: Should sum to 5.0:
+;   CONST4: - do.body: float = 10.0,
+;   CONST4: - do.body.2: float = 9.0,
+;
+;   CONST4:     call void @f
+;   CONST4-NOT: br
+;   CONST4:     call void @f
+;   CONST4:     br i1 %{{.*}}, label %do.body.2, label %do.end, !prof !0
+;   CONST4:     call void @f
+;   CONST4-NOT: br
+;   CONST4:     call void @f
+;   CONST4:     br label %do.body, !llvm.loop !1
+;
+;   There is no llvm.loop.estimated_trip_count because the unrolled loop's latch
+;   in do.body.2 unconditionally continues.  The branch weights on do.body's
+;   branch imply do.body continues twice and then exits once, thus executing the
+;   original loop body 10 times.  FIXME: Or at least they should.
+;   CONST4: !0 = !{!"branch_weights", i32 9, i32 1}
+;   CONST4: !1 = distinct !{!1, !2}
+;   CONST4: !2 = !{!"llvm.loop.unroll.disable"}
+
+declare void @f(i32)
+
+define void @test(i32 %n) {      ; single-block loop: do.body is body and latch
+entry:
+  %mul10 = mul i32 %n, 10        ; run-time bound used by the RUN lines that pass %mul10 to sed
+  br label %do.body
+
+do.body:
+  %i = phi i32 [ 0, %entry ], [ %next, %do.body ]
+  call void @f(i32 %i)           ; each call stands for one original loop body in the CHECK lines
+  %next = add i32 %i, 1
+  %c = icmp ne i32 %next, @N@    ; sed in the RUN lines replaces the placeholder with the bound
+  br i1 %c, label %do.body, label %do.end, !prof !0 ; latch; !0 weights are also filled in by sed
+
+do.end:
+  ret void
+}
+
+; Loop body frequency is @W@ + 1.
+!0 = !{!"branch_weights", i32 @W@, i32 1}
diff --git a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial.ll b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial.ll
index af5342c5e35cd..ea6f4a4180fc9 100644
--- a/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial.ll
+++ b/llvm/test/Transforms/LoopUnroll/branch-weights-freq/unroll-partial.ll
@@ -1,5 +1,6 @@
 ; Test branch weight metadata, estimated trip count metadata, and block
-; frequencies after partial loop unrolling without -unroll-runtime.
+; frequencies after partial loop unrolling without -unroll-runtime and without
+; converting any iteration's latch to an unconditional branch.
 
 ; ------------------------------------------------------------------------------
 ; RUN: opt < %s -S -passes='print<block-freq>' 2>&1 | \
diff --git a/llvm/test/Transforms/LoopUnroll/loop-probability-one.ll b/llvm/test/Transforms/LoopUnroll/loop-probability-one.ll
index 14f6da42df6b1..89915d29f5921 100644
--- a/llvm/test/Transforms/LoopUnroll/loop-probability-one.ll
+++ b/llvm/test/Transforms/LoopUnroll/loop-probability-one.ll
@@ -1,73 +1,97 @@
 ; Check that a loop probability of one (indicating an always infinite loop) does
 ; not crash or otherwise break LoopUnroll behavior when it tries to compute new
 ; probabilities from it.
-;
-; That case indicates an always infinite loop.  A remainder loop cannot be
-; calculated at run time when the original loop is infinite as infinity %
-; UnrollCount is undefined, so consistent remainder loop probabilities are
-; difficult or impossible to reason about.  The implementation chooses
-; probabilities indicating that all remainder loop iterations will always
-; execute.
-
-; DEFINE: %{unroll} = opt < %s -unroll-count=3 -passes=loop-unroll -S
-; DEFINE: %{rt} = %{unroll} -unroll-runtime
-
-; RUN: %{unroll} | FileCheck %s -check-prefix UNROLL
-; RUN: %{rt} -unroll-runtime-epilog=true | FileCheck %s -check-prefix EPILOG
-; RUN: %{rt} -unroll-runtime-epilog=false | FileCheck %s -check-prefix PROLOG
-
-define void @test(i32 %n) {
-entry:
-  br label %loop
 
-loop:
-  %i = phi i32 [ 0, %entry ], [ %inc, %loop ]
-  %inc = add i32 %i, 1
-  %c = icmp slt i32 %inc, %n
-  br i1 %c, label %loop, label %end, !prof !0
+; DEFINE: %{unroll} = opt < %t.ll -unroll-count=3 -passes=loop-unroll -S
+; DEFINE: %{fc} = FileCheck %s \
+; DEFINE:     -implicit-check-not='llvm.loop.estimated_trip_count' \
+; DEFINE:     -implicit-check-not='!prof' \
+; DEFINE:     -implicit-check-not='branch_weights' \
+; DEFINE:     -implicit-check-not='call void @f' -check-prefixes
 
-end:
-  ret void
-}
+; ------------------------------------------------------------------------------
+; A partially unrolled loop remains infinite.
+;
+; RUN: sed -e s/@N@/%n/ %s > %t.ll
+; RUN: %{unroll} | %{fc} PART-ALL-COND
+;
+; PART-ALL-COND: call void @f
+; PART-ALL-COND: br i1 %{{.*}}, label %loop.1, label %end, !prof !0
+; PART-ALL-COND: call void @f
+; PART-ALL-COND: br i1 %{{.*}}, label %loop.2, label %end, !prof !0
+; PART-ALL-COND: call void @f
+; PART-ALL-COND: br i1 %{{.*}}, label %loop, label %end, !prof !0, !llvm.loop !1
+; PART-ALL-COND: !0 = !{!"branch_weights", i32 1, i32 0}
 
+; ------------------------------------------------------------------------------
+; A partially unrolled loop remains infinite even if some iterations' latches
+; become unconditional.
+;
+; RUN: sed -e s/@N@/5/ %s > %t.ll
+; RUN: %{unroll} | %{fc} PART-SOME-COND
+;
+; PART-SOME-COND:     call void @f
+; PART-SOME-COND-NOT: br
+; PART-SOME-COND:     call void @f
+; PART-SOME-COND:     br i1 %{{.*}}, label %loop.2, label %end, !prof !0
+; PART-SOME-COND:     call void @f
+; PART-SOME-COND:     br label %loop, !llvm.loop !1
+; PART-SOME-COND:     !0 = !{!"branch_weights", i32 1, i32 0}
 
-!0 = !{!"branch_weights", i32 1, i32 0}
+; ------------------------------------------------------------------------------
+; A completely unrolled loop cannot be infinite, so consistent unrolled loop
+; probabilities are impossible.  The implementation chooses probabilities
+; indicating that all unrolled loop iterations will always execute.
+;
+; RUN: sed -e s/@N@/%max3/ %s > %t.ll
+; RUN: %{unroll} | %{fc} COMPLETE-SOME-COND
+;
+; COMPLETE-SOME-COND: call void @f
+; COMPLETE-SOME-COND: br i1 %{{.*}}, label %loop.1, label %end, !prof !0
+; COMPLETE-SOME-COND: call void @f
+; COMPLETE-SOME-COND: br i1 %{{.*}}, label %loop.2, label %end, !prof !0
+; COMPLETE-SOME-COND: call void @f
+; COMPLETE-SOME-COND: br label %end
+; COMPLETE-SOME-COND: !0 = !{!"branch_weights", i32 1, i32 0}
 
-; UNROLL: define void @test(i32 %n) {
-; UNROLL: entry:
-; UNROLL:   br label %loop
-; UNROLL: loop:
-; UNROLL:   br i1 %c, label %loop.1, label %end, !prof !0
-; UNROLL: loop.1:
-; UNROLL:   br i1 %c.1, label %loop.2, label %end, !prof !0
-; UNROLL: loop.2:
-; UNROLL:   br i1 %c.2, label %loop, label %end, !prof !0, !llvm.loop !1
-; UNROLL-NOT: loop.3
-; UNROLL: end:
-; UNROLL:   ret void
-; UNROLL: }
-;
-; Infinite unrolled loop.
-; UNROLL: !0 = !{!"branch_weights", i32 1, i32 0}
+; ------------------------------------------------------------------------------
+; A completely unrolled loop with no remaining conditional latches gives the
+; implementation no probabilities to set.  Check that it still behaves.
+;
+; RUN: sed -e s/@N@/3/ %s > %t.ll
+; RUN: %{unroll} | %{fc} COMPLETE-NO-COND
+;
+; COMPLETE-NO-COND:     call void @f
+; COMPLETE-NO-COND-NOT: br
+; COMPLETE-NO-COND:     call void @f
+; COMPLETE-NO-COND-NOT: br
+; COMPLETE-NO-COND:     call void @f
 
-; EPILOG: define void @test(i32 %n) {
-; EPILOG: entry:
-; EPILOG:   br i1 %{{.*}}, label %loop.epil.preheader, label %entry.new, !prof !0
-; EPILOG: entry.new:
-; EPILOG:   br label %loop
-; EPILOG: loop:
-; EPILOG:   br i1 %{{.*}}, label %loop, label %end.unr-lcssa, !prof !1
-; EPILOG: end.unr-lcssa:
-; EPILOG:   br i1 %{{.*}}, label %loop.epil.preheader, label %end, !prof !1
-; EPILOG: loop.epil.preheader:
-; EPILOG:   br label %loop.epil
-; EPILOG: loop.epil:
-; EPILOG:   br i1 %{{.*}}, label %loop.epil, label %end.epilog-lcssa, !prof !4
-; EPILOG: end.epilog-lcssa:
-; EPILOG:   br label %end
-; EPILOG: end:
-; EPILOG:   ret void
-; EPILOG: }
+; ------------------------------------------------------------------------------
+; A remainder loop cannot be calculated at run time when the original loop is
+; infinite as infinity % UnrollCount is undefined, so consistent remainder loop
+; probabilities are difficult or impossible to reason about.  The implementation
+; chooses probabilities indicating that all remainder loop iterations will
+; always execute.
+;
+; RUN: sed -e s/@N@/%n/ %s > %t.ll
+; DEFINE: %{rt} = %{unroll} -unroll-runtime
+; RUN: %{rt} -unroll-runtime-epilog=true | %{fc} EPILOG
+; RUN: %{rt} -unroll-runtime-epilog=false | %{fc} PROLOG
+;
+; Unrolled loop guard, body, and latch.
+; EPILOG:     br i1 %{{.*}}, label %loop.epil.preheader, label %entry.new, !prof !0
+; EPILOG:     call void @f
+; EPILOG-NOT: br
+; EPILOG:     call void @f
+; EPILOG-NOT: br
+; EPILOG:     call void @f
+; EPILOG:     br i1 %{{.*}}, label %loop, label %end.unr-lcssa, !prof !1
+;
+; Epilogue guard, body, and latch.
+; EPILOG: br i1 %{{.*}}, label %loop.epil.preheader, label %end, !prof !1
+; EPILOG: call void @f
+; EPILOG: br i1 %{{.*}}, label %loop.epil, label %end.epilog-lcssa, !prof !4
 ;
 ; Unrolled loop guard: Unrolled loop is always entered.
 ; EPILOG: !0 = !{!"branch_weights", i32 0, i32 -2147483648}
@@ -78,27 +102,20 @@ end:
 ;
 ; Epilogue loop latch: Epilogue loop executes both of its 2 iterations.
 ; EPILOG: !4 = !{!"branch_weights", i32 1073741824, i32 1073741824}
-
-; PROLOG: define void @test(i32 %n) {
-; PROLOG: entry:
-; PROLOG:   br i1 %{{.*}}, label %loop.prol.preheader, label %loop.prol.loopexit, !prof !0
-; PROLOG: loop.prol.preheader:
-; PROLOG:   br label %loop.prol
-; PROLOG: loop.prol:
-; PROLOG:   br i1 %{{.*}}, label %loop.prol, label %loop.prol.loopexit.unr-lcssa, !prof !1
-; PROLOG: loop.prol.loopexit.unr-lcssa:
-; PROLOG:   br label %loop.prol.loopexit
-; PROLOG: loop.prol.loopexit:
-; PROLOG:   br i1 %{{.*}}, label %end, label %entry.new, !prof !0
-; PROLOG: entry.new:
-; PROLOG:   br label %loop
-; PROLOG: loop:
-; PROLOG:   br i1 %{{.*}}, label %loop, label %end.unr-lcssa, !prof !4
-; PROLOG: end.unr-lcssa:
-; PROLOG:   br label %end
-; PROLOG: end:
-; PROLOG:   ret void
-; PROLOG: }
+;
+; Prologue guard, body, and latch.
+; PROLOG: br i1 %{{.*}}, label %loop.prol.preheader, label %loop.prol.loopexit, !prof !0
+; PROLOG: call void @f
+; PROLOG: br i1 %{{.*}}, label %loop.prol, label %loop.prol.loopexit.unr-lcssa, !prof !1
+;
+; Unrolled loop guard, body, and latch.
+; PROLOG:     br i1 %{{.*}}, label %end, label %entry.new, !prof !0
+; PROLOG:     call void @f
+; PROLOG-NOT: br
+; PROLOG:     call void @f
+; PROLOG-NOT: br
+; PROLOG:     call void @f
+; PROLOG:     br i1 %{{.*}}, label %loop, label %end.unr-lcssa, !prof !4
 ;
 ; FIXME: Branch weights still need to be fixed in the case of prologues (issue
 ; #135812), so !0 and !1 do not yet match their comments below.  When we do
@@ -114,3 +131,23 @@ end:
 ;
 ; Unrolled loop latch: Unrolled loop is infinite.
 ; PROLOG: !4 = !{!"branch_weights", i32 1, i32 0}
+
+declare void @f(i32)
+
+define void @test(i32 %n) {      ; single-block loop: %loop is body and latch
+entry:
+  %max3 = call i32 @llvm.umin.i32(i32 %n, i32 3) ; unsigned min of %n and 3; used by the %max3 RUN
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %inc, %loop ]
+  call void @f(i32 %i)           ; CHECK lines count these calls per unrolled iteration
+  %inc = add i32 %i, 1
+  %c = icmp slt i32 %inc, @N@    ; sed in the RUN lines replaces the placeholder with the bound
+  br i1 %c, label %loop, label %end, !prof !0 ; latch; !0 has weight 1 to continue, 0 to exit
+
+end:
+  ret void
+}
+
+!0 = !{!"branch_weights", i32 1, i32 0}