[llvm] LoopRotationUtils: Fix underflow for zero-branch weights (PR #66681)
Matthias Braun via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 18 11:50:52 PDT 2023
https://github.com/MatzeB updated https://github.com/llvm/llvm-project/pull/66681
>From 7a590d778a9075a766aae75ec3795d4e34e2e3ef Mon Sep 17 00:00:00 2001
From: Matthias Braun <matze at braunis.de>
Date: Mon, 18 Sep 2023 11:20:53 -0700
Subject: [PATCH] LoopRotationUtils: Special case zero-branch weight cases
The formula I added to LoopRotationUtils does not produce reasonable
results if some of the branch weights are zero: the unsigned weight
subtractions can underflow. Add special-case handling for this.
---
.../Transforms/Utils/LoopRotationUtils.cpp | 75 +++++++++-----
.../LoopRotate/update-branch-weights.ll | 97 +++++++++++++++++++
2 files changed, 149 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index 22effcf7d88afd2..012aa5dbb9ca004 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -295,33 +295,62 @@ static void updateBranchWeights(BranchInst &PreHeaderBI, BranchInst &LoopBI,
// We cannot generally deduce how often we had a zero-trip count loop so we
// have to make a guess for how to distribute x among the new x0 and x1.
- uint32_t ExitWeight0 = 0; // aka x0
- if (HasConditionalPreHeader) {
- // Here we cannot know how many 0-trip count loops we have, so we guess:
- if (OrigLoopBackedgeWeight > OrigLoopExitWeight) {
- // If the loop count is bigger than the exit count then we set
- // probabilities as if 0-trip count nearly never happens.
- ExitWeight0 = ZeroTripCountWeights[0];
- // Scale up counts if necessary so we can match `ZeroTripCountWeights` for
- // the `ExitWeight0`:`ExitWeight1` (aka `x0`:`x1` ratio`) ratio.
- while (OrigLoopExitWeight < ZeroTripCountWeights[1] + ExitWeight0) {
- // ... but don't overflow.
- uint32_t const HighBit = uint32_t{1} << (sizeof(uint32_t) * 8 - 1);
- if ((OrigLoopBackedgeWeight & HighBit) != 0 ||
- (OrigLoopExitWeight & HighBit) != 0)
- break;
- OrigLoopBackedgeWeight <<= 1;
- OrigLoopExitWeight <<= 1;
+ uint32_t ExitWeight0; // aka x0
+ uint32_t ExitWeight1; // aka x1
+ uint32_t EnterWeight; // aka y0
+ uint32_t LoopBackWeight; // aka y1
+ if (OrigLoopExitWeight > 0 && OrigLoopBackedgeWeight > 0) {
+ ExitWeight0 = 0;
+ if (HasConditionalPreHeader) {
+ // Here we cannot know how many 0-trip count loops we have, so we guess:
+ if (OrigLoopBackedgeWeight >= OrigLoopExitWeight) {
+ // If the loop count is bigger than the exit count then we set
+ // probabilities as if 0-trip count nearly never happens.
+ ExitWeight0 = ZeroTripCountWeights[0];
+ // Scale up counts if necessary so we can match `ZeroTripCountWeights`
+ // for the `ExitWeight0`:`ExitWeight1` (aka `x0`:`x1`) ratio.
+ while (OrigLoopExitWeight < ZeroTripCountWeights[1] + ExitWeight0) {
+ // ... but don't overflow.
+ uint32_t const HighBit = uint32_t{1} << (sizeof(uint32_t) * 8 - 1);
+ if ((OrigLoopBackedgeWeight & HighBit) != 0 ||
+ (OrigLoopExitWeight & HighBit) != 0)
+ break;
+ OrigLoopBackedgeWeight <<= 1;
+ OrigLoopExitWeight <<= 1;
+ }
+ } else {
+ // If there's a higher exit-count than backedge-count then we set
+ // probabilities as if there are only 0-trip and 1-trip cases.
+ ExitWeight0 = OrigLoopExitWeight - OrigLoopBackedgeWeight;
}
+ }
+ ExitWeight1 = OrigLoopExitWeight - ExitWeight0;
+ EnterWeight = ExitWeight1;
+ LoopBackWeight = OrigLoopBackedgeWeight - EnterWeight;
+ } else if (OrigLoopExitWeight == 0) {
+ if (OrigLoopBackedgeWeight == 0) {
+ // degenerate case... keep everything zero...
+ ExitWeight0 = 0;
+ ExitWeight1 = 0;
+ EnterWeight = 0;
+ LoopBackWeight = 0;
} else {
- // If there's a higher exit-count than backedge-count then we set
- // probabilities as if there are only 0-trip and 1-trip cases.
- ExitWeight0 = OrigLoopExitWeight - OrigLoopBackedgeWeight;
+ // Special case "LoopExitWeight == 0" weights, which behave like an
+ // endless loop where we don't want loop-entry (y0) to be the same as
+ // loop-exit (x1).
+ ExitWeight0 = 0;
+ ExitWeight1 = 0;
+ EnterWeight = 1;
+ LoopBackWeight = OrigLoopBackedgeWeight;
}
+ } else {
+ // loop is never entered.
+ assert(OrigLoopBackedgeWeight == 0 && "remaining case is backedge zero");
+ ExitWeight0 = 1;
+ ExitWeight1 = 1;
+ EnterWeight = 0;
+ LoopBackWeight = 0;
}
- uint32_t ExitWeight1 = OrigLoopExitWeight - ExitWeight0; // aka x1
- uint32_t EnterWeight = ExitWeight1; // aka y0
- uint32_t LoopBackWeight = OrigLoopBackedgeWeight - EnterWeight; // aka y1
MDBuilder MDB(LoopBI.getContext());
MDNode *LoopWeightMD =
diff --git a/llvm/test/Transforms/LoopRotate/update-branch-weights.ll b/llvm/test/Transforms/LoopRotate/update-branch-weights.ll
index 9af6cfab4a2411d..f587ed99ab84daa 100644
--- a/llvm/test/Transforms/LoopRotate/update-branch-weights.ll
+++ b/llvm/test/Transforms/LoopRotate/update-branch-weights.ll
@@ -23,6 +23,7 @@
; BFI_AFTER: - inner_loop_exit: {{.*}} count = 1000
; BFI_AFTER: - outer_loop_exit: {{.*}} count = 1
+; IR-LABEL: define void @func0
; IR: inner_loop_body:
; IR: br i1 %cmp1, label %inner_loop_body, label %inner_loop_exit, !prof [[PROF_FUNC0_0:![0-9]+]]
; IR: inner_loop_exit:
@@ -74,6 +75,7 @@ outer_loop_exit:
; BFI_AFTER: - loop_header.loop_exit_crit_edge: {{.*}} count = 1024
; BFI_AFTER: - loop_exit: {{.*}} count = 1024
+; IR-LABEL: define void @func1
; IR: entry:
; IR: br i1 %cmp1, label %loop_body.lr.ph, label %loop_exit, !prof [[PROF_FUNC1_0:![0-9]+]]
@@ -114,6 +116,7 @@ loop_exit:
; - loop_header.loop_exit_crit_edge: {{.*}} count = 32
; - loop_exit: {{.*}} count = 1024
+; IR-LABEL: define void @func2
; IR: entry:
; IR: br i1 %cmp1, label %loop_exit, label %loop_body.lr.ph, !prof [[PROF_FUNC2_0:![0-9]+]]
@@ -141,12 +144,103 @@ loop_exit:
ret void
}
+; BFI_BEFORE-LABEL: block-frequency-info: func3_zero_branch_weight
+; BFI_BEFORE: - entry: {{.*}} count = 1024
+; BFI_BEFORE: - loop_header: {{.*}} count = 2199023255296
+; BFI_BEFORE: - loop_body: {{.*}} count = 2199023254272
+; BFI_BEFORE: - loop_exit: {{.*}} count = 1024
+
+; BFI_AFTER-LABEL: block-frequency-info: func3_zero_branch_weight
+; BFI_AFTER: - entry: {{.*}} count = 1024
+; BFI_AFTER: - loop_body.lr.ph: {{.*}} count = 1024
+; BFI_AFTER: - loop_body: {{.*}} count = 2199023255296
+; BFI_AFTER: - loop_header.loop_exit_crit_edge: {{.*}} count = 1024
+; BFI_AFTER: - loop_exit: {{.*}} count = 1024
+
+; IR-LABEL: define void @func3_zero_branch_weight
+; IR: entry:
+; IR: br i1 %cmp1, label %loop_exit, label %loop_body.lr.ph, !prof [[PROF_FUNC3_0:![0-9]+]]
+
+; IR: loop_body:
+; IR: br i1 %cmp, label %loop_header.loop_exit_crit_edge, label %loop_body, !prof [[PROF_FUNC3_0]]
+
+define void @func3_zero_branch_weight(i32 %n) !prof !3 {
+entry:
+ br label %loop_header
+
+loop_header:
+ %i = phi i32 [0, %entry], [%i_inc, %loop_body]
+ %cmp = icmp slt i32 %i, %n
+ br i1 %cmp, label %loop_exit, label %loop_body, !prof !6
+
+loop_body:
+ store volatile i32 %i, ptr @g, align 4
+ %i_inc = add i32 %i, 1
+ br label %loop_header
+
+loop_exit:
+ ret void
+}
+
+; IR-LABEL: define void @func4_zero_branch_weight
+; IR: entry:
+; IR: br i1 %cmp1, label %loop_exit, label %loop_body.lr.ph, !prof [[PROF_FUNC4_0:![0-9]+]]
+
+; IR: loop_body:
+; IR: br i1 %cmp, label %loop_header.loop_exit_crit_edge, label %loop_body, !prof [[PROF_FUNC4_0]]
+
+define void @func4_zero_branch_weight(i32 %n) !prof !3 {
+entry:
+ br label %loop_header
+
+loop_header:
+ %i = phi i32 [0, %entry], [%i_inc, %loop_body]
+ %cmp = icmp slt i32 %i, %n
+ br i1 %cmp, label %loop_exit, label %loop_body, !prof !7
+
+loop_body:
+ store volatile i32 %i, ptr @g, align 4
+ %i_inc = add i32 %i, 1
+ br label %loop_header
+
+loop_exit:
+ ret void
+}
+
+; IR-LABEL: define void @func5_zero_branch_weight
+; IR: entry:
+; IR: br i1 %cmp1, label %loop_exit, label %loop_body.lr.ph, !prof [[PROF_FUNC5_0:![0-9]+]]
+
+; IR: loop_body:
+; IR: br i1 %cmp, label %loop_header.loop_exit_crit_edge, label %loop_body, !prof [[PROF_FUNC5_0]]
+
+define void @func5_zero_branch_weight(i32 %n) !prof !3 {
+entry:
+ br label %loop_header
+
+loop_header:
+ %i = phi i32 [0, %entry], [%i_inc, %loop_body]
+ %cmp = icmp slt i32 %i, %n
+ br i1 %cmp, label %loop_exit, label %loop_body, !prof !8
+
+loop_body:
+ store volatile i32 %i, ptr @g, align 4
+ %i_inc = add i32 %i, 1
+ br label %loop_header
+
+loop_exit:
+ ret void
+}
+
!0 = !{!"function_entry_count", i64 1}
!1 = !{!"branch_weights", i32 1000, i32 1}
!2 = !{!"branch_weights", i32 3000, i32 1000}
!3 = !{!"function_entry_count", i64 1024}
!4 = !{!"branch_weights", i32 40, i32 2}
!5 = !{!"branch_weights", i32 10240, i32 320}
+!6 = !{!"branch_weights", i32 0, i32 1}
+!7 = !{!"branch_weights", i32 1, i32 0}
+!8 = !{!"branch_weights", i32 0, i32 0}
; IR: [[PROF_FUNC0_0]] = !{!"branch_weights", i32 2000, i32 1000}
; IR: [[PROF_FUNC0_1]] = !{!"branch_weights", i32 999, i32 1}
@@ -154,3 +248,6 @@ loop_exit:
; IR: [[PROF_FUNC1_1]] = !{!"branch_weights", i32 2433, i32 127}
; IR: [[PROF_FUNC2_0]] = !{!"branch_weights", i32 9920, i32 320}
; IR: [[PROF_FUNC2_1]] = !{!"branch_weights", i32 320, i32 0}
+; IR: [[PROF_FUNC3_0]] = !{!"branch_weights", i32 0, i32 1}
+; IR: [[PROF_FUNC4_0]] = !{!"branch_weights", i32 1, i32 0}
+; IR: [[PROF_FUNC5_0]] = !{!"branch_weights", i32 0, i32 0}
More information about the llvm-commits
mailing list