[llvm] [SimpleLoopUnswitch] Adjust cost multiplier accounting for parent loop size (PR #155379)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 26 02:02:21 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Antonio Frighetto (antoniofrighetto)
<details>
<summary>Changes</summary>
When estimating the cost to avoid exponential unswitches of non-trivial invariant conditions, also consider the parent loop basic blocks size, ensuring this does not grow unexpectedly.
Alternative solution might involve adding a threshold on the number of times LoopRotate is allowed to rotate the same loop across invocations (https://github.com/llvm/llvm-project/issues/138509#issuecomment-3193855772), though I think this would require a per-loop count to be serialized/deserialized in a metadata each time, which seems far more invasive.
Fixes: https://github.com/llvm/llvm-project/issues/138509.
---
Full diff: https://github.com/llvm/llvm-project/pull/155379.diff
2 Files Affected:
- (modified) llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp (+22-5)
- (added) llvm/test/Transforms/SimpleLoopUnswitch/pr138509.ll (+54)
``````````diff
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 9b40fc03da6bb..e4ba70d1bce16 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -98,6 +98,9 @@ static cl::opt<bool> EnableUnswitchCostMultiplier(
static cl::opt<int> UnswitchSiblingsToplevelDiv(
"unswitch-siblings-toplevel-div", cl::init(2), cl::Hidden,
cl::desc("Toplevel siblings divisor for cost multiplier."));
+static cl::opt<int> UnswitchParentBlocksDiv(
+ "unswitch-parent-blocks-div", cl::init(8), cl::Hidden,
+ cl::desc("Outer loop size divisor for cost multiplier."));
static cl::opt<int> UnswitchNumInitialUnscaledCandidates(
"unswitch-num-initial-unscaled-candidates", cl::init(8), cl::Hidden,
cl::desc("Number of unswitch candidates that are ignored when calculating "
@@ -2809,9 +2812,9 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
}
/// Cost multiplier is a way to limit potentially exponential behavior
-/// of loop-unswitch. Cost is multipied in proportion of 2^number of unswitch
-/// candidates available. Also accounting for the number of "sibling" loops with
-/// the idea to account for previous unswitches that already happened on this
+/// of loop-unswitch. Cost is multiplied in proportion of 2^number of unswitch
+/// candidates available. Also consider the number of "sibling" loops with
+/// the idea of accounting for previous unswitches that already happened on this
/// cluster of loops. There was an attempt to keep this formula simple,
/// just enough to limit the worst case behavior. Even if it is not that simple
/// now it is still not an attempt to provide a detailed heuristic size
@@ -2842,7 +2845,19 @@ static int CalculateUnswitchCostMultiplier(
return 1;
}
+ // Each invariant non-trivial condition, after being unswitched, is supposed
+ // to have its own specialized sibling loop (the invariant condition has been
+ // hoisted out of the child loop into a newly-cloned loop). When unswitching
+ // conditions in nested loops, the basic block size of the outer loop should
+ // not be altered. If such a size significantly increases across unswitching
+ // invocations, something may be wrong; so adjust the final cost taking this
+ // into account.
auto *ParentL = L.getParentLoop();
+ int ParentLoopSizeMultiplier = 1;
+ if (ParentL)
+ ParentLoopSizeMultiplier =
+ std::max<int>(ParentL->getNumBlocks() / UnswitchParentBlocksDiv, 1);
+
int SiblingsCount = (ParentL ? ParentL->getSubLoopsVector().size()
: std::distance(LI.begin(), LI.end()));
// Count amount of clones that all the candidates might cause during
@@ -2887,14 +2902,16 @@ static int CalculateUnswitchCostMultiplier(
// at an upper bound.
int CostMultiplier;
if (ClonesPower > Log2_32(UnswitchThreshold) ||
- SiblingsMultiplier > UnswitchThreshold)
+ SiblingsMultiplier > UnswitchThreshold ||
+ ParentLoopSizeMultiplier > UnswitchThreshold)
CostMultiplier = UnswitchThreshold;
else
CostMultiplier = std::min(SiblingsMultiplier * (1 << ClonesPower),
(int)UnswitchThreshold);
LLVM_DEBUG(dbgs() << " Computed multiplier " << CostMultiplier
- << " (siblings " << SiblingsMultiplier << " * clones "
+ << " (siblings " << SiblingsMultiplier << " * parent size "
+ << ParentLoopSizeMultiplier << " * clones "
<< (1 << ClonesPower) << ")"
<< " for unswitch candidate: " << TI << "\n");
return CostMultiplier;
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/pr138509.ll b/llvm/test/Transforms/SimpleLoopUnswitch/pr138509.ll
new file mode 100644
index 0000000000000..2ce0cb5c3f299
--- /dev/null
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/pr138509.ll
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -S -enable-unswitch-cost-multiplier=true -unswitch-parent-blocks-div=1 \
+; RUN: -passes="loop-mssa(loop-simplifycfg,licm,loop-rotate,simple-loop-unswitch<nontrivial>),print<loops>" \
+; RUN: -disable-output 2>&1 | sort -b -k 1 | FileCheck %s --check-prefixes=LOOP-DIV-1
+
+; RUN: opt < %s -S -enable-unswitch-cost-multiplier=true -unswitch-parent-blocks-div=2 \
+; RUN: -passes="loop-mssa(loop-simplifycfg,licm,loop-rotate,simple-loop-unswitch<nontrivial>),print<loops>" \
+; RUN: -disable-output 2>&1 | sort -b -k 1 | FileCheck %s --check-prefixes=LOOP-DIV-2
+
+; LOOP-DIV-1-COUNT-6: Loop at depth 1 containing:
+; LOOP-DIV-2-COUNT-12: Loop at depth 1 containing:
+
+ at a = global i32 0, align 4
+ at b = global i32 0, align 4
+ at c = global i32 0, align 4
+ at d = global i32 0, align 4
+
+define i32 @main() {
+entry:
+ br label %outer.loop.header
+
+outer.loop.header: ; preds = %outer.loop.latch, %entry
+ br i1 false, label %exit, label %outer.loop.body
+
+outer.loop.body: ; preds = %inner.loop.header, %outer.loop.header
+ store i32 1, ptr @c, align 4
+ %cmp = icmp sgt i32 0, -1
+ br i1 %cmp, label %outer.loop.latch, label %exit
+
+inner.loop.header: ; preds = %outer.loop.latch, %inner.loop.body
+ %a_val = load i32, ptr @a, align 4
+ %c_val = load i32, ptr @c, align 4
+ %mul = mul nsw i32 %c_val, %a_val
+ store i32 %mul, ptr @b, align 4
+ %cmp2 = icmp sgt i32 %mul, -1
+ br i1 %cmp2, label %inner.loop.body, label %outer.loop.body
+
+inner.loop.body: ; preds = %inner.loop.header
+ %mul2 = mul nsw i32 %c_val, 3
+ store i32 %mul2, ptr @c, align 4
+ store i32 %c_val, ptr @d, align 4
+ %mul3 = mul nsw i32 %c_val, %a_val
+ %cmp3 = icmp sgt i32 %mul3, -1
+ br i1 %cmp3, label %inner.loop.header, label %exit
+
+outer.loop.latch: ; preds = %outer.loop.body
+ %d_val = load i32, ptr @d, align 4
+ store i32 %d_val, ptr @b, align 4
+ %cmp4 = icmp eq i32 %d_val, 0
+ br i1 %cmp4, label %inner.loop.header, label %outer.loop.header
+
+exit: ; preds = %inner.loop.body, %outer.loop.body, %outer.loop.header
+ ret i32 0
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/155379
More information about the llvm-commits
mailing list