[llvm] r347097 - [SimpleLoopUnswitch] adding cost multiplier to cap exponential unswitch with

Fri Nov 16 13:16:43 PST 2018

Author: fedor.sergeev
Date: Fri Nov 16 13:16:43 2018
New Revision: 347097

URL: http://llvm.org/viewvc/llvm-project?rev=347097&view=rev
Log:
[SimpleLoopUnswitch] adding cost multiplier to cap exponential unswitch with

We need to control exponential behavior of loop-unswitch so we do not get
run-away compilation.

Suggested solution is to introduce a multiplier for an unswitch cost that
makes cost prohibitive as soon as there are too many candidates and too
many sibling loops (meaning we have already started duplicating loops
by unswitching).

It does solve the currently known problem with compile-time degradation
(PR 39544).

Tests are built on top of a recently implemented CHECK-COUNT-<num>
FileCheck directives.

Reviewed By: chandlerc, mkazantsev
Differential Revision: https://reviews.llvm.org/D54223

Added:
    llvm/trunk/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested.ll
    llvm/trunk/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested2.ll
    llvm/trunk/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch.ll
    llvm/trunk/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch2.ll
    llvm/trunk/test/Transforms/SimpleLoopUnswitch/exponential-switch-unswitch.ll
Modified:
    llvm/trunk/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp

Modified: llvm/trunk/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp?rev=347097&r1=347096&r2=347097&view=diff
==============================================================================

--- llvm/trunk/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp (original)
+++ llvm/trunk/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp Fri Nov 16 13:16:43 2018
@@ -62,6 +62,9 @@ STATISTIC(NumBranches, "Number of branch
 STATISTIC(NumSwitches, "Number of switches unswitched");
 STATISTIC(NumGuards, "Number of guards turned into branches for unswitching");
 STATISTIC(NumTrivial, "Number of unswitches that are trivial");
+STATISTIC(
+    NumCostMultiplierSkipped,
+    "Number of unswitch candidates that had their cost multiplier skipped");
 
 static cl::opt<bool> EnableNonTrivialUnswitch(
     "enable-nontrivial-unswitch", cl::init(false), cl::Hidden,
@@ -72,6 +75,17 @@ static cl::opt<int>
     UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden,
                       cl::desc("The cost threshold for unswitching a loop."));
 
+static cl::opt<bool> EnableUnswitchCostMultiplier(
+    "enable-unswitch-cost-multiplier", cl::init(true), cl::Hidden,
+    cl::desc("Enable unswitch cost multiplier that prohibits exponential "
+             "explosion in nontrivial unswitch."));
+static cl::opt<int> UnswitchSiblingsToplevelDiv(
+    "unswitch-siblings-toplevel-div", cl::init(2), cl::Hidden,
+    cl::desc("Toplevel siblings divisor for cost multiplier."));
+static cl::opt<int> UnswitchNumInitialUnscaledCandidates(
+    "unswitch-num-initial-unscaled-candidates", cl::init(8), cl::Hidden,
+    cl::desc("Number of unswitch candidates that are ignored when calculating "
+             "cost multiplier."));
 static cl::opt<bool> UnswitchGuards(
     "simple-loop-unswitch-guards", cl::init(true), cl::Hidden,
     cl::desc("If enabled, simple loop unswitching will also consider "
@@ -2260,6 +2274,91 @@ turnGuardIntoBranch(IntrinsicInst *GI, L
   return CheckBI;
 }
 
+/// Cost multiplier is a way to limit potentially exponential behavior
+/// of loop-unswitch. Cost is multiplied in proportion of 2^number of unswitch
+/// candidates available, and by the number of "sibling" loops, with the idea
+/// to account for previous unswitches that already happened on this cluster
+/// of loops. The formula is kept just detailed enough to limit the worst case
+/// behavior; it is not an attempt to provide a detailed heuristic size
+/// prediction. Returns a multiplier for \p TI's unswitch cost in the range
+/// 1..UnswitchThreshold (the result saturates at UnswitchThreshold).
+///
+/// TODO: Make a proper accounting of "explosion" effect for all kinds of
+/// unswitch candidates, making adequate predictions instead of wild guesses.
+/// That requires knowing not just the number of "remaining" candidates but
+/// also costs of unswitching for each of these candidates.
+static int calculateUnswitchCostMultiplier(
+    Instruction &TI, Loop &L, LoopInfo &LI, DominatorTree &DT,
+    ArrayRef<std::pair<Instruction *, TinyPtrVector<Value *>>>
+        UnswitchCandidates) {
+
+  // Guards and other exiting conditions do not contribute to exponential
+  // explosion as soon as they dominate the latch (otherwise there might be
+  // another path to the latch remaining that does not allow to eliminate the
+  // loop copy on unswitch).
+  BasicBlock *Latch = L.getLoopLatch();
+  BasicBlock *CondBlock = TI.getParent();
+  if (DT.dominates(CondBlock, Latch) &&
+      (isGuard(&TI) ||
+       llvm::count_if(successors(&TI), [&L](BasicBlock *SuccBB) {
+         return L.contains(SuccBB);
+       }) <= 1)) {
+    NumCostMultiplierSkipped++;
+    return 1;
+  }
+
+  auto *ParentL = L.getParentLoop();
+  int SiblingsCount = (ParentL ? ParentL->getSubLoopsVector().size()
+                               : std::distance(LI.begin(), LI.end()));
+  // Count amount of clones that all the candidates might cause during
+  // unswitching. Branch/guard counts as 1, switch counts as log2 of its cases.
+  int UnswitchedClones = 0;
+  for (const auto &Candidate : UnswitchCandidates) {
+    Instruction *CI = Candidate.first;
+    BasicBlock *CondBlock = CI->getParent();
+    bool SkipExitingSuccessors = DT.dominates(CondBlock, Latch);
+    if (isGuard(CI)) {
+      if (!SkipExitingSuccessors)
+        UnswitchedClones++;
+      continue;
+    }
+    int NonExitingSuccessors = llvm::count_if(
+        successors(CondBlock), [SkipExitingSuccessors, &L](BasicBlock *SuccBB) {
+          return !SkipExitingSuccessors || L.contains(SuccBB);
+        });
+    UnswitchedClones += Log2_32(NonExitingSuccessors);
+  }
+
+  // Ignore up to the "unscaled candidates" number of unswitch candidates
+  // when calculating the power-of-two scaling of the cost. The main idea
+  // with this control is to allow a small number of unswitches to happen
+  // and rely more on siblings multiplier (see below) when the number
+  // of candidates is small.
+  unsigned ClonesPower =
+      std::max(UnswitchedClones - (int)UnswitchNumInitialUnscaledCandidates, 0);
+
+  // Allowing top-level loops to spread a bit more than nested ones.
+  int SiblingsMultiplier =
+      std::max((ParentL ? SiblingsCount
+                        : SiblingsCount / (int)UnswitchSiblingsToplevelDiv),
+               1);
+  // Compute the cost multiplier in a way that won't overflow by saturating
+  // at an upper bound.
+  int CostMultiplier;
+  if (ClonesPower > Log2_32(UnswitchThreshold) ||
+      SiblingsMultiplier > UnswitchThreshold)
+    CostMultiplier = UnswitchThreshold;
+  else
+    CostMultiplier = std::min(SiblingsMultiplier * (1 << ClonesPower),
+                              (int)UnswitchThreshold);
+
+  LLVM_DEBUG(dbgs() << "  Computed multiplier  " << CostMultiplier
+                    << " (siblings " << SiblingsMultiplier << " * clones "
+                    << (1ULL << std::min(ClonesPower, 63u)) << ")"
+                    << " for unswitch candidate: " << TI << "\n");
+  return CostMultiplier;
+}
+
 static bool
 unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
                       AssumptionCache &AC, TargetTransformInfo &TTI,
@@ -2473,8 +2572,23 @@ unswitchBestCondition(Loop &L, Dominator
     int CandidateCost = ComputeUnswitchedCost(
         TI, /*FullUnswitch*/ !BI || (Invariants.size() == 1 &&
                                      Invariants[0] == BI->getCondition()));
-    LLVM_DEBUG(dbgs() << "  Computed cost of " << CandidateCost
-                      << " for unswitch candidate: " << TI << "\n");
+    // Calculate cost multiplier which is a tool to limit potentially
+    // exponential behavior of loop-unswitch.
+    if (EnableUnswitchCostMultiplier) {
+      int CostMultiplier =
+          calculateUnswitchCostMultiplier(TI, L, LI, DT, UnswitchCandidates);
+      assert(
+          (CostMultiplier > 0 && CostMultiplier <= UnswitchThreshold) &&
+          "cost multiplier needs to be in the range of 1..UnswitchThreshold");
+      CandidateCost *= CostMultiplier;
+      LLVM_DEBUG(dbgs() << "  Computed cost of " << CandidateCost
+                        << " (multiplier: " << CostMultiplier << ")"
+                        << " for unswitch candidate: " << TI << "\n");
+    } else {
+      LLVM_DEBUG(dbgs() << "  Computed cost of " << CandidateCost
+                        << " for unswitch candidate: " << TI << "\n");
+    }
+
     if (!BestUnswitchTI || CandidateCost < BestUnswitchCost) {
       BestUnswitchTI = &TI;
       BestUnswitchCost = CandidateCost;

Added: llvm/trunk/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested.ll?rev=347097&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested.ll (added)
+++ llvm/trunk/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested.ll Fri Nov 16 13:16:43 2018
@@ -0,0 +1,139 @@
+;
+; There should be just a single copy of each loop when strictest multiplier
+; candidates formula (unscaled candidates == 0) is enforced:
+
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=1 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=16 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
+;
+;
+; When we relax the candidates part of a multiplier formula
+; (unscaled candidates == 4) we start getting  some unswitches,
+; which leads to siblings multiplier kicking in.
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=4 -unswitch-siblings-toplevel-div=1 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | \
+; RUN:     sort -b | FileCheck %s --check-prefixes=LOOP-UNSCALE4-DIV1
+;
+; NB: sort -b is essential here and below, otherwise blanks might lead to different
+; order depending on locale.
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=4 -unswitch-siblings-toplevel-div=2 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | \
+; RUN:     sort -b | FileCheck %s --check-prefixes=LOOP-UNSCALE4-DIV2
+;
+;
+; Get
+;    2^(num conds) == 2^5 = 32
+; loop nests when cost multiplier is disabled:
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=false \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | \
+; RUN:	   sort -b | FileCheck %s --check-prefixes=LOOP32
+;
+; Single loop nest, not unswitched
+; LOOP1:     Loop at depth 1 containing:
+; LOOP1:     Loop at depth 2 containing:
+; LOOP1:     Loop at depth 3 containing:
+; LOOP1-NOT: Loop at depth {{[0-9]+}} containing:
+;
+; Half unswitched loop nests, with unscaled4 and div1 it gets less depth1 loops unswitched
+; since they have more cost.
+; LOOP-UNSCALE4-DIV1-COUNT-6: Loop at depth 1 containing:
+; LOOP-UNSCALE4-DIV1-COUNT-19: Loop at depth 2 containing:
+; LOOP-UNSCALE4-DIV1-COUNT-29: Loop at depth 3 containing:
+; LOOP-UNSCALE4-DIV1-NOT:      Loop at depth {{[0-9]+}} containing:
+;
+; Half unswitched loop nests, with unscaled4 and div2 it gets more depth1 loops unswitched
+; as div2 kicks in.
+; LOOP-UNSCALE4-DIV2-COUNT-11: Loop at depth 1 containing:
+; LOOP-UNSCALE4-DIV2-COUNT-22: Loop at depth 2 containing:
+; LOOP-UNSCALE4-DIV2-COUNT-29: Loop at depth 3 containing:
+; LOOP-UNSCALE4-DIV2-NOT:      Loop at depth {{[0-9]+}} containing:
+;
+; 32 loop nests, fully unswitched
+; LOOP32-COUNT-32: Loop at depth 1 containing:
+; LOOP32-COUNT-32: Loop at depth 2 containing:
+; LOOP32-COUNT-32: Loop at depth 3 containing:
+; LOOP32-NOT:      Loop at depth {{[0-9]+}} containing:
+
+declare void @bar()
+
+define void @loop_nested3_conds5(i32* %addr, i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5) {
+entry:
+  %addr1 = getelementptr i32, i32* %addr, i64 0
+  %addr2 = getelementptr i32, i32* %addr, i64 1
+  %addr3 = getelementptr i32, i32* %addr, i64 2
+  br label %outer
+outer:
+  %iv1 = phi i32 [0, %entry], [%iv1.next, %outer_latch]
+  %iv1.next = add i32 %iv1, 1
+  ;; skip nontrivial unswitch
+  call void @bar()
+  br label %middle
+middle:
+  %iv2 = phi i32 [0, %outer], [%iv2.next, %middle_latch]
+  %iv2.next = add i32 %iv2, 1
+  ;; skip nontrivial unswitch
+  call void @bar()
+  br label %loop
+loop:
+  %iv3 = phi i32 [0, %middle], [%iv3.next, %loop_latch]
+  %iv3.next = add i32 %iv3, 1
+  ;; skip nontrivial unswitch
+  call void @bar()
+  br i1 %c1, label %loop_next1_left, label %loop_next1_right
+loop_next1_left:
+  br label %loop_next1
+loop_next1_right:
+  br label %loop_next1
+
+loop_next1:
+  br i1 %c2, label %loop_next2_left, label %loop_next2_right
+loop_next2_left:
+  br label %loop_next2
+loop_next2_right:
+  br label %loop_next2
+
+loop_next2:
+  br i1 %c3, label %loop_next3_left, label %loop_next3_right
+loop_next3_left:
+  br label %loop_next3
+loop_next3_right:
+  br label %loop_next3
+
+loop_next3:
+  br i1 %c4, label %loop_next4_left, label %loop_next4_right
+loop_next4_left:
+  br label %loop_next4
+loop_next4_right:
+  br label %loop_next4
+
+loop_next4:
+  br i1 %c5, label %loop_latch_left, label %loop_latch_right
+loop_latch_left:
+  br label %loop_latch
+loop_latch_right:
+  br label %loop_latch
+
+loop_latch:
+  store volatile i32 0, i32* %addr1
+  %test_loop = icmp slt i32 %iv3, 50
+  br i1 %test_loop, label %loop, label %middle_latch
+middle_latch:
+  store volatile i32 0, i32* %addr2
+  %test_middle = icmp slt i32 %iv2, 50
+  br i1 %test_middle, label %middle, label %outer_latch
+outer_latch:
+  store volatile i32 0, i32* %addr3
+  %test_outer = icmp slt i32 %iv1, 50
+  br i1 %test_outer, label %outer, label %exit
+exit:
+  ret void
+}

Added: llvm/trunk/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested2.ll?rev=347097&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested2.ll (added)
+++ llvm/trunk/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested2.ll Fri Nov 16 13:16:43 2018
@@ -0,0 +1,149 @@
+;
+; Here all the branches we unswitch are exiting from the inner loop.
+; That means we should not be getting exponential behavior on inner-loop
+; unswitch. In fact there should be just a single version of inner-loop,
+; with possibly some outer loop copies.
+;
+; There should be just a single copy of each loop when strictest multiplier
+; candidates formula (unscaled candidates == 0) is enforced:
+
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=1 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=16 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
+;
+;
+; When we relax the candidates part of a multiplier formula
+; (unscaled candidates == 3) we start getting some unswitches in outer loops,
+; which leads to siblings multiplier kicking in.
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=3 -unswitch-siblings-toplevel-div=1 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | \
+; RUN:     sort -b | FileCheck %s --check-prefixes=LOOP-UNSCALE3-DIV1
+;
+; NB: sort -b is essential here and below, otherwise blanks might lead to different
+; order depending on locale.
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=3 -unswitch-siblings-toplevel-div=2 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | \
+; RUN:     sort -b | FileCheck %s --check-prefixes=LOOP-UNSCALE3-DIV2
+;
+; With disabled cost-multiplier we get maximal possible amount of unswitches.
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=false \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | \
+; RUN:	   sort -b | FileCheck %s --check-prefixes=LOOP-MAX
+;
+; Single loop nest, not unswitched
+; LOOP1:     Loop at depth 1 containing:
+; LOOP1-NOT:  Loop at depth 1 containing:
+; LOOP1:     Loop at depth 2 containing:
+; LOOP1-NOT:  Loop at depth 2 containing:
+; LOOP1:     Loop at depth 3 containing:
+; LOOP1-NOT:  Loop at depth 3 containing:
+;
+; Half unswitched loop nests, with unscaled3 and div1 it gets less depth1 loops unswitched
+; since they have more cost.
+; LOOP-UNSCALE3-DIV1-COUNT-4: Loop at depth 1 containing:
+; LOOP-UNSCALE3-DIV1-NOT:      Loop at depth 1 containing:
+; LOOP-UNSCALE3-DIV1-COUNT-1: Loop at depth 2 containing:
+; LOOP-UNSCALE3-DIV1-NOT:      Loop at depth 2 containing:
+; LOOP-UNSCALE3-DIV1-COUNT-1: Loop at depth 3 containing:
+; LOOP-UNSCALE3-DIV1-NOT:      Loop at depth 3 containing:
+;
+; Half unswitched loop nests, with unscaled3 and div2 it gets more depth1 loops unswitched
+; as div2 kicks in.
+; LOOP-UNSCALE3-DIV2-COUNT-6: Loop at depth 1 containing:
+; LOOP-UNSCALE3-DIV2-NOT:      Loop at depth 1 containing:
+; LOOP-UNSCALE3-DIV2-COUNT-1: Loop at depth 2 containing:
+; LOOP-UNSCALE3-DIV2-NOT:      Loop at depth 2 containing:
+; LOOP-UNSCALE3-DIV2-COUNT-1: Loop at depth 3 containing:
+; LOOP-UNSCALE3-DIV2-NOT:      Loop at depth 3 containing:
+;
+; Maximally unswitched (copy of the outer loop per each condition)
+; LOOP-MAX-COUNT-6: Loop at depth 1 containing:
+; LOOP-MAX-NOT:      Loop at depth 1 containing:
+; LOOP-MAX-COUNT-1: Loop at depth 2 containing:
+; LOOP-MAX-NOT:      Loop at depth 2 containing:
+; LOOP-MAX-COUNT-1: Loop at depth 3 containing:
+; LOOP-MAX-NOT:      Loop at depth 3 containing:
+
+declare void @bar()
+
+define void @loop_nested3_conds5(i32* %addr, i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5) {
+entry:
+  %addr1 = getelementptr i32, i32* %addr, i64 0
+  %addr2 = getelementptr i32, i32* %addr, i64 1
+  %addr3 = getelementptr i32, i32* %addr, i64 2
+  br label %outer
+outer:
+  %iv1 = phi i32 [0, %entry], [%iv1.next, %outer_latch]
+  %iv1.next = add i32 %iv1, 1
+  ;; skip nontrivial unswitch
+  call void @bar()
+  br label %middle
+middle:
+  %iv2 = phi i32 [0, %outer], [%iv2.next, %middle_latch]
+  %iv2.next = add i32 %iv2, 1
+  ;; skip nontrivial unswitch
+  call void @bar()
+  br label %loop
+loop:
+  %iv3 = phi i32 [0, %middle], [%iv3.next, %loop_latch]
+  %iv3.next = add i32 %iv3, 1
+  ;; skip nontrivial unswitch
+  call void @bar()
+  br i1 %c1, label %loop_next1_left, label %outer_latch
+loop_next1_left:
+  br label %loop_next1
+loop_next1_right:
+  br label %loop_next1
+
+loop_next1:
+  br i1 %c2, label %loop_next2_left, label %outer_latch
+loop_next2_left:
+  br label %loop_next2
+loop_next2_right:
+  br label %loop_next2
+
+loop_next2:
+  br i1 %c3, label %loop_next3_left, label %outer_latch
+loop_next3_left:
+  br label %loop_next3
+loop_next3_right:
+  br label %loop_next3
+
+loop_next3:
+  br i1 %c4, label %loop_next4_left, label %outer_latch
+loop_next4_left:
+  br label %loop_next4
+loop_next4_right:
+  br label %loop_next4
+
+loop_next4:
+  br i1 %c5, label %loop_latch_left, label %outer_latch
+loop_latch_left:
+  br label %loop_latch
+loop_latch_right:
+  br label %loop_latch
+
+loop_latch:
+  store volatile i32 0, i32* %addr1
+  %test_loop = icmp slt i32 %iv3, 50
+  br i1 %test_loop, label %loop, label %middle_latch
+middle_latch:
+  store volatile i32 0, i32* %addr2
+  %test_middle = icmp slt i32 %iv2, 50
+  br i1 %test_middle, label %middle, label %outer_latch
+outer_latch:
+  store volatile i32 0, i32* %addr3
+  %test_outer = icmp slt i32 %iv1, 50
+  br i1 %test_outer, label %outer, label %exit
+exit:
+  ret void
+}

Added: llvm/trunk/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch.ll?rev=347097&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch.ll (added)
+++ llvm/trunk/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch.ll Fri Nov 16 13:16:43 2018
@@ -0,0 +1,80 @@
+;
+; There should be just a single copy of loop when strictest multiplier candidates
+; formula (unscaled candidates == 0) is enforced:
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=1 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=8 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
+;
+; With relaxed candidates multiplier (unscaled candidates == 8) we should allow
+; some unswitches to happen until siblings multiplier starts kicking in:
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=8 -unswitch-siblings-toplevel-div=1 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP5
+;
+; With relaxed candidates multiplier (unscaled candidates == 8) and with relaxed
+; siblings multiplier for top-level loops (toplevel-div == 8) we should get
+;    2^(num conds) == 2^5 == 32
+; copies of the loop:
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=8 -unswitch-siblings-toplevel-div=8 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP32
+;
+; Similarly get
+;    2^(num conds) == 2^5 == 32
+; copies of the loop when cost multiplier is disabled:
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=false \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP32
+;
+;
+; Single loop, not unswitched
+; LOOP1:     Loop at depth 1 containing:
+; LOOP1-NOT: Loop at depth 1 containing:
+
+; 5 loops, unswitched 4 times
+; LOOP5-COUNT-5: Loop at depth 1 containing:
+; LOOP5-NOT:     Loop at depth 1 containing:
+
+; 32 loops, fully unswitched
+; LOOP32-COUNT-32: Loop at depth 1 containing:
+; LOOP32-NOT:     Loop at depth 1 containing:
+
+define void @loop_simple5(i32* %addr, i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5) {
+entry:
+  br label %loop
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop_latch]
+  %iv.next = add i32 %iv, 1
+  br i1 %c1, label %loop_next1, label %loop_next1_right
+loop_next1_right:
+  br label %loop_next1
+loop_next1:
+  br i1 %c2, label %loop_next2, label %loop_next2_right
+loop_next2_right:
+  br label %loop_next2
+loop_next2:
+  br i1 %c3, label %loop_next3, label %loop_next3_right
+loop_next3_right:
+  br label %loop_next3
+loop_next3:
+  br i1 %c4, label %loop_next4, label %loop_next4_right
+loop_next4_right:
+  br label %loop_next4
+loop_next4:
+  br i1 %c5, label %loop_latch, label %loop_latch_right
+loop_latch_right:
+  br label %loop_latch
+loop_latch:
+  store volatile i32 0, i32* %addr
+  %test_loop = icmp slt i32 %iv, 50
+  br i1 %test_loop, label %loop, label %exit
+exit:
+  ret void
+}

Added: llvm/trunk/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch2.ll?rev=347097&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch2.ll (added)
+++ llvm/trunk/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch2.ll Fri Nov 16 13:16:43 2018
@@ -0,0 +1,56 @@
+;
+; Here all the branches are exiting ones. Checking that we don't have
+; exponential behavior with any kind of controlling heuristics here.
+;
+; There we should have just a single loop.
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=1 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=8 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=8 -unswitch-siblings-toplevel-div=1 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=8 -unswitch-siblings-toplevel-div=8 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=false \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
+;
+;
+; Single loop, not unswitched
+; LOOP1:     Loop at depth 1 containing:
+; LOOP1-NOT: Loop at depth 1 containing:
+
+declare void @bar()
+
+define void @loop_simple5(i32* %addr, i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5) {
+entry:
+  br label %loop
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %loop_latch]
+  %iv.next = add i32 %iv, 1
+  ;; disabling trivial unswitch
+  call void @bar()
+  br i1 %c1, label %loop_next1, label %exit
+loop_next1:
+  br i1 %c2, label %loop_next2, label %exit
+loop_next2:
+  br i1 %c3, label %loop_next3, label %exit
+loop_next3:
+  br i1 %c4, label %loop_next4, label %exit
+loop_next4:
+  br i1 %c5, label %loop_latch, label %exit
+loop_latch:
+  store volatile i32 0, i32* %addr
+  %test_loop = icmp slt i32 %iv, 50
+  br i1 %test_loop, label %loop, label %exit
+exit:
+  ret void
+}

Added: llvm/trunk/test/Transforms/SimpleLoopUnswitch/exponential-switch-unswitch.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SimpleLoopUnswitch/exponential-switch-unswitch.ll?rev=347097&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SimpleLoopUnswitch/exponential-switch-unswitch.ll (added)
+++ llvm/trunk/test/Transforms/SimpleLoopUnswitch/exponential-switch-unswitch.ll Fri Nov 16 13:16:43 2018
@@ -0,0 +1,118 @@
+;
+; Here we have 5-way unswitchable switch with each successor also having an unswitchable
+; exiting branch in it. If we start unswitching those branches we start duplicating the
+; whole switch. This can easily lead to exponential behavior w/o proper control.
+; On a real-life testcase there was 16-way switch and that took forever to compile w/o
+; a cost control.
+;
+;
+; When we use the strictest multiplier candidates formula (unscaled candidates == 0)
+; we should be getting just a single loop.
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=1 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=16 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
+;
+;
+; With relaxed candidates multiplier (unscaled candidates == 8) we should allow
+; some unswitches to happen until siblings multiplier starts kicking in:
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=8 -unswitch-siblings-toplevel-div=1 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | \
+; RUN:     sort -b | FileCheck %s --check-prefixes=LOOP-RELAX
+;
+; With relaxed candidates multiplier (unscaled candidates == 8) and with relaxed
+; siblings multiplier for top-level loops (toplevel-div == 8) we should get
+; considerably more copies of the loop (especially top-level ones).
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
+; RUN:     -unswitch-num-initial-unscaled-candidates=8 -unswitch-siblings-toplevel-div=8 \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | \
+; RUN:     sort -b | FileCheck %s --check-prefixes=LOOP-RELAX2
+;
+; We get hundreds of copies of the loop when cost multiplier is disabled:
+;
+; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=false \
+; RUN:     -passes='loop(unswitch),print<loops>' -disable-output 2>&1 | \
+; RUN:     sort -b | FileCheck %s --check-prefixes=LOOP-MAX
+;
+
+; Single loop nest, not unswitched
+; LOOP1:     Loop at depth 1 containing:
+; LOOP1-NOT: Loop at depth 1 containing:
+; LOOP1:     Loop at depth 2 containing:
+; LOOP1-NOT: Loop at depth 2 containing:
+;
+; Somewhat relaxed restrictions on candidates:
+; LOOP-RELAX-COUNT-5:     Loop at depth 1 containing:
+; LOOP-RELAX-NOT: Loop at depth 1 containing:
+; LOOP-RELAX-COUNT-32:     Loop at depth 2 containing:
+; LOOP-RELAX-NOT: Loop at depth 2 containing:
+;
+; Even more relaxed restrictions on candidates and siblings.
+; LOOP-RELAX2-COUNT-11:     Loop at depth 1 containing:
+; LOOP-RELAX2-NOT: Loop at depth 1 containing:
+; LOOP-RELAX2-COUNT-40:     Loop at depth 2 containing:
+; LOOP-RELAX-NOT: Loop at depth 2 containing:
+;
+; Unswitched as much as it could (with multiplier disabled).
+; LOOP-MAX-COUNT-56:     Loop at depth 1 containing:
+; LOOP-MAX-NOT: Loop at depth 1 containing:
+; LOOP-MAX-COUNT-111:     Loop at depth 2 containing:
+; LOOP-MAX-NOT: Loop at depth 2 containing:
+
+define i32 @loop_switch(i32* %addr, i32 %c1, i32 %c2) {
+entry:
+  %addr1 = getelementptr i32, i32* %addr, i64 0
+  %addr2 = getelementptr i32, i32* %addr, i64 1
+  %check0 = icmp eq i32 %c2, 0
+  %check1 = icmp eq i32 %c2, 31
+  %check2 = icmp eq i32 %c2, 32
+  %check3 = icmp eq i32 %c2, 33
+  %check4 = icmp eq i32 %c2, 34
+  br label %outer_loop
+
+outer_loop:
+  %iv1 = phi i32 [0, %entry], [%iv1.next, %outer_latch]
+  %iv1.next = add i32 %iv1, 1
+  br label %inner_loop
+inner_loop:
+  %iv2 = phi i32 [0, %outer_loop], [%iv2.next, %inner_latch]
+  %iv2.next = add i32 %iv2, 1
+  switch i32 %c1, label %inner_latch [
+    i32 0, label %case0
+    i32 1, label %case1
+    i32 2, label %case2
+    i32 3, label %case3
+    i32 4, label %case4
+  ]
+
+case4:
+  br i1 %check4, label %exit, label %inner_latch
+case3:
+  br i1 %check3, label %exit, label %inner_latch
+case2:
+  br i1 %check2, label %exit, label %inner_latch
+case1:
+  br i1 %check1, label %exit, label %inner_latch
+case0:
+  br i1 %check0, label %exit, label %inner_latch
+
+inner_latch:
+  store volatile i32 0, i32* %addr1
+  %test_inner = icmp slt i32 %iv2, 50
+  br i1 %test_inner, label %inner_loop, label %outer_latch
+
+outer_latch:
+  store volatile i32 0, i32* %addr2
+  %test_outer = icmp slt i32 %iv1, 50
+  br i1 %test_outer, label %outer_loop, label %exit
+
+exit:                                            ; preds = %outer_latch, %case0, %case1, %case2, %case3, %case4
+  ret i32 1
+}