[llvm] 0aeb373 - [SimpleLoopUnswitch] Re-fix introduction of UB when hoisted condition may be undef or poison

hyeongyu kim via llvm-commits llvm-commits at lists.llvm.org
Mon Oct 11 09:02:16 PDT 2021


Author: hyeongyu kim
Date: 2021-10-12T01:02:09+09:00
New Revision: 0aeb37324dbb83d442b9222f465cece691fe29e0

URL: https://github.com/llvm/llvm-project/commit/0aeb37324dbb83d442b9222f465cece691fe29e0
DIFF: https://github.com/llvm/llvm-project/commit/0aeb37324dbb83d442b9222f465cece691fe29e0.diff

LOG: [SimpleLoopUnswitch] Re-fix introduction of UB when hoisted condition may be undef or poison

https://bugs.llvm.org/show_bug.cgi?id=27506
https://bugs.llvm.org/show_bug.cgi?id=31652
https://bugs.llvm.org/show_bug.cgi?id=51043

The bug reports above are caused by problems in SimpleLoopUnswitch, which should transform:

```
while (...) {
  if (C) { A }
  else   { B }
}
Into:

C' = freeze(C)
if (C') {
  while (...) { A }
} else {
  while (...) { B }
}
```
This problem can be solved by adding a freeze on hoisted branch conditions (the transform above), and it was previously solved by D29015.
However, D29015 has since been reverted because of a performance regression (https://github.com/llvm/llvm-project/commit/2b5a8976514de326bb84f0913d9d451089c11d22).

It is not the first time that an added freeze has caused performance regression.
SimplifyCFG also had a problem with UB caused by branching-on-undef, which was solved by adding freeze to the branching condition. (D104569)
Performance regression occurred in D104569, and patches such as D105344 and D105392 were written to minimize it.

This patch corrects SimpleLoopUnswitch in the same way that D104569 handled SimplifyCFG, while minimizing the performance loss by introducing changes like those in D105344 and D105392. (This patch was rebased with the author's permission.)

Reviewed By: reames

Differential Revision: https://reviews.llvm.org/D106041

Added: 
    llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-freeze.ll

Modified: 
    llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index c768f35eda5a2..9752f521bb241 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -28,6 +28,7 @@
 #include "llvm/Analysis/MemorySSAUpdater.h"
 #include "llvm/Analysis/MustExecute.h"
 #include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
@@ -109,6 +110,10 @@ static cl::opt<unsigned>
                   cl::desc("Max number of memory uses to explore during "
                            "partial unswitching analysis"),
                   cl::init(100), cl::Hidden);
+static cl::opt<bool> FreezeLoopUnswitchCond(
+    "freeze-loop-unswitch-cond", cl::init(false), cl::Hidden,
+    cl::desc("If enabled, the freeze instruction will be added to condition "
+             "of loop unswitch to prevent miscompilation."));
 
 /// Collect all of the loop invariant input values transitively used by the
 /// homogeneous instruction graph from a given root.
@@ -196,15 +201,15 @@ static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB,
 /// Copy a set of loop invariant values \p ToDuplicate and insert them at the
 /// end of \p BB and conditionally branch on the copied condition. We only
 /// branch on a single value.
-static void buildPartialUnswitchConditionalBranch(BasicBlock &BB,
-                                                  ArrayRef<Value *> Invariants,
-                                                  bool Direction,
-                                                  BasicBlock &UnswitchedSucc,
-                                                  BasicBlock &NormalSucc) {
+static void buildPartialUnswitchConditionalBranch(
+    BasicBlock &BB, ArrayRef<Value *> Invariants, bool Direction,
+    BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze) {
   IRBuilder<> IRB(&BB);
 
   Value *Cond = Direction ? IRB.CreateOr(Invariants) :
     IRB.CreateAnd(Invariants);
+  if (InsertFreeze)
+    Cond = IRB.CreateFreeze(Cond, Cond->getName() + ".fr");
   IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc,
                    Direction ? &NormalSucc : &UnswitchedSucc);
 }
@@ -565,7 +570,7 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
              "Must have an `and` of `i1`s or `select i1 X, Y, false`s for the"
              " condition!");
     buildPartialUnswitchConditionalBranch(*OldPH, Invariants, ExitDirection,
-                                          *UnswitchedBB, *NewPH);
+                                          *UnswitchedBB, *NewPH, false);
   }
 
   // Update the dominator tree with the added edge.
@@ -2124,6 +2129,13 @@ static void unswitchNontrivialInvariants(
       SE->forgetTopmostLoop(&L);
   }
 
+  bool InsertFreeze = false;
+  if (FreezeLoopUnswitchCond) {
+    ICFLoopSafetyInfo SafetyInfo;
+    SafetyInfo.computeLoopSafetyInfo(&L);
+    InsertFreeze = !SafetyInfo.isGuaranteedToExecute(TI, &DT, &L);
+  }
+
   // If the edge from this terminator to a successor dominates that successor,
   // store a map from each block in its dominator subtree to it. This lets us
   // tell when cloning for a particular successor if a block is dominated by
@@ -2198,6 +2210,11 @@ static void unswitchNontrivialInvariants(
       BasicBlock *ClonedPH = ClonedPHs.begin()->second;
       BI->setSuccessor(ClonedSucc, ClonedPH);
       BI->setSuccessor(1 - ClonedSucc, LoopPH);
+      if (InsertFreeze) {
+        auto Cond = BI->getCondition();
+        if (!isGuaranteedNotToBeUndefOrPoison(Cond, &AC, BI, &DT))
+          BI->setCondition(new FreezeInst(Cond, Cond->getName() + ".fr", BI));
+      }
       DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
     } else {
       assert(SI && "Must either be a branch or switch!");
@@ -2212,6 +2229,11 @@ static void unswitchNontrivialInvariants(
         else
           Case.setSuccessor(ClonedPHs.find(Case.getCaseSuccessor())->second);
 
+      if (InsertFreeze) {
+        auto Cond = SI->getCondition();
+        if (!isGuaranteedNotToBeUndefOrPoison(Cond, &AC, SI, &DT))
+          SI->setCondition(new FreezeInst(Cond, Cond->getName() + ".fr", SI));
+      }
       // We need to use the set to populate domtree updates as even when there
       // are multiple cases pointing at the same successor we only want to
       // remove and insert one edge in the domtree.
@@ -2292,7 +2314,7 @@ static void unswitchNontrivialInvariants(
           *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU);
     else
       buildPartialUnswitchConditionalBranch(*SplitBB, Invariants, Direction,
-                                            *ClonedPH, *LoopPH);
+                                            *ClonedPH, *LoopPH, InsertFreeze);
     DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
 
     if (MSSAU) {

diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-freeze.ll b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-freeze.ll
new file mode 100644
index 0000000000000..f8b5661f8b459
--- /dev/null
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-freeze.ll
@@ -0,0 +1,2330 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -freeze-loop-unswitch-cond -passes='loop(simple-loop-unswitch<nontrivial>),verify<loops>' -S < %s | FileCheck %s
+; RUN: opt -freeze-loop-unswitch-cond -passes='loop-mssa(simple-loop-unswitch<nontrivial>),verify<loops>' -S < %s | FileCheck %s
+; RUN: opt -freeze-loop-unswitch-cond -simple-loop-unswitch -enable-nontrivial-unswitch -verify-memoryssa -S < %s | FileCheck %s
+
+declare i32 @a()
+declare i32 @b()
+declare i32 @c()
+declare i32 @d()
+
+declare void @sink1(i32)
+declare void @sink2(i32)
+declare void @sink3(i1)
+declare void @sink4(i1)
+
+declare i1 @cond()
+declare i32 @cond.i32()
+
+declare i32 @__CxxFrameHandler3(...)
+
+define i32 @test1_freeze(i1* %ptr0, i1* %ptr1, i1* %ptr2) {
+; CHECK-LABEL: @test1_freeze(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND1:%.*]] = load i1, i1* [[PTR1:%.*]], align 1
+; CHECK-NEXT:    [[COND2:%.*]] = load i1, i1* [[PTR2:%.*]], align 1
+; CHECK-NEXT:    br i1 [[COND1]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split.us:
+; CHECK-NEXT:    br label [[LOOP_BEGIN_US:%.*]]
+; CHECK:       loop_begin.us:
+; CHECK-NEXT:    br label [[LOOP_A_US:%.*]]
+; CHECK:       loop_a.us:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @a()
+; CHECK-NEXT:    br label [[LATCH_US:%.*]]
+; CHECK:       latch.us:
+; CHECK-NEXT:    [[V_US:%.*]] = load i1, i1* [[PTR0:%.*]], align 1
+; CHECK-NEXT:    br i1 [[V_US]], label [[LOOP_BEGIN_US]], label [[LOOP_EXIT_SPLIT_US:%.*]]
+; CHECK:       loop_exit.split.us:
+; CHECK-NEXT:    br label [[LOOP_EXIT:%.*]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    [[COND2_FR:%.*]] = freeze i1 [[COND2]]
+; CHECK-NEXT:    br i1 [[COND2_FR]], label [[ENTRY_SPLIT_SPLIT_US:%.*]], label [[ENTRY_SPLIT_SPLIT:%.*]]
+; CHECK:       entry.split.split.us:
+; CHECK-NEXT:    br label [[LOOP_BEGIN_US1:%.*]]
+; CHECK:       loop_begin.us1:
+; CHECK-NEXT:    br label [[LOOP_B_US:%.*]]
+; CHECK:       loop_b.us:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @b()
+; CHECK-NEXT:    br label [[LOOP_B_A_US:%.*]]
+; CHECK:       loop_b_a.us:
+; CHECK-NEXT:    call void @sink3(i1 true)
+; CHECK-NEXT:    br label [[LATCH_US2:%.*]]
+; CHECK:       latch.us2:
+; CHECK-NEXT:    [[V_US3:%.*]] = load i1, i1* [[PTR0]], align 1
+; CHECK-NEXT:    br i1 [[V_US3]], label [[LOOP_BEGIN_US1]], label [[LOOP_EXIT_SPLIT_SPLIT_US:%.*]]
+; CHECK:       loop_exit.split.split.us:
+; CHECK-NEXT:    br label [[LOOP_EXIT_SPLIT:%.*]]
+; CHECK:       entry.split.split:
+; CHECK-NEXT:    br label [[LOOP_BEGIN:%.*]]
+; CHECK:       loop_begin:
+; CHECK-NEXT:    br label [[LOOP_B:%.*]]
+; CHECK:       loop_b:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @b()
+; CHECK-NEXT:    br label [[LOOP_B_B:%.*]]
+; CHECK:       loop_b_b:
+; CHECK-NEXT:    call void @sink4(i1 false)
+; CHECK-NEXT:    br label [[LATCH:%.*]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[V:%.*]] = load i1, i1* [[PTR0]], align 1
+; CHECK-NEXT:    br i1 [[V]], label [[LOOP_BEGIN]], label [[LOOP_EXIT_SPLIT_SPLIT:%.*]]
+; CHECK:       loop_exit.split.split:
+; CHECK-NEXT:    br label [[LOOP_EXIT_SPLIT]]
+; CHECK:       loop_exit.split:
+; CHECK-NEXT:    br label [[LOOP_EXIT]]
+; CHECK:       loop_exit:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %cond1 = load i1, i1* %ptr1
+  %cond2 = load i1, i1* %ptr2
+  br label %loop_begin
+
+loop_begin:
+  br i1 %cond1, label %loop_a, label %loop_b
+
+loop_a:
+  call i32 @a()
+  br label %latch
+; The 'loop_a' unswitched loop.
+
+loop_b:
+  call i32 @b()
+  br i1 %cond2, label %loop_b_a, label %loop_b_b
+; The second unswitched condition.
+
+loop_b_a:
+  call void @sink3(i1 %cond2)
+  br label %latch
+; The 'loop_b_a' unswitched loop.
+; %cond2 is replaced with true
+
+loop_b_b:
+  call void @sink4(i1 %cond2)
+  br label %latch
+; The 'loop_b_b' unswitched loop.
+; %cond2 is replaced with false
+
+latch:
+  %v = load i1, i1* %ptr0
+  br i1 %v, label %loop_begin, label %loop_exit
+
+loop_exit:
+  ret i32 0
+}
+
+; Test that when unswitching a deeply nested loop condition in a way that
+; produces a non-loop clone that can reach multiple exit blocks which are part
+; of different outer loops we correctly divide the cloned loop blocks between
+; the outer loops based on reachability.
+define i32 @test7a(i1* %ptr, i1* %cond.ptr, i32* %a.ptr, i32* %b.ptr) {
+; CHECK-LABEL: @test7a(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP_BEGIN:%.*]]
+; CHECK:       loop_begin:
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* [[A_PTR:%.*]], align 4
+; CHECK-NEXT:    br label [[INNER_LOOP_BEGIN:%.*]]
+; CHECK:       inner_loop_begin:
+; CHECK-NEXT:    [[A_PHI:%.*]] = phi i32 [ [[A]], [[LOOP_BEGIN]] ], [ [[A2:%.*]], [[INNER_INNER_LOOP_EXIT:%.*]] ]
+; CHECK-NEXT:    [[COND:%.*]] = load i1, i1* [[COND_PTR:%.*]], align 1
+; CHECK-NEXT:    [[B:%.*]] = load i32, i32* [[B_PTR:%.*]], align 4
+; CHECK-NEXT:    [[COND_FR:%.*]] = freeze i1 [[COND]]
+; CHECK-NEXT:    br i1 [[COND_FR]], label [[INNER_LOOP_BEGIN_SPLIT_US:%.*]], label [[INNER_LOOP_BEGIN_SPLIT:%.*]]
+; CHECK:       inner_loop_begin.split.us:
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_BEGIN_US:%.*]]
+; CHECK:       inner_inner_loop_begin.us:
+; CHECK-NEXT:    [[V1_US:%.*]] = load i1, i1* [[PTR:%.*]], align 1
+; CHECK-NEXT:    br i1 [[V1_US]], label [[INNER_INNER_LOOP_A_US:%.*]], label [[INNER_INNER_LOOP_B_US:%.*]]
+; CHECK:       inner_inner_loop_b.us:
+; CHECK-NEXT:    [[V3_US:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V3_US]], label [[INNER_INNER_LOOP_EXIT_SPLIT_US:%.*]], label [[INNER_INNER_LOOP_C_US_LOOPEXIT:%.*]]
+; CHECK:       inner_inner_loop_a.us:
+; CHECK-NEXT:    [[A_PHI_LCSSA10:%.*]] = phi i32 [ [[A_PHI]], [[INNER_INNER_LOOP_BEGIN_US]] ]
+; CHECK-NEXT:    [[B_LCSSA6:%.*]] = phi i32 [ [[B]], [[INNER_INNER_LOOP_BEGIN_US]] ]
+; CHECK-NEXT:    [[V2_US:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V2_US]], label [[LOOP_EXIT_SPLIT_US:%.*]], label [[INNER_INNER_LOOP_C_US:%.*]]
+; CHECK:       inner_inner_loop_c.us.loopexit:
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_C_US]]
+; CHECK:       inner_inner_loop_c.us:
+; CHECK-NEXT:    [[V4_US:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V4_US]], label [[INNER_LOOP_EXIT_LOOPEXIT_SPLIT_US:%.*]], label [[INNER_INNER_LOOP_D_US:%.*]]
+; CHECK:       inner_inner_loop_d.us:
+; CHECK-NEXT:    br label [[INNER_LOOP_EXIT_LOOPEXIT_SPLIT_US]]
+; CHECK:       inner_inner_loop_exit.split.us:
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_EXIT]]
+; CHECK:       loop_exit.split.us:
+; CHECK-NEXT:    [[A_LCSSA_US:%.*]] = phi i32 [ [[A_PHI_LCSSA10]], [[INNER_INNER_LOOP_A_US]] ]
+; CHECK-NEXT:    [[B_LCSSA_US:%.*]] = phi i32 [ [[B_LCSSA6]], [[INNER_INNER_LOOP_A_US]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT:%.*]]
+; CHECK:       inner_loop_exit.loopexit.split.us:
+; CHECK-NEXT:    br label [[INNER_LOOP_EXIT_LOOPEXIT:%.*]]
+; CHECK:       inner_loop_begin.split:
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_BEGIN:%.*]]
+; CHECK:       inner_inner_loop_begin:
+; CHECK-NEXT:    [[V1:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V1]], label [[INNER_INNER_LOOP_A:%.*]], label [[INNER_INNER_LOOP_B:%.*]]
+; CHECK:       inner_inner_loop_a:
+; CHECK-NEXT:    [[V2:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V2]], label [[LOOP_EXIT_SPLIT:%.*]], label [[INNER_INNER_LOOP_C:%.*]]
+; CHECK:       inner_inner_loop_b:
+; CHECK-NEXT:    [[V3:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V3]], label [[INNER_INNER_LOOP_EXIT_SPLIT:%.*]], label [[INNER_INNER_LOOP_C]]
+; CHECK:       inner_inner_loop_c:
+; CHECK-NEXT:    [[V4:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V4]], label [[INNER_LOOP_EXIT_LOOPEXIT_SPLIT:%.*]], label [[INNER_INNER_LOOP_D:%.*]]
+; CHECK:       inner_inner_loop_d:
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_BEGIN]]
+; CHECK:       inner_inner_loop_exit.split:
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_EXIT]]
+; CHECK:       inner_inner_loop_exit:
+; CHECK-NEXT:    [[A2]] = load i32, i32* [[A_PTR]], align 4
+; CHECK-NEXT:    [[V5:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V5]], label [[INNER_LOOP_EXIT_LOOPEXIT1:%.*]], label [[INNER_LOOP_BEGIN]]
+; CHECK:       inner_loop_exit.loopexit.split:
+; CHECK-NEXT:    br label [[INNER_LOOP_EXIT_LOOPEXIT]]
+; CHECK:       inner_loop_exit.loopexit:
+; CHECK-NEXT:    br label [[INNER_LOOP_EXIT:%.*]]
+; CHECK:       inner_loop_exit.loopexit1:
+; CHECK-NEXT:    br label [[INNER_LOOP_EXIT]]
+; CHECK:       inner_loop_exit:
+; CHECK-NEXT:    br label [[LOOP_BEGIN]]
+; CHECK:       loop_exit.split:
+; CHECK-NEXT:    [[A_LCSSA:%.*]] = phi i32 [ [[A_PHI]], [[INNER_INNER_LOOP_A]] ]
+; CHECK-NEXT:    [[B_LCSSA:%.*]] = phi i32 [ [[B]], [[INNER_INNER_LOOP_A]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT]]
+; CHECK:       loop_exit:
+; CHECK-NEXT:    [[DOTUS_PHI:%.*]] = phi i32 [ [[A_LCSSA]], [[LOOP_EXIT_SPLIT]] ], [ [[A_LCSSA_US]], [[LOOP_EXIT_SPLIT_US]] ]
+; CHECK-NEXT:    [[DOTUS_PHI2:%.*]] = phi i32 [ [[B_LCSSA]], [[LOOP_EXIT_SPLIT]] ], [ [[B_LCSSA_US]], [[LOOP_EXIT_SPLIT_US]] ]
+; CHECK-NEXT:    [[RESULT:%.*]] = add i32 [[DOTUS_PHI]], [[DOTUS_PHI2]]
+; CHECK-NEXT:    ret i32 [[RESULT]]
+;
+entry:
+  br label %loop_begin
+
+loop_begin:
+  %a = load i32, i32* %a.ptr
+  br label %inner_loop_begin
+
+inner_loop_begin:
+  %a.phi = phi i32 [ %a, %loop_begin ], [ %a2, %inner_inner_loop_exit ]
+  %cond = load i1, i1* %cond.ptr
+  %b = load i32, i32* %b.ptr
+  br label %inner_inner_loop_begin
+
+inner_inner_loop_begin:
+  %v1 = load i1, i1* %ptr
+  br i1 %v1, label %inner_inner_loop_a, label %inner_inner_loop_b
+
+inner_inner_loop_a:
+  %v2 = load i1, i1* %ptr
+  br i1 %v2, label %loop_exit, label %inner_inner_loop_c
+
+inner_inner_loop_b:
+  %v3 = load i1, i1* %ptr
+  br i1 %v3, label %inner_inner_loop_exit, label %inner_inner_loop_c
+
+inner_inner_loop_c:
+  %v4 = load i1, i1* %ptr
+  br i1 %v4, label %inner_loop_exit, label %inner_inner_loop_d
+
+inner_inner_loop_d:
+  br i1 %cond, label %inner_loop_exit, label %inner_inner_loop_begin
+; The cloned copy that always exits with the adjustments required to fix up
+; loop exits.
+; The original copy that continues to loop.
+
+inner_inner_loop_exit:
+  %a2 = load i32, i32* %a.ptr
+  %v5 = load i1, i1* %ptr
+  br i1 %v5, label %inner_loop_exit, label %inner_loop_begin
+
+inner_loop_exit:
+  br label %loop_begin
+
+loop_exit:
+  %a.lcssa = phi i32 [ %a.phi, %inner_inner_loop_a ]
+  %b.lcssa = phi i32 [ %b, %inner_inner_loop_a ]
+  %result = add i32 %a.lcssa, %b.lcssa
+  ret i32 %result
+}
+
+; Same pattern as @test7a but here the original loop becomes a non-loop that
+; can reach multiple exit blocks which are part of different outer loops.
+define i32 @test7b(i1* %ptr, i1* %cond.ptr, i32* %a.ptr, i32* %b.ptr) {
+; CHECK-LABEL: @test7b(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP_BEGIN:%.*]]
+; CHECK:       loop_begin:
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* [[A_PTR:%.*]], align 4
+; CHECK-NEXT:    br label [[INNER_LOOP_BEGIN:%.*]]
+; CHECK:       inner_loop_begin:
+; CHECK-NEXT:    [[A_PHI:%.*]] = phi i32 [ [[A]], [[LOOP_BEGIN]] ], [ [[A2:%.*]], [[INNER_INNER_LOOP_EXIT:%.*]] ]
+; CHECK-NEXT:    [[COND:%.*]] = load i1, i1* [[COND_PTR:%.*]], align 1
+; CHECK-NEXT:    [[B:%.*]] = load i32, i32* [[B_PTR:%.*]], align 4
+; CHECK-NEXT:    [[COND_FR:%.*]] = freeze i1 [[COND]]
+; CHECK-NEXT:    br i1 [[COND_FR]], label [[INNER_LOOP_BEGIN_SPLIT_US:%.*]], label [[INNER_LOOP_BEGIN_SPLIT:%.*]]
+; CHECK:       inner_loop_begin.split.us:
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_BEGIN_US:%.*]]
+; CHECK:       inner_inner_loop_begin.us:
+; CHECK-NEXT:    [[V1_US:%.*]] = load i1, i1* [[PTR:%.*]], align 1
+; CHECK-NEXT:    br i1 [[V1_US]], label [[INNER_INNER_LOOP_A_US:%.*]], label [[INNER_INNER_LOOP_B_US:%.*]]
+; CHECK:       inner_inner_loop_b.us:
+; CHECK-NEXT:    [[V3_US:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V3_US]], label [[INNER_INNER_LOOP_EXIT_SPLIT_US:%.*]], label [[INNER_INNER_LOOP_C_US:%.*]]
+; CHECK:       inner_inner_loop_a.us:
+; CHECK-NEXT:    [[V2_US:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V2_US]], label [[LOOP_EXIT_SPLIT_US:%.*]], label [[INNER_INNER_LOOP_C_US]]
+; CHECK:       inner_inner_loop_c.us:
+; CHECK-NEXT:    [[V4_US:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V4_US]], label [[INNER_LOOP_EXIT_LOOPEXIT_SPLIT_US:%.*]], label [[INNER_INNER_LOOP_D_US:%.*]]
+; CHECK:       inner_inner_loop_d.us:
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_BEGIN_US]]
+; CHECK:       inner_inner_loop_exit.split.us:
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_EXIT]]
+; CHECK:       loop_exit.split.us:
+; CHECK-NEXT:    [[A_LCSSA_US:%.*]] = phi i32 [ [[A_PHI]], [[INNER_INNER_LOOP_A_US]] ]
+; CHECK-NEXT:    [[B_LCSSA_US:%.*]] = phi i32 [ [[B]], [[INNER_INNER_LOOP_A_US]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT:%.*]]
+; CHECK:       inner_loop_exit.loopexit.split.us:
+; CHECK-NEXT:    br label [[INNER_LOOP_EXIT_LOOPEXIT:%.*]]
+; CHECK:       inner_loop_begin.split:
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_BEGIN:%.*]]
+; CHECK:       inner_inner_loop_begin:
+; CHECK-NEXT:    [[V1:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V1]], label [[INNER_INNER_LOOP_A:%.*]], label [[INNER_INNER_LOOP_B:%.*]]
+; CHECK:       inner_inner_loop_a:
+; CHECK-NEXT:    [[A_PHI_LCSSA:%.*]] = phi i32 [ [[A_PHI]], [[INNER_INNER_LOOP_BEGIN]] ]
+; CHECK-NEXT:    [[B_LCSSA3:%.*]] = phi i32 [ [[B]], [[INNER_INNER_LOOP_BEGIN]] ]
+; CHECK-NEXT:    [[V2:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V2]], label [[LOOP_EXIT_SPLIT:%.*]], label [[INNER_INNER_LOOP_C:%.*]]
+; CHECK:       inner_inner_loop_b:
+; CHECK-NEXT:    [[V3:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V3]], label [[INNER_INNER_LOOP_EXIT_SPLIT:%.*]], label [[INNER_INNER_LOOP_C_LOOPEXIT:%.*]]
+; CHECK:       inner_inner_loop_c.loopexit:
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_C]]
+; CHECK:       inner_inner_loop_c:
+; CHECK-NEXT:    [[V4:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V4]], label [[INNER_LOOP_EXIT_LOOPEXIT_SPLIT:%.*]], label [[INNER_INNER_LOOP_D:%.*]]
+; CHECK:       inner_inner_loop_d:
+; CHECK-NEXT:    br label [[INNER_LOOP_EXIT_LOOPEXIT_SPLIT]]
+; CHECK:       inner_inner_loop_exit.split:
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_EXIT]]
+; CHECK:       inner_inner_loop_exit:
+; CHECK-NEXT:    [[A2]] = load i32, i32* [[A_PTR]], align 4
+; CHECK-NEXT:    [[V5:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V5]], label [[INNER_LOOP_EXIT_LOOPEXIT1:%.*]], label [[INNER_LOOP_BEGIN]]
+; CHECK:       inner_loop_exit.loopexit.split:
+; CHECK-NEXT:    br label [[INNER_LOOP_EXIT_LOOPEXIT]]
+; CHECK:       inner_loop_exit.loopexit:
+; CHECK-NEXT:    br label [[INNER_LOOP_EXIT:%.*]]
+; CHECK:       inner_loop_exit.loopexit1:
+; CHECK-NEXT:    br label [[INNER_LOOP_EXIT]]
+; CHECK:       inner_loop_exit:
+; CHECK-NEXT:    br label [[LOOP_BEGIN]]
+; CHECK:       loop_exit.split:
+; CHECK-NEXT:    [[A_LCSSA:%.*]] = phi i32 [ [[A_PHI_LCSSA]], [[INNER_INNER_LOOP_A]] ]
+; CHECK-NEXT:    [[B_LCSSA:%.*]] = phi i32 [ [[B_LCSSA3]], [[INNER_INNER_LOOP_A]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT]]
+; CHECK:       loop_exit:
+; CHECK-NEXT:    [[DOTUS_PHI:%.*]] = phi i32 [ [[A_LCSSA]], [[LOOP_EXIT_SPLIT]] ], [ [[A_LCSSA_US]], [[LOOP_EXIT_SPLIT_US]] ]
+; CHECK-NEXT:    [[DOTUS_PHI2:%.*]] = phi i32 [ [[B_LCSSA]], [[LOOP_EXIT_SPLIT]] ], [ [[B_LCSSA_US]], [[LOOP_EXIT_SPLIT_US]] ]
+; CHECK-NEXT:    [[RESULT:%.*]] = add i32 [[DOTUS_PHI]], [[DOTUS_PHI2]]
+; CHECK-NEXT:    ret i32 [[RESULT]]
+;
+entry:
+  br label %loop_begin
+
+loop_begin:
+  %a = load i32, i32* %a.ptr
+  br label %inner_loop_begin
+
+inner_loop_begin:
+  %a.phi = phi i32 [ %a, %loop_begin ], [ %a2, %inner_inner_loop_exit ]
+  %cond = load i1, i1* %cond.ptr
+  %b = load i32, i32* %b.ptr
+  br label %inner_inner_loop_begin
+
+inner_inner_loop_begin:
+  %v1 = load i1, i1* %ptr
+  br i1 %v1, label %inner_inner_loop_a, label %inner_inner_loop_b
+
+inner_inner_loop_a:
+  %v2 = load i1, i1* %ptr
+  br i1 %v2, label %loop_exit, label %inner_inner_loop_c
+
+inner_inner_loop_b:
+  %v3 = load i1, i1* %ptr
+  br i1 %v3, label %inner_inner_loop_exit, label %inner_inner_loop_c
+
+inner_inner_loop_c:
+  %v4 = load i1, i1* %ptr
+  br i1 %v4, label %inner_loop_exit, label %inner_inner_loop_d
+
+inner_inner_loop_d:
+  br i1 %cond, label %inner_inner_loop_begin, label %inner_loop_exit
+; The cloned copy that continues looping.
+; The original copy that now always exits and needs adjustments for exit
+; blocks.
+
+inner_inner_loop_exit:
+  %a2 = load i32, i32* %a.ptr
+  %v5 = load i1, i1* %ptr
+  br i1 %v5, label %inner_loop_exit, label %inner_loop_begin
+
+inner_loop_exit:
+  br label %loop_begin
+
+loop_exit:
+  %a.lcssa = phi i32 [ %a.phi, %inner_inner_loop_a ]
+  %b.lcssa = phi i32 [ %b, %inner_inner_loop_a ]
+  %result = add i32 %a.lcssa, %b.lcssa
+  ret i32 %result
+}
+
+; Test that when the exit block set of an inner loop changes to start at a less
+; high level of the loop nest we correctly hoist the loop up the nest.
+define i32 @test8a(i1* %ptr, i1* %cond.ptr, i32* %a.ptr, i32* %b.ptr) {
+; CHECK-LABEL: @test8a(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP_BEGIN:%.*]]
+; CHECK:       loop_begin:
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* [[A_PTR:%.*]], align 4
+; CHECK-NEXT:    br label [[INNER_LOOP_BEGIN:%.*]]
+; CHECK:       inner_loop_begin:
+; CHECK-NEXT:    [[A_PHI:%.*]] = phi i32 [ [[A]], [[LOOP_BEGIN]] ], [ [[A2:%.*]], [[INNER_INNER_LOOP_EXIT:%.*]] ]
+; CHECK-NEXT:    [[COND:%.*]] = load i1, i1* [[COND_PTR:%.*]], align 1
+; CHECK-NEXT:    [[B:%.*]] = load i32, i32* [[B_PTR:%.*]], align 4
+; CHECK-NEXT:    [[COND_FR:%.*]] = freeze i1 [[COND]]
+; CHECK-NEXT:    br i1 [[COND_FR]], label [[INNER_LOOP_BEGIN_SPLIT_US:%.*]], label [[INNER_LOOP_BEGIN_SPLIT:%.*]]
+; CHECK:       inner_loop_begin.split.us:
+; CHECK-NEXT:    [[A_PHI_LCSSA4:%.*]] = phi i32 [ [[A_PHI]], [[INNER_LOOP_BEGIN]] ]
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_BEGIN_US:%.*]]
+; CHECK:       inner_inner_loop_begin.us:
+; CHECK-NEXT:    [[V1_US:%.*]] = load i1, i1* [[PTR:%.*]], align 1
+; CHECK-NEXT:    br i1 [[V1_US]], label [[INNER_INNER_LOOP_A_US:%.*]], label [[INNER_INNER_LOOP_B_US:%.*]]
+; CHECK:       inner_inner_loop_b.us:
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_LATCH_US:%.*]]
+; CHECK:       inner_inner_loop_a.us:
+; CHECK-NEXT:    [[V2_US:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V2_US]], label [[INNER_INNER_LOOP_LATCH_US]], label [[INNER_LOOP_EXIT_LOOPEXIT_SPLIT_US:%.*]]
+; CHECK:       inner_inner_loop_latch.us:
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_BEGIN_US]]
+; CHECK:       inner_loop_exit.loopexit.split.us:
+; CHECK-NEXT:    [[A_PHI_LCSSA2_US:%.*]] = phi i32 [ [[A_PHI_LCSSA4]], [[INNER_INNER_LOOP_A_US]] ]
+; CHECK-NEXT:    br label [[INNER_LOOP_EXIT_LOOPEXIT:%.*]]
+; CHECK:       inner_loop_begin.split:
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_BEGIN:%.*]]
+; CHECK:       inner_inner_loop_begin:
+; CHECK-NEXT:    [[V1:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V1]], label [[INNER_INNER_LOOP_A:%.*]], label [[INNER_INNER_LOOP_B:%.*]]
+; CHECK:       inner_inner_loop_a:
+; CHECK-NEXT:    [[V2:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V2]], label [[INNER_INNER_LOOP_LATCH:%.*]], label [[INNER_LOOP_EXIT_LOOPEXIT_SPLIT:%.*]]
+; CHECK:       inner_inner_loop_b:
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_EXIT]]
+; CHECK:       inner_inner_loop_latch:
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_BEGIN]]
+; CHECK:       inner_inner_loop_exit:
+; CHECK-NEXT:    [[A2]] = load i32, i32* [[A_PTR]], align 4
+; CHECK-NEXT:    [[V4:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V4]], label [[INNER_LOOP_EXIT_LOOPEXIT1:%.*]], label [[INNER_LOOP_BEGIN]]
+; CHECK:       inner_loop_exit.loopexit.split:
+; CHECK-NEXT:    [[A_PHI_LCSSA2:%.*]] = phi i32 [ [[A_PHI]], [[INNER_INNER_LOOP_A]] ]
+; CHECK-NEXT:    br label [[INNER_LOOP_EXIT_LOOPEXIT]]
+; CHECK:       inner_loop_exit.loopexit:
+; CHECK-NEXT:    [[DOTUS_PHI:%.*]] = phi i32 [ [[A_PHI_LCSSA2]], [[INNER_LOOP_EXIT_LOOPEXIT_SPLIT]] ], [ [[A_PHI_LCSSA2_US]], [[INNER_LOOP_EXIT_LOOPEXIT_SPLIT_US]] ]
+; CHECK-NEXT:    br label [[INNER_LOOP_EXIT:%.*]]
+; CHECK:       inner_loop_exit.loopexit1:
+; CHECK-NEXT:    [[A_PHI_LCSSA:%.*]] = phi i32 [ [[A_PHI]], [[INNER_INNER_LOOP_EXIT]] ]
+; CHECK-NEXT:    br label [[INNER_LOOP_EXIT]]
+; CHECK:       inner_loop_exit:
+; CHECK-NEXT:    [[A_PHI3:%.*]] = phi i32 [ [[A_PHI_LCSSA]], [[INNER_LOOP_EXIT_LOOPEXIT1]] ], [ [[DOTUS_PHI]], [[INNER_LOOP_EXIT_LOOPEXIT]] ]
+; CHECK-NEXT:    [[V5:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V5]], label [[LOOP_EXIT:%.*]], label [[LOOP_BEGIN]]
+; CHECK:       loop_exit:
+; CHECK-NEXT:    [[A_LCSSA:%.*]] = phi i32 [ [[A_PHI3]], [[INNER_LOOP_EXIT]] ]
+; CHECK-NEXT:    ret i32 [[A_LCSSA]]
+;
+entry:
+  br label %loop_begin
+
+loop_begin:
+  %a = load i32, i32* %a.ptr
+  br label %inner_loop_begin
+
+inner_loop_begin:
+  %a.phi = phi i32 [ %a, %loop_begin ], [ %a2, %inner_inner_loop_exit ]
+  %cond = load i1, i1* %cond.ptr
+  %b = load i32, i32* %b.ptr
+  br label %inner_inner_loop_begin
+
+inner_inner_loop_begin:
+  %v1 = load i1, i1* %ptr
+  br i1 %v1, label %inner_inner_loop_a, label %inner_inner_loop_b
+
+inner_inner_loop_a:
+  %v2 = load i1, i1* %ptr
+  br i1 %v2, label %inner_inner_loop_latch, label %inner_loop_exit
+
+inner_inner_loop_b:
+  br i1 %cond, label %inner_inner_loop_latch, label %inner_inner_loop_exit
+
+inner_inner_loop_latch:
+  br label %inner_inner_loop_begin
+; The cloned region is now an exit from the inner loop.
+; The original region exits the loop earlier.
+
+inner_inner_loop_exit:
+  %a2 = load i32, i32* %a.ptr
+  %v4 = load i1, i1* %ptr
+  br i1 %v4, label %inner_loop_exit, label %inner_loop_begin
+
+inner_loop_exit:
+  %v5 = load i1, i1* %ptr
+  br i1 %v5, label %loop_exit, label %loop_begin
+
+loop_exit:
+  %a.lcssa = phi i32 [ %a.phi, %inner_loop_exit ]
+  ret i32 %a.lcssa
+}
+
+; Same pattern as @test8a but where the original loop loses an exit block and
+; needs to be hoisted up the nest.
+define i32 @test8b(i1* %ptr, i1* %cond.ptr, i32* %a.ptr, i32* %b.ptr) {
+; CHECK-LABEL: @test8b(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP_BEGIN:%.*]]
+; CHECK:       loop_begin:
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* [[A_PTR:%.*]], align 4
+; CHECK-NEXT:    br label [[INNER_LOOP_BEGIN:%.*]]
+; CHECK:       inner_loop_begin:
+; CHECK-NEXT:    [[A_PHI:%.*]] = phi i32 [ [[A]], [[LOOP_BEGIN]] ], [ [[A2:%.*]], [[INNER_INNER_LOOP_EXIT:%.*]] ]
+; CHECK-NEXT:    [[COND:%.*]] = load i1, i1* [[COND_PTR:%.*]], align 1
+; CHECK-NEXT:    [[B:%.*]] = load i32, i32* [[B_PTR:%.*]], align 4
+; CHECK-NEXT:    [[COND_FR:%.*]] = freeze i1 [[COND]]
+; CHECK-NEXT:    br i1 [[COND_FR]], label [[INNER_LOOP_BEGIN_SPLIT_US:%.*]], label [[INNER_LOOP_BEGIN_SPLIT:%.*]]
+; CHECK:       inner_loop_begin.split.us:
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_BEGIN_US:%.*]]
+; CHECK:       inner_inner_loop_begin.us:
+; CHECK-NEXT:    [[V1_US:%.*]] = load i1, i1* [[PTR:%.*]], align 1
+; CHECK-NEXT:    br i1 [[V1_US]], label [[INNER_INNER_LOOP_A_US:%.*]], label [[INNER_INNER_LOOP_B_US:%.*]]
+; CHECK:       inner_inner_loop_b.us:
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_EXIT_SPLIT_US:%.*]]
+; CHECK:       inner_inner_loop_a.us:
+; CHECK-NEXT:    [[V2_US:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V2_US]], label [[INNER_INNER_LOOP_LATCH_US:%.*]], label [[INNER_LOOP_EXIT_LOOPEXIT_SPLIT_US:%.*]]
+; CHECK:       inner_inner_loop_latch.us:
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_BEGIN_US]]
+; CHECK:       inner_inner_loop_exit.split.us:
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_EXIT]]
+; CHECK:       inner_loop_exit.loopexit.split.us:
+; CHECK-NEXT:    [[A_PHI_LCSSA2_US:%.*]] = phi i32 [ [[A_PHI]], [[INNER_INNER_LOOP_A_US]] ]
+; CHECK-NEXT:    br label [[INNER_LOOP_EXIT_LOOPEXIT:%.*]]
+; CHECK:       inner_loop_begin.split:
+; CHECK-NEXT:    [[A_PHI_LCSSA4:%.*]] = phi i32 [ [[A_PHI]], [[INNER_LOOP_BEGIN]] ]
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_BEGIN:%.*]]
+; CHECK:       inner_inner_loop_begin:
+; CHECK-NEXT:    [[V1:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V1]], label [[INNER_INNER_LOOP_A:%.*]], label [[INNER_INNER_LOOP_B:%.*]]
+; CHECK:       inner_inner_loop_a:
+; CHECK-NEXT:    [[V2:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V2]], label [[INNER_INNER_LOOP_LATCH:%.*]], label [[INNER_LOOP_EXIT_LOOPEXIT_SPLIT:%.*]]
+; CHECK:       inner_inner_loop_b:
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_LATCH]]
+; CHECK:       inner_inner_loop_latch:
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_BEGIN]]
+; CHECK:       inner_inner_loop_exit:
+; CHECK-NEXT:    [[A2]] = load i32, i32* [[A_PTR]], align 4
+; CHECK-NEXT:    [[V4:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V4]], label [[INNER_LOOP_EXIT_LOOPEXIT1:%.*]], label [[INNER_LOOP_BEGIN]]
+; CHECK:       inner_loop_exit.loopexit.split:
+; CHECK-NEXT:    [[A_PHI_LCSSA2:%.*]] = phi i32 [ [[A_PHI_LCSSA4]], [[INNER_INNER_LOOP_A]] ]
+; CHECK-NEXT:    br label [[INNER_LOOP_EXIT_LOOPEXIT]]
+; CHECK:       inner_loop_exit.loopexit:
+; CHECK-NEXT:    [[DOTUS_PHI:%.*]] = phi i32 [ [[A_PHI_LCSSA2]], [[INNER_LOOP_EXIT_LOOPEXIT_SPLIT]] ], [ [[A_PHI_LCSSA2_US]], [[INNER_LOOP_EXIT_LOOPEXIT_SPLIT_US]] ]
+; CHECK-NEXT:    br label [[INNER_LOOP_EXIT:%.*]]
+; CHECK:       inner_loop_exit.loopexit1:
+; CHECK-NEXT:    [[A_PHI_LCSSA:%.*]] = phi i32 [ [[A_PHI]], [[INNER_INNER_LOOP_EXIT]] ]
+; CHECK-NEXT:    br label [[INNER_LOOP_EXIT]]
+; CHECK:       inner_loop_exit:
+; CHECK-NEXT:    [[A_PHI3:%.*]] = phi i32 [ [[A_PHI_LCSSA]], [[INNER_LOOP_EXIT_LOOPEXIT1]] ], [ [[DOTUS_PHI]], [[INNER_LOOP_EXIT_LOOPEXIT]] ]
+; CHECK-NEXT:    [[V5:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V5]], label [[LOOP_EXIT:%.*]], label [[LOOP_BEGIN]]
+; CHECK:       loop_exit:
+; CHECK-NEXT:    [[A_LCSSA:%.*]] = phi i32 [ [[A_PHI3]], [[INNER_LOOP_EXIT]] ]
+; CHECK-NEXT:    ret i32 [[A_LCSSA]]
+;
+entry:
+  br label %loop_begin
+
+loop_begin:
+  %a = load i32, i32* %a.ptr
+  br label %inner_loop_begin
+
+inner_loop_begin:
+  %a.phi = phi i32 [ %a, %loop_begin ], [ %a2, %inner_inner_loop_exit ]
+  %cond = load i1, i1* %cond.ptr
+  %b = load i32, i32* %b.ptr
+  br label %inner_inner_loop_begin
+
+inner_inner_loop_begin:
+  %v1 = load i1, i1* %ptr
+  br i1 %v1, label %inner_inner_loop_a, label %inner_inner_loop_b
+
+inner_inner_loop_a:
+  %v2 = load i1, i1* %ptr
+  br i1 %v2, label %inner_inner_loop_latch, label %inner_loop_exit
+
+inner_inner_loop_b:
+  br i1 %cond, label %inner_inner_loop_exit, label %inner_inner_loop_latch
+
+inner_inner_loop_latch:
+  br label %inner_inner_loop_begin
+; The cloned region is similar to before but with one earlier exit.
+; The original region is now an exit in the preheader.
+
+inner_inner_loop_exit:
+  %a2 = load i32, i32* %a.ptr
+  %v4 = load i1, i1* %ptr
+  br i1 %v4, label %inner_loop_exit, label %inner_loop_begin
+
+inner_loop_exit:
+  %v5 = load i1, i1* %ptr
+  br i1 %v5, label %loop_exit, label %loop_begin
+
+loop_exit:
+  %a.lcssa = phi i32 [ %a.phi, %inner_loop_exit ]
+  ret i32 %a.lcssa
+}
+
+; Test that requires re-forming dedicated exits for the cloned loop.
+; The branch on the function argument %cond in loop_b is loop-invariant. The
+; expected output branches once in entry on a frozen copy of %cond and selects
+; between the cloned (.us) loop and the original loop.
+define i32 @test10a(i1* %ptr, i1 %cond, i32* %a.ptr) {
+; CHECK-LABEL: @test10a(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND_FR:%.*]] = freeze i1 [[COND:%.*]]
+; CHECK-NEXT:    br i1 [[COND_FR]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split.us:
+; CHECK-NEXT:    br label [[LOOP_BEGIN_US:%.*]]
+; CHECK:       loop_begin.us:
+; CHECK-NEXT:    [[A_US:%.*]] = load i32, i32* [[A_PTR:%.*]], align 4
+; CHECK-NEXT:    [[V1_US:%.*]] = load i1, i1* [[PTR:%.*]], align 1
+; CHECK-NEXT:    br i1 [[V1_US]], label [[LOOP_A_US:%.*]], label [[LOOP_B_US:%.*]]
+; CHECK:       loop_b.us:
+; CHECK-NEXT:    [[A_US_LCSSA:%.*]] = phi i32 [ [[A_US]], [[LOOP_BEGIN_US]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT_SPLIT_US:%.*]]
+; CHECK:       loop_a.us:
+; CHECK-NEXT:    [[V2_US:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V2_US]], label [[LOOP_EXIT_SPLIT_US_LOOPEXIT:%.*]], label [[LOOP_BEGIN_BACKEDGE_US:%.*]]
+; CHECK:       loop_begin.backedge.us:
+; CHECK-NEXT:    br label [[LOOP_BEGIN_US]]
+; CHECK:       loop_exit.split.us.loopexit:
+; CHECK-NEXT:    [[A_LCSSA_US_PH:%.*]] = phi i32 [ [[A_US]], [[LOOP_A_US]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT_SPLIT_US]]
+; CHECK:       loop_exit.split.us:
+; CHECK-NEXT:    [[A_LCSSA_US:%.*]] = phi i32 [ [[A_US_LCSSA]], [[LOOP_B_US]] ], [ [[A_LCSSA_US_PH]], [[LOOP_EXIT_SPLIT_US_LOOPEXIT]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT:%.*]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[LOOP_BEGIN:%.*]]
+; CHECK:       loop_begin:
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* [[A_PTR]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V1]], label [[LOOP_A:%.*]], label [[LOOP_B:%.*]]
+; CHECK:       loop_a:
+; CHECK-NEXT:    [[V2:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V2]], label [[LOOP_EXIT_SPLIT:%.*]], label [[LOOP_BEGIN_BACKEDGE:%.*]]
+; CHECK:       loop_begin.backedge:
+; CHECK-NEXT:    br label [[LOOP_BEGIN]]
+; CHECK:       loop_b:
+; CHECK-NEXT:    br label [[LOOP_BEGIN_BACKEDGE]]
+; CHECK:       loop_exit.split:
+; CHECK-NEXT:    [[A_LCSSA:%.*]] = phi i32 [ [[A]], [[LOOP_A]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT]]
+; CHECK:       loop_exit:
+; CHECK-NEXT:    [[DOTUS_PHI:%.*]] = phi i32 [ [[A_LCSSA]], [[LOOP_EXIT_SPLIT]] ], [ [[A_LCSSA_US]], [[LOOP_EXIT_SPLIT_US]] ]
+; CHECK-NEXT:    ret i32 [[DOTUS_PHI]]
+;
+entry:
+  br label %loop_begin
+
+loop_begin:
+  %a = load i32, i32* %a.ptr
+  %v1 = load i1, i1* %ptr
+  br i1 %v1, label %loop_a, label %loop_b
+
+loop_a:
+  %v2 = load i1, i1* %ptr
+  br i1 %v2, label %loop_exit, label %loop_begin
+
+loop_b:
+; Invariant branch: true leaves the loop, false goes back to the header.
+  br i1 %cond, label %loop_exit, label %loop_begin
+; Expected output, cloned (.us) loop: loop_b.us is a direct exit to
+; loop_exit.split.us, and the remaining exit from loop_a.us goes through the
+; dedicated block loop_exit.split.us.loopexit.
+
+; Expected output, original loop: loop_b no longer has an edge to the exit and
+; only branches to the backedge block.
+
+loop_exit:
+  %a.lcssa = phi i32 [ %a, %loop_a ], [ %a, %loop_b ]
+  ret i32 %a.lcssa
+}
+
+; Test that requires re-forming dedicated exits for the original loop.
+; Same shape as test10a, but with the successors of the invariant branch in
+; loop_b swapped, so the original (non-.us) loop is the one whose exits change.
+define i32 @test10b(i1* %ptr, i1 %cond, i32* %a.ptr) {
+; CHECK-LABEL: @test10b(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND_FR:%.*]] = freeze i1 [[COND:%.*]]
+; CHECK-NEXT:    br i1 [[COND_FR]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split.us:
+; CHECK-NEXT:    br label [[LOOP_BEGIN_US:%.*]]
+; CHECK:       loop_begin.us:
+; CHECK-NEXT:    [[A_US:%.*]] = load i32, i32* [[A_PTR:%.*]], align 4
+; CHECK-NEXT:    [[V1_US:%.*]] = load i1, i1* [[PTR:%.*]], align 1
+; CHECK-NEXT:    br i1 [[V1_US]], label [[LOOP_A_US:%.*]], label [[LOOP_B_US:%.*]]
+; CHECK:       loop_b.us:
+; CHECK-NEXT:    br label [[LOOP_BEGIN_BACKEDGE_US:%.*]]
+; CHECK:       loop_a.us:
+; CHECK-NEXT:    [[V2_US:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V2_US]], label [[LOOP_BEGIN_BACKEDGE_US]], label [[LOOP_EXIT_SPLIT_US:%.*]]
+; CHECK:       loop_begin.backedge.us:
+; CHECK-NEXT:    br label [[LOOP_BEGIN_US]]
+; CHECK:       loop_exit.split.us:
+; CHECK-NEXT:    [[A_LCSSA_US:%.*]] = phi i32 [ [[A_US]], [[LOOP_A_US]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT:%.*]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[LOOP_BEGIN:%.*]]
+; CHECK:       loop_begin:
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* [[A_PTR]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V1]], label [[LOOP_A:%.*]], label [[LOOP_B:%.*]]
+; CHECK:       loop_a:
+; CHECK-NEXT:    [[V2:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V2]], label [[LOOP_BEGIN_BACKEDGE:%.*]], label [[LOOP_EXIT_SPLIT_LOOPEXIT:%.*]]
+; CHECK:       loop_begin.backedge:
+; CHECK-NEXT:    br label [[LOOP_BEGIN]]
+; CHECK:       loop_b:
+; CHECK-NEXT:    [[A_LCSSA1:%.*]] = phi i32 [ [[A]], [[LOOP_BEGIN]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT_SPLIT:%.*]]
+; CHECK:       loop_exit.split.loopexit:
+; CHECK-NEXT:    [[A_LCSSA_PH:%.*]] = phi i32 [ [[A]], [[LOOP_A]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT_SPLIT]]
+; CHECK:       loop_exit.split:
+; CHECK-NEXT:    [[A_LCSSA:%.*]] = phi i32 [ [[A_LCSSA1]], [[LOOP_B]] ], [ [[A_LCSSA_PH]], [[LOOP_EXIT_SPLIT_LOOPEXIT]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT]]
+; CHECK:       loop_exit:
+; CHECK-NEXT:    [[DOTUS_PHI:%.*]] = phi i32 [ [[A_LCSSA]], [[LOOP_EXIT_SPLIT]] ], [ [[A_LCSSA_US]], [[LOOP_EXIT_SPLIT_US]] ]
+; CHECK-NEXT:    ret i32 [[DOTUS_PHI]]
+;
+entry:
+  br label %loop_begin
+
+loop_begin:
+  %a = load i32, i32* %a.ptr
+  %v1 = load i1, i1* %ptr
+  br i1 %v1, label %loop_a, label %loop_b
+
+loop_a:
+  %v2 = load i1, i1* %ptr
+  br i1 %v2, label %loop_begin, label %loop_exit
+
+loop_b:
+; Invariant branch: true goes back to the header, false leaves the loop.
+  br i1 %cond, label %loop_begin, label %loop_exit
+; Expected output, cloned (.us) loop: loop_b.us only branches to the backedge
+; block, so the cloned loop is left with the single exit from loop_a.us.
+
+; Expected output, original loop: loop_b loses its backedge and becomes a
+; direct exit to loop_exit.split, and the exit from loop_a goes through the
+; dedicated block loop_exit.split.loopexit.
+
+loop_exit:
+  %a.lcssa = phi i32 [ %a, %loop_a ], [ %a, %loop_b ]
+  ret i32 %a.lcssa
+}
+
+; Check that if a cloned inner loop after unswitching doesn't loop and directly
+; exits even an outer loop, we don't add the cloned preheader to the outer
+; loop and do add the needed LCSSA phi nodes for the new exit block from the
+; outer loop.
+; %cond is loaded in the inner loop's preheader (inner_loop_ph), so the
+; expected output places the freeze and the unswitched branch in that block.
+define i32 @test11a(i1* %ptr, i1* %cond.ptr, i32* %a.ptr, i32* %b.ptr) {
+; CHECK-LABEL: @test11a(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP_BEGIN:%.*]]
+; CHECK:       loop_begin:
+; CHECK-NEXT:    [[B:%.*]] = load i32, i32* [[B_PTR:%.*]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = load i1, i1* [[PTR:%.*]], align 1
+; CHECK-NEXT:    br i1 [[V1]], label [[LOOP_LATCH:%.*]], label [[INNER_LOOP_PH:%.*]]
+; CHECK:       inner_loop_ph:
+; CHECK-NEXT:    [[COND:%.*]] = load i1, i1* [[COND_PTR:%.*]], align 1
+; CHECK-NEXT:    [[COND_FR:%.*]] = freeze i1 [[COND]]
+; CHECK-NEXT:    br i1 [[COND_FR]], label [[INNER_LOOP_PH_SPLIT_US:%.*]], label [[INNER_LOOP_PH_SPLIT:%.*]]
+; CHECK:       inner_loop_ph.split.us:
+; CHECK-NEXT:    [[B_LCSSA:%.*]] = phi i32 [ [[B]], [[INNER_LOOP_PH]] ]
+; CHECK-NEXT:    br label [[INNER_LOOP_BEGIN_US:%.*]]
+; CHECK:       inner_loop_begin.us:
+; CHECK-NEXT:    call void @sink1(i32 [[B_LCSSA]])
+; CHECK-NEXT:    [[A_US:%.*]] = load i32, i32* [[A_PTR:%.*]], align 4
+; CHECK-NEXT:    br label [[LOOP_EXIT_LOOPEXIT_SPLIT_US:%.*]]
+; CHECK:       loop_exit.loopexit.split.us:
+; CHECK-NEXT:    [[A_LCSSA2_US:%.*]] = phi i32 [ [[A_US]], [[INNER_LOOP_BEGIN_US]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT_LOOPEXIT:%.*]]
+; CHECK:       inner_loop_ph.split:
+; CHECK-NEXT:    br label [[INNER_LOOP_BEGIN:%.*]]
+; CHECK:       inner_loop_begin:
+; CHECK-NEXT:    call void @sink1(i32 [[B]])
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* [[A_PTR]], align 4
+; CHECK-NEXT:    br label [[INNER_LOOP_A:%.*]]
+; CHECK:       inner_loop_a:
+; CHECK-NEXT:    [[V2:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V2]], label [[INNER_LOOP_EXIT:%.*]], label [[INNER_LOOP_BEGIN]]
+; CHECK:       inner_loop_exit:
+; CHECK-NEXT:    [[A_INNER_LCSSA:%.*]] = phi i32 [ [[A]], [[INNER_LOOP_A]] ]
+; CHECK-NEXT:    [[V3:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V3]], label [[LOOP_LATCH]], label [[LOOP_EXIT_LOOPEXIT1:%.*]]
+; CHECK:       loop_latch:
+; CHECK-NEXT:    br label [[LOOP_BEGIN]]
+; CHECK:       loop_exit.loopexit:
+; CHECK-NEXT:    br label [[LOOP_EXIT:%.*]]
+; CHECK:       loop_exit.loopexit1:
+; CHECK-NEXT:    [[A_INNER_LCSSA_LCSSA:%.*]] = phi i32 [ [[A_INNER_LCSSA]], [[INNER_LOOP_EXIT]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT]]
+; CHECK:       loop_exit:
+; CHECK-NEXT:    [[A_LCSSA:%.*]] = phi i32 [ [[A_LCSSA2_US]], [[LOOP_EXIT_LOOPEXIT]] ], [ [[A_INNER_LCSSA_LCSSA]], [[LOOP_EXIT_LOOPEXIT1]] ]
+; CHECK-NEXT:    ret i32 [[A_LCSSA]]
+;
+entry:
+  br label %loop_begin
+
+loop_begin:
+  %b = load i32, i32* %b.ptr
+  %v1 = load i1, i1* %ptr
+  br i1 %v1, label %loop_latch, label %inner_loop_ph
+
+inner_loop_ph:
+  %cond = load i1, i1* %cond.ptr
+  br label %inner_loop_begin
+
+inner_loop_begin:
+  call void @sink1(i32 %b)
+  %a = load i32, i32* %a.ptr
+; Branch on a value defined outside the inner loop: true leaves both loops via
+; loop_exit, false continues in the inner loop.
+  br i1 %cond, label %loop_exit, label %inner_loop_a
+
+inner_loop_a:
+  %v2 = load i1, i1* %ptr
+  br i1 %v2, label %inner_loop_exit, label %inner_loop_begin
+; Expected output: the cloned (.us) path doesn't actually loop and is an exit
+; from the outer loop as well (inner_loop_begin.us branches straight to
+; loop_exit.loopexit.split.us).
+; The original remains a loop losing the exit edge.
+
+inner_loop_exit:
+  %a.inner_lcssa = phi i32 [ %a, %inner_loop_a ]
+  %v3 = load i1, i1* %ptr
+  br i1 %v3, label %loop_latch, label %loop_exit
+
+loop_latch:
+  br label %loop_begin
+
+loop_exit:
+  %a.lcssa = phi i32 [ %a, %inner_loop_begin ], [ %a.inner_lcssa, %inner_loop_exit ]
+  ret i32 %a.lcssa
+}
+
+; Check that if the original inner loop after unswitching doesn't loop and
+; directly exits even an outer loop, we remove the original preheader from the
+; outer loop and add needed LCSSA phi nodes for the new exit block from the
+; outer loop.
+; Same input as test11a, but with the successors of the branch on %cond
+; swapped, so the roles of the cloned (.us) and original paths are exchanged.
+define i32 @test11b(i1* %ptr, i1* %cond.ptr, i32* %a.ptr, i32* %b.ptr) {
+; CHECK-LABEL: @test11b(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP_BEGIN:%.*]]
+; CHECK:       loop_begin:
+; CHECK-NEXT:    [[B:%.*]] = load i32, i32* [[B_PTR:%.*]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = load i1, i1* [[PTR:%.*]], align 1
+; CHECK-NEXT:    br i1 [[V1]], label [[LOOP_LATCH:%.*]], label [[INNER_LOOP_PH:%.*]]
+; CHECK:       inner_loop_ph:
+; CHECK-NEXT:    [[COND:%.*]] = load i1, i1* [[COND_PTR:%.*]], align 1
+; CHECK-NEXT:    [[COND_FR:%.*]] = freeze i1 [[COND]]
+; CHECK-NEXT:    br i1 [[COND_FR]], label [[INNER_LOOP_PH_SPLIT_US:%.*]], label [[INNER_LOOP_PH_SPLIT:%.*]]
+; CHECK:       inner_loop_ph.split.us:
+; CHECK-NEXT:    br label [[INNER_LOOP_BEGIN_US:%.*]]
+; CHECK:       inner_loop_begin.us:
+; CHECK-NEXT:    call void @sink1(i32 [[B]])
+; CHECK-NEXT:    [[A_US:%.*]] = load i32, i32* [[A_PTR:%.*]], align 4
+; CHECK-NEXT:    br label [[INNER_LOOP_A_US:%.*]]
+; CHECK:       inner_loop_a.us:
+; CHECK-NEXT:    [[V2_US:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V2_US]], label [[INNER_LOOP_EXIT_SPLIT_US:%.*]], label [[INNER_LOOP_BEGIN_US]]
+; CHECK:       inner_loop_exit.split.us:
+; CHECK-NEXT:    [[A_INNER_LCSSA_US:%.*]] = phi i32 [ [[A_US]], [[INNER_LOOP_A_US]] ]
+; CHECK-NEXT:    br label [[INNER_LOOP_EXIT:%.*]]
+; CHECK:       inner_loop_ph.split:
+; CHECK-NEXT:    [[B_LCSSA:%.*]] = phi i32 [ [[B]], [[INNER_LOOP_PH]] ]
+; CHECK-NEXT:    br label [[INNER_LOOP_BEGIN:%.*]]
+; CHECK:       inner_loop_begin:
+; CHECK-NEXT:    call void @sink1(i32 [[B_LCSSA]])
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* [[A_PTR]], align 4
+; CHECK-NEXT:    br label [[LOOP_EXIT_LOOPEXIT:%.*]]
+; CHECK:       inner_loop_exit:
+; CHECK-NEXT:    [[V3:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V3]], label [[LOOP_LATCH]], label [[LOOP_EXIT_LOOPEXIT1:%.*]]
+; CHECK:       loop_latch:
+; CHECK-NEXT:    br label [[LOOP_BEGIN]]
+; CHECK:       loop_exit.loopexit:
+; CHECK-NEXT:    [[A_LCSSA2:%.*]] = phi i32 [ [[A]], [[INNER_LOOP_BEGIN]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT:%.*]]
+; CHECK:       loop_exit.loopexit1:
+; CHECK-NEXT:    [[A_INNER_LCSSA_LCSSA:%.*]] = phi i32 [ [[A_INNER_LCSSA_US]], [[INNER_LOOP_EXIT]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT]]
+; CHECK:       loop_exit:
+; CHECK-NEXT:    [[A_LCSSA:%.*]] = phi i32 [ [[A_LCSSA2]], [[LOOP_EXIT_LOOPEXIT]] ], [ [[A_INNER_LCSSA_LCSSA]], [[LOOP_EXIT_LOOPEXIT1]] ]
+; CHECK-NEXT:    ret i32 [[A_LCSSA]]
+;
+entry:
+  br label %loop_begin
+
+loop_begin:
+  %b = load i32, i32* %b.ptr
+  %v1 = load i1, i1* %ptr
+  br i1 %v1, label %loop_latch, label %inner_loop_ph
+
+inner_loop_ph:
+  %cond = load i1, i1* %cond.ptr
+  br label %inner_loop_begin
+
+inner_loop_begin:
+  call void @sink1(i32 %b)
+  %a = load i32, i32* %a.ptr
+; Branch on a value defined outside the inner loop: true continues in the
+; inner loop, false leaves both loops via loop_exit.
+  br i1 %cond, label %inner_loop_a, label %loop_exit
+
+inner_loop_a:
+  %v2 = load i1, i1* %ptr
+  br i1 %v2, label %inner_loop_exit, label %inner_loop_begin
+; Expected output: the cloned (.us) path continues to loop without the exit
+; out of the entire nest.
+; The original path no longer loops: inner_loop_begin branches straight to
+; loop_exit.loopexit, leaving the entire nest.
+
+inner_loop_exit:
+  %a.inner_lcssa = phi i32 [ %a, %inner_loop_a ]
+  %v3 = load i1, i1* %ptr
+  br i1 %v3, label %loop_latch, label %loop_exit
+
+loop_latch:
+  br label %loop_begin
+
+loop_exit:
+  %a.lcssa = phi i32 [ %a, %inner_loop_begin ], [ %a.inner_lcssa, %inner_loop_exit ]
+  ret i32 %a.lcssa
+}
+
+; Like test11a, but checking that when the whole thing is wrapped in yet
+; another loop, we correctly attribute the cloned preheader to that outermost
+; loop rather than only handling the case where the preheader is not in any loop
+; at all.
+; The nest here is three deep: loop_begin (outermost), inner_loop_begin
+; (middle) and inner_inner_loop_begin (innermost, the one being unswitched).
+define i32 @test12a(i1* %ptr, i1* %cond.ptr, i32* %a.ptr, i32* %b.ptr) {
+; CHECK-LABEL: @test12a(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP_BEGIN:%.*]]
+; CHECK:       loop_begin:
+; CHECK-NEXT:    br label [[INNER_LOOP_BEGIN:%.*]]
+; CHECK:       inner_loop_begin:
+; CHECK-NEXT:    [[B:%.*]] = load i32, i32* [[B_PTR:%.*]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = load i1, i1* [[PTR:%.*]], align 1
+; CHECK-NEXT:    br i1 [[V1]], label [[INNER_LOOP_LATCH:%.*]], label [[INNER_INNER_LOOP_PH:%.*]]
+; CHECK:       inner_inner_loop_ph:
+; CHECK-NEXT:    [[COND:%.*]] = load i1, i1* [[COND_PTR:%.*]], align 1
+; CHECK-NEXT:    [[COND_FR:%.*]] = freeze i1 [[COND]]
+; CHECK-NEXT:    br i1 [[COND_FR]], label [[INNER_INNER_LOOP_PH_SPLIT_US:%.*]], label [[INNER_INNER_LOOP_PH_SPLIT:%.*]]
+; CHECK:       inner_inner_loop_ph.split.us:
+; CHECK-NEXT:    [[B_LCSSA:%.*]] = phi i32 [ [[B]], [[INNER_INNER_LOOP_PH]] ]
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_BEGIN_US:%.*]]
+; CHECK:       inner_inner_loop_begin.us:
+; CHECK-NEXT:    call void @sink1(i32 [[B_LCSSA]])
+; CHECK-NEXT:    [[A_US:%.*]] = load i32, i32* [[A_PTR:%.*]], align 4
+; CHECK-NEXT:    br label [[INNER_LOOP_EXIT_LOOPEXIT_SPLIT_US:%.*]]
+; CHECK:       inner_loop_exit.loopexit.split.us:
+; CHECK-NEXT:    [[A_LCSSA2_US:%.*]] = phi i32 [ [[A_US]], [[INNER_INNER_LOOP_BEGIN_US]] ]
+; CHECK-NEXT:    br label [[INNER_LOOP_EXIT_LOOPEXIT:%.*]]
+; CHECK:       inner_inner_loop_ph.split:
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_BEGIN:%.*]]
+; CHECK:       inner_inner_loop_begin:
+; CHECK-NEXT:    call void @sink1(i32 [[B]])
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* [[A_PTR]], align 4
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_A:%.*]]
+; CHECK:       inner_inner_loop_a:
+; CHECK-NEXT:    [[V2:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V2]], label [[INNER_INNER_LOOP_EXIT:%.*]], label [[INNER_INNER_LOOP_BEGIN]]
+; CHECK:       inner_inner_loop_exit:
+; CHECK-NEXT:    [[A_INNER_INNER_LCSSA:%.*]] = phi i32 [ [[A]], [[INNER_INNER_LOOP_A]] ]
+; CHECK-NEXT:    [[V3:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V3]], label [[INNER_LOOP_LATCH]], label [[INNER_LOOP_EXIT_LOOPEXIT1:%.*]]
+; CHECK:       inner_loop_latch:
+; CHECK-NEXT:    br label [[INNER_LOOP_BEGIN]]
+; CHECK:       inner_loop_exit.loopexit:
+; CHECK-NEXT:    br label [[INNER_LOOP_EXIT:%.*]]
+; CHECK:       inner_loop_exit.loopexit1:
+; CHECK-NEXT:    [[A_INNER_INNER_LCSSA_LCSSA:%.*]] = phi i32 [ [[A_INNER_INNER_LCSSA]], [[INNER_INNER_LOOP_EXIT]] ]
+; CHECK-NEXT:    br label [[INNER_LOOP_EXIT]]
+; CHECK:       inner_loop_exit:
+; CHECK-NEXT:    [[A_INNER_LCSSA:%.*]] = phi i32 [ [[A_LCSSA2_US]], [[INNER_LOOP_EXIT_LOOPEXIT]] ], [ [[A_INNER_INNER_LCSSA_LCSSA]], [[INNER_LOOP_EXIT_LOOPEXIT1]] ]
+; CHECK-NEXT:    [[V4:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V4]], label [[LOOP_BEGIN]], label [[LOOP_EXIT:%.*]]
+; CHECK:       loop_exit:
+; CHECK-NEXT:    [[A_LCSSA:%.*]] = phi i32 [ [[A_INNER_LCSSA]], [[INNER_LOOP_EXIT]] ]
+; CHECK-NEXT:    ret i32 [[A_LCSSA]]
+;
+entry:
+  br label %loop_begin
+
+loop_begin:
+  br label %inner_loop_begin
+
+inner_loop_begin:
+  %b = load i32, i32* %b.ptr
+  %v1 = load i1, i1* %ptr
+  br i1 %v1, label %inner_loop_latch, label %inner_inner_loop_ph
+
+inner_inner_loop_ph:
+  %cond = load i1, i1* %cond.ptr
+  br label %inner_inner_loop_begin
+
+inner_inner_loop_begin:
+  call void @sink1(i32 %b)
+  %a = load i32, i32* %a.ptr
+; Branch on a value defined outside the innermost loop: true leaves the
+; innermost and middle loops via inner_loop_exit, false continues in the
+; innermost loop.
+  br i1 %cond, label %inner_loop_exit, label %inner_inner_loop_a
+
+inner_inner_loop_a:
+  %v2 = load i1, i1* %ptr
+  br i1 %v2, label %inner_inner_loop_exit, label %inner_inner_loop_begin
+; Expected output: the cloned (.us) path doesn't actually loop and is an exit
+; from the middle loop as well; it lands in inner_loop_exit, which is still
+; part of the outermost loop.
+; The original remains a loop losing the exit edge.
+
+inner_inner_loop_exit:
+  %a.inner_inner_lcssa = phi i32 [ %a, %inner_inner_loop_a ]
+  %v3 = load i1, i1* %ptr
+  br i1 %v3, label %inner_loop_latch, label %inner_loop_exit
+
+inner_loop_latch:
+  br label %inner_loop_begin
+
+inner_loop_exit:
+  %a.inner_lcssa = phi i32 [ %a, %inner_inner_loop_begin ], [ %a.inner_inner_lcssa, %inner_inner_loop_exit ]
+  %v4 = load i1, i1* %ptr
+  br i1 %v4, label %loop_begin, label %loop_exit
+
+loop_exit:
+  %a.lcssa = phi i32 [ %a.inner_lcssa, %inner_loop_exit ]
+  ret i32 %a.lcssa
+}
+
+; Like test11b, but checking that when the whole thing is wrapped in yet
+; another loop, we correctly sink the preheader to the outermost loop rather
+; than only handling the case where the preheader is completely removed from
+; a loop.
+; The nest here is three deep: loop_begin (outermost), inner_loop_begin
+; (middle) and inner_inner_loop_begin (innermost, the one being unswitched).
+define i32 @test12b(i1* %ptr, i1* %cond.ptr, i32* %a.ptr, i32* %b.ptr) {
+; CHECK-LABEL: @test12b(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[LOOP_BEGIN:%.*]]
+; CHECK:       loop_begin:
+; CHECK-NEXT:    br label [[INNER_LOOP_BEGIN:%.*]]
+; CHECK:       inner_loop_begin:
+; CHECK-NEXT:    [[B:%.*]] = load i32, i32* [[B_PTR:%.*]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = load i1, i1* [[PTR:%.*]], align 1
+; CHECK-NEXT:    br i1 [[V1]], label [[INNER_LOOP_LATCH:%.*]], label [[INNER_INNER_LOOP_PH:%.*]]
+; CHECK:       inner_inner_loop_ph:
+; CHECK-NEXT:    [[COND:%.*]] = load i1, i1* [[COND_PTR:%.*]], align 1
+; CHECK-NEXT:    [[COND_FR:%.*]] = freeze i1 [[COND]]
+; CHECK-NEXT:    br i1 [[COND_FR]], label [[INNER_INNER_LOOP_PH_SPLIT_US:%.*]], label [[INNER_INNER_LOOP_PH_SPLIT:%.*]]
+; CHECK:       inner_inner_loop_ph.split.us:
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_BEGIN_US:%.*]]
+; CHECK:       inner_inner_loop_begin.us:
+; CHECK-NEXT:    call void @sink1(i32 [[B]])
+; CHECK-NEXT:    [[A_US:%.*]] = load i32, i32* [[A_PTR:%.*]], align 4
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_A_US:%.*]]
+; CHECK:       inner_inner_loop_a.us:
+; CHECK-NEXT:    [[V2_US:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V2_US]], label [[INNER_INNER_LOOP_EXIT_SPLIT_US:%.*]], label [[INNER_INNER_LOOP_BEGIN_US]]
+; CHECK:       inner_inner_loop_exit.split.us:
+; CHECK-NEXT:    [[A_INNER_INNER_LCSSA_US:%.*]] = phi i32 [ [[A_US]], [[INNER_INNER_LOOP_A_US]] ]
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_EXIT:%.*]]
+; CHECK:       inner_inner_loop_ph.split:
+; CHECK-NEXT:    [[B_LCSSA:%.*]] = phi i32 [ [[B]], [[INNER_INNER_LOOP_PH]] ]
+; CHECK-NEXT:    br label [[INNER_INNER_LOOP_BEGIN:%.*]]
+; CHECK:       inner_inner_loop_begin:
+; CHECK-NEXT:    call void @sink1(i32 [[B_LCSSA]])
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* [[A_PTR]], align 4
+; CHECK-NEXT:    br label [[INNER_LOOP_EXIT_LOOPEXIT:%.*]]
+; CHECK:       inner_inner_loop_exit:
+; CHECK-NEXT:    [[V3:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V3]], label [[INNER_LOOP_LATCH]], label [[INNER_LOOP_EXIT_LOOPEXIT1:%.*]]
+; CHECK:       inner_loop_latch:
+; CHECK-NEXT:    br label [[INNER_LOOP_BEGIN]]
+; CHECK:       inner_loop_exit.loopexit:
+; CHECK-NEXT:    [[A_LCSSA2:%.*]] = phi i32 [ [[A]], [[INNER_INNER_LOOP_BEGIN]] ]
+; CHECK-NEXT:    br label [[INNER_LOOP_EXIT:%.*]]
+; CHECK:       inner_loop_exit.loopexit1:
+; CHECK-NEXT:    [[A_INNER_INNER_LCSSA_LCSSA:%.*]] = phi i32 [ [[A_INNER_INNER_LCSSA_US]], [[INNER_INNER_LOOP_EXIT]] ]
+; CHECK-NEXT:    br label [[INNER_LOOP_EXIT]]
+; CHECK:       inner_loop_exit:
+; CHECK-NEXT:    [[A_INNER_LCSSA:%.*]] = phi i32 [ [[A_LCSSA2]], [[INNER_LOOP_EXIT_LOOPEXIT]] ], [ [[A_INNER_INNER_LCSSA_LCSSA]], [[INNER_LOOP_EXIT_LOOPEXIT1]] ]
+; CHECK-NEXT:    [[V4:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V4]], label [[LOOP_BEGIN]], label [[LOOP_EXIT:%.*]]
+; CHECK:       loop_exit:
+; CHECK-NEXT:    [[A_LCSSA:%.*]] = phi i32 [ [[A_INNER_LCSSA]], [[INNER_LOOP_EXIT]] ]
+; CHECK-NEXT:    ret i32 [[A_LCSSA]]
+;
+entry:
+  br label %loop_begin
+
+loop_begin:
+  br label %inner_loop_begin
+
+inner_loop_begin:
+  %b = load i32, i32* %b.ptr
+  %v1 = load i1, i1* %ptr
+  br i1 %v1, label %inner_loop_latch, label %inner_inner_loop_ph
+
+inner_inner_loop_ph:
+  %cond = load i1, i1* %cond.ptr
+  br label %inner_inner_loop_begin
+
+inner_inner_loop_begin:
+  call void @sink1(i32 %b)
+  %a = load i32, i32* %a.ptr
+; Branch on a value defined outside the innermost loop: true continues in the
+; innermost loop, false leaves the innermost and middle loops via
+; inner_loop_exit.
+  br i1 %cond, label %inner_inner_loop_a, label %inner_loop_exit
+
+inner_inner_loop_a:
+  %v2 = load i1, i1* %ptr
+  br i1 %v2, label %inner_inner_loop_exit, label %inner_inner_loop_begin
+; Expected output: the cloned (.us) path continues to loop without the direct
+; exit to inner_loop_exit.
+; The original path no longer loops: inner_inner_loop_begin branches straight
+; to inner_loop_exit.loopexit, which is still part of the outermost loop.
+
+inner_inner_loop_exit:
+  %a.inner_inner_lcssa = phi i32 [ %a, %inner_inner_loop_a ]
+  %v3 = load i1, i1* %ptr
+  br i1 %v3, label %inner_loop_latch, label %inner_loop_exit
+
+inner_loop_latch:
+  br label %inner_loop_begin
+
+inner_loop_exit:
+  %a.inner_lcssa = phi i32 [ %a, %inner_inner_loop_begin ], [ %a.inner_inner_lcssa, %inner_inner_loop_exit ]
+  %v4 = load i1, i1* %ptr
+  br i1 %v4, label %loop_begin, label %loop_exit
+
+loop_exit:
+  %a.lcssa = phi i32 [ %a.inner_lcssa, %inner_loop_exit ]
+  ret i32 %a.lcssa
+}
+
+; Test where the cloned loop has an inner loop that has to be traversed to form
+; the cloned loop, and where this inner loop has multiple blocks, and where the
+; exiting block that connects the inner loop to the cloned loop is not the header
+; block. This ensures that we correctly handle interesting corner cases of
+; traversing back to the header when establishing the cloned loop.
+; The invariant branch on the argument %cond sits in loop_b and guards entry
+; into the inner loop (loop_b_inner_*).
+define i32 @test13a(i1* %ptr, i1 %cond, i32* %a.ptr, i32* %b.ptr) {
+; CHECK-LABEL: @test13a(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND_FR:%.*]] = freeze i1 [[COND:%.*]]
+; CHECK-NEXT:    br i1 [[COND_FR]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split.us:
+; CHECK-NEXT:    br label [[LOOP_BEGIN_US:%.*]]
+; CHECK:       loop_begin.us:
+; CHECK-NEXT:    [[A_US:%.*]] = load i32, i32* [[A_PTR:%.*]], align 4
+; CHECK-NEXT:    [[V1_US:%.*]] = load i1, i1* [[PTR:%.*]], align 1
+; CHECK-NEXT:    br i1 [[V1_US]], label [[LOOP_A_US:%.*]], label [[LOOP_B_US:%.*]]
+; CHECK:       loop_b.us:
+; CHECK-NEXT:    [[B_US:%.*]] = load i32, i32* [[B_PTR:%.*]], align 4
+; CHECK-NEXT:    br label [[LOOP_B_INNER_PH_US:%.*]]
+; CHECK:       loop_b_inner_ph.us:
+; CHECK-NEXT:    br label [[LOOP_B_INNER_HEADER_US:%.*]]
+; CHECK:       loop_b_inner_header.us:
+; CHECK-NEXT:    [[V3_US:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V3_US]], label [[LOOP_B_INNER_LATCH_US:%.*]], label [[LOOP_B_INNER_BODY_US:%.*]]
+; CHECK:       loop_b_inner_body.us:
+; CHECK-NEXT:    [[V4_US:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V4_US]], label [[LOOP_B_INNER_LATCH_US]], label [[LOOP_B_INNER_EXIT_US:%.*]]
+; CHECK:       loop_b_inner_exit.us:
+; CHECK-NEXT:    br label [[LOOP_LATCH_US:%.*]]
+; CHECK:       loop_b_inner_latch.us:
+; CHECK-NEXT:    br label [[LOOP_B_INNER_HEADER_US]]
+; CHECK:       loop_a.us:
+; CHECK-NEXT:    [[V2_US:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V2_US]], label [[LOOP_EXIT_SPLIT_US:%.*]], label [[LOOP_LATCH_US]]
+; CHECK:       loop_latch.us:
+; CHECK-NEXT:    br label [[LOOP_BEGIN_US]]
+; CHECK:       loop_exit.split.us:
+; CHECK-NEXT:    [[LCSSA_US:%.*]] = phi i32 [ [[A_US]], [[LOOP_A_US]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT:%.*]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[LOOP_BEGIN:%.*]]
+; CHECK:       loop_begin:
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* [[A_PTR]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V1]], label [[LOOP_A:%.*]], label [[LOOP_B:%.*]]
+; CHECK:       loop_a:
+; CHECK-NEXT:    [[V2:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V2]], label [[LOOP_EXIT_SPLIT_LOOPEXIT:%.*]], label [[LOOP_LATCH:%.*]]
+; CHECK:       loop_b:
+; CHECK-NEXT:    [[B:%.*]] = load i32, i32* [[B_PTR]], align 4
+; CHECK-NEXT:    br label [[LOOP_EXIT_SPLIT:%.*]]
+; CHECK:       loop_latch:
+; CHECK-NEXT:    br label [[LOOP_BEGIN]]
+; CHECK:       loop_exit.split.loopexit:
+; CHECK-NEXT:    [[LCSSA_PH:%.*]] = phi i32 [ [[A]], [[LOOP_A]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT_SPLIT]]
+; CHECK:       loop_exit.split:
+; CHECK-NEXT:    [[LCSSA:%.*]] = phi i32 [ [[B]], [[LOOP_B]] ], [ [[LCSSA_PH]], [[LOOP_EXIT_SPLIT_LOOPEXIT]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT]]
+; CHECK:       loop_exit:
+; CHECK-NEXT:    [[DOTUS_PHI:%.*]] = phi i32 [ [[LCSSA]], [[LOOP_EXIT_SPLIT]] ], [ [[LCSSA_US]], [[LOOP_EXIT_SPLIT_US]] ]
+; CHECK-NEXT:    ret i32 [[DOTUS_PHI]]
+;
+entry:
+  br label %loop_begin
+
+loop_begin:
+  %a = load i32, i32* %a.ptr
+  %v1 = load i1, i1* %ptr
+  br i1 %v1, label %loop_a, label %loop_b
+
+loop_a:
+  %v2 = load i1, i1* %ptr
+  br i1 %v2, label %loop_exit, label %loop_latch
+
+loop_b:
+  %b = load i32, i32* %b.ptr
+; Invariant branch: true enters the inner loop, false leaves the outer loop.
+  br i1 %cond, label %loop_b_inner_ph, label %loop_exit
+
+loop_b_inner_ph:
+  br label %loop_b_inner_header
+
+loop_b_inner_header:
+  %v3 = load i1, i1* %ptr
+  br i1 %v3, label %loop_b_inner_latch, label %loop_b_inner_body
+
+loop_b_inner_body:
+  %v4 = load i1, i1* %ptr
+; The inner loop is left from this block, not from its header.
+  br i1 %v4, label %loop_b_inner_latch, label %loop_b_inner_exit
+
+loop_b_inner_latch:
+  br label %loop_b_inner_header
+
+loop_b_inner_exit:
+  br label %loop_latch
+
+loop_latch:
+  br label %loop_begin
+; Expected output: the cloned (.us) loop contains an inner loop within it.
+; And the original loop no longer contains an inner loop; its loop_b is a
+; direct exit to loop_exit.split.
+
+loop_exit:
+  %lcssa = phi i32 [ %a, %loop_a ], [ %b, %loop_b ]
+  ret i32 %lcssa
+}
+
+; Test where the original loop has an inner loop that has to be traversed to
+; rebuild the loop, and where this inner loop has multiple blocks, and where
+; the exiting block that connects the inner loop to the original loop is not
+; the header block. This ensures that we correctly handle interesting corner
+; cases of traversing back to the header when re-establishing that the original
+; loop still exists after unswitching.
+define i32 @test13b(i1* %ptr, i1 %cond, i32* %a.ptr, i32* %b.ptr) {
+; CHECK-LABEL: @test13b(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND_FR:%.*]] = freeze i1 [[COND:%.*]]
+; CHECK-NEXT:    br i1 [[COND_FR]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split.us:
+; CHECK-NEXT:    br label [[LOOP_BEGIN_US:%.*]]
+; CHECK:       loop_begin.us:
+; CHECK-NEXT:    [[A_US:%.*]] = load i32, i32* [[A_PTR:%.*]], align 4
+; CHECK-NEXT:    [[V1_US:%.*]] = load i1, i1* [[PTR:%.*]], align 1
+; CHECK-NEXT:    br i1 [[V1_US]], label [[LOOP_A_US:%.*]], label [[LOOP_B_US:%.*]]
+; CHECK:       loop_b.us:
+; CHECK-NEXT:    [[B_US:%.*]] = load i32, i32* [[B_PTR:%.*]], align 4
+; CHECK-NEXT:    br label [[LOOP_EXIT_SPLIT_US:%.*]]
+; CHECK:       loop_a.us:
+; CHECK-NEXT:    [[V2_US:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V2_US]], label [[LOOP_EXIT_SPLIT_US_LOOPEXIT:%.*]], label [[LOOP_LATCH_US:%.*]]
+; CHECK:       loop_latch.us:
+; CHECK-NEXT:    br label [[LOOP_BEGIN_US]]
+; CHECK:       loop_exit.split.us.loopexit:
+; CHECK-NEXT:    [[LCSSA_US_PH:%.*]] = phi i32 [ [[A_US]], [[LOOP_A_US]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT_SPLIT_US]]
+; CHECK:       loop_exit.split.us:
+; CHECK-NEXT:    [[LCSSA_US:%.*]] = phi i32 [ [[B_US]], [[LOOP_B_US]] ], [ [[LCSSA_US_PH]], [[LOOP_EXIT_SPLIT_US_LOOPEXIT]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT:%.*]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[LOOP_BEGIN:%.*]]
+; CHECK:       loop_begin:
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* [[A_PTR]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V1]], label [[LOOP_A:%.*]], label [[LOOP_B:%.*]]
+; CHECK:       loop_a:
+; CHECK-NEXT:    [[V2:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V2]], label [[LOOP_EXIT_SPLIT:%.*]], label [[LOOP_LATCH:%.*]]
+; CHECK:       loop_b:
+; CHECK-NEXT:    [[B:%.*]] = load i32, i32* [[B_PTR]], align 4
+; CHECK-NEXT:    br label [[LOOP_B_INNER_PH:%.*]]
+; CHECK:       loop_b_inner_ph:
+; CHECK-NEXT:    br label [[LOOP_B_INNER_HEADER:%.*]]
+; CHECK:       loop_b_inner_header:
+; CHECK-NEXT:    [[V3:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V3]], label [[LOOP_B_INNER_LATCH:%.*]], label [[LOOP_B_INNER_BODY:%.*]]
+; CHECK:       loop_b_inner_body:
+; CHECK-NEXT:    [[V4:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[V4]], label [[LOOP_B_INNER_LATCH]], label [[LOOP_B_INNER_EXIT:%.*]]
+; CHECK:       loop_b_inner_latch:
+; CHECK-NEXT:    br label [[LOOP_B_INNER_HEADER]]
+; CHECK:       loop_b_inner_exit:
+; CHECK-NEXT:    br label [[LOOP_LATCH]]
+; CHECK:       loop_latch:
+; CHECK-NEXT:    br label [[LOOP_BEGIN]]
+; CHECK:       loop_exit.split:
+; CHECK-NEXT:    [[LCSSA:%.*]] = phi i32 [ [[A]], [[LOOP_A]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT]]
+; CHECK:       loop_exit:
+; CHECK-NEXT:    [[DOTUS_PHI:%.*]] = phi i32 [ [[LCSSA]], [[LOOP_EXIT_SPLIT]] ], [ [[LCSSA_US]], [[LOOP_EXIT_SPLIT_US]] ]
+; CHECK-NEXT:    ret i32 [[DOTUS_PHI]]
+;
+entry:
+  br label %loop_begin
+
+loop_begin:
+  %a = load i32, i32* %a.ptr
+  %v1 = load i1, i1* %ptr
+  br i1 %v1, label %loop_a, label %loop_b
+
+loop_a:
+  %v2 = load i1, i1* %ptr
+  br i1 %v2, label %loop_exit, label %loop_latch
+
+loop_b:
+  %b = load i32, i32* %b.ptr
+  br i1 %cond, label %loop_exit, label %loop_b_inner_ph
+
+loop_b_inner_ph:
+  br label %loop_b_inner_header
+
+loop_b_inner_header:
+  %v3 = load i1, i1* %ptr
+  br i1 %v3, label %loop_b_inner_latch, label %loop_b_inner_body
+
+loop_b_inner_body:
+  %v4 = load i1, i1* %ptr
+  br i1 %v4, label %loop_b_inner_latch, label %loop_b_inner_exit
+
+loop_b_inner_latch:
+  br label %loop_b_inner_header
+
+loop_b_inner_exit:
+  br label %loop_latch
+
+loop_latch:
+  br label %loop_begin
+; The cloned loop (the `.us` blocks in the CHECK lines) has no inner loop.
+; The original loop keeps the loop_b_inner_* loop, which must be traversed.
+
+loop_exit:
+  %lcssa = phi i32 [ %a, %loop_a ], [ %b, %loop_b ]
+  ret i32 %lcssa
+}
+
+; A test reduced out of 400.perlbench that when unswitching the `%stop`
+; condition clones a loop nest outside of a containing loop. This exercises a
+; different cloning path from our other test cases and in turn verifying the
+; resulting structure can catch any failures to correctly clone these nested
+; loops.
+declare void @f()
+declare void @g()
+declare i32 @h(i32 %arg)
+
+; Test that when we are unswitching and need to rebuild the loop block set we
+; correctly skip past inner loops. We want to use the inner loop to efficiently
+; skip whole subregions of the outer loop blocks but just because the header of
+; the outer loop is also the preheader of an inner loop shouldn't confuse this
+; walk.
+define void @test23(i1 %arg, i1* %ptr) {
+; CHECK-LABEL: @test23(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARG_FR:%.*]] = freeze i1 [[ARG:%.*]]
+; CHECK-NEXT:    br i1 [[ARG_FR]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]]
+; CHECK:       entry.split.us:
+; CHECK-NEXT:    br label [[OUTER_HEADER_US:%.*]]
+; CHECK:       outer.header.us:
+; CHECK-NEXT:    br label [[INNER_HEADER_US:%.*]]
+; CHECK:       inner.header.us:
+; CHECK-NEXT:    call void @f()
+; CHECK-NEXT:    br label [[INNER_LATCH_US:%.*]]
+; CHECK:       inner.latch.us:
+; CHECK-NEXT:    [[INNER_COND_US:%.*]] = load i1, i1* [[PTR:%.*]], align 1
+; CHECK-NEXT:    br i1 [[INNER_COND_US]], label [[INNER_HEADER_US]], label [[OUTER_BODY_US:%.*]]
+; CHECK:       outer.body.us:
+; CHECK-NEXT:    br label [[OUTER_BODY_LEFT_US:%.*]]
+; CHECK:       outer.body.left.us:
+; CHECK-NEXT:    call void @f()
+; CHECK-NEXT:    br label [[OUTER_LATCH_US:%.*]]
+; CHECK:       outer.latch.us:
+; CHECK-NEXT:    [[OUTER_COND_US:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[OUTER_COND_US]], label [[OUTER_HEADER_US]], label [[EXIT_SPLIT_US:%.*]]
+; CHECK:       exit.split.us:
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer.header:
+; CHECK-NEXT:    br label [[INNER_HEADER:%.*]]
+; CHECK:       inner.header:
+; CHECK-NEXT:    call void @f()
+; CHECK-NEXT:    br label [[INNER_LATCH:%.*]]
+; CHECK:       inner.latch:
+; CHECK-NEXT:    [[INNER_COND:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[INNER_COND]], label [[INNER_HEADER]], label [[OUTER_BODY:%.*]]
+; CHECK:       outer.body:
+; CHECK-NEXT:    br label [[OUTER_BODY_RIGHT:%.*]]
+; CHECK:       outer.body.right:
+; CHECK-NEXT:    call void @g()
+; CHECK-NEXT:    br label [[OUTER_LATCH:%.*]]
+; CHECK:       outer.latch:
+; CHECK-NEXT:    [[OUTER_COND:%.*]] = load i1, i1* [[PTR]], align 1
+; CHECK-NEXT:    br i1 [[OUTER_COND]], label [[OUTER_HEADER]], label [[EXIT_SPLIT:%.*]]
+; CHECK:       exit.split:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %outer.header
+; The CHECK lines verify that we unswitched the correct bits: one version has
+; two calls to `@f`, the other a call to `@f` and then a call to `@g`.
+
+outer.header:
+  br label %inner.header
+
+inner.header:
+  call void @f()
+  br label %inner.latch
+
+inner.latch:
+  %inner.cond = load i1, i1* %ptr
+  br i1 %inner.cond, label %inner.header, label %outer.body
+
+outer.body:
+  br i1 %arg, label %outer.body.left, label %outer.body.right
+
+outer.body.left:
+  call void @f()
+  br label %outer.latch
+
+outer.body.right:
+  call void @g()
+  br label %outer.latch
+
+outer.latch:
+  %outer.cond = load i1, i1* %ptr
+  br i1 %outer.cond, label %outer.header, label %exit
+
+exit:
+  ret void
+}
+
+; A test case designed to exercise unusual properties of switches: they
+; can introduce multiple edges to successors. These need lots of special case
+; handling as they get collapsed in many cases (domtree, the unswitch itself)
+; but not in all cases (the PHI node operands).
+define i32 @test29(i32 %arg) {
+; CHECK-LABEL: @test29(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARG_FR:%.*]] = freeze i32 [[ARG:%.*]]
+; CHECK-NEXT:    switch i32 [[ARG_FR]], label [[ENTRY_SPLIT:%.*]] [
+; CHECK-NEXT:    i32 0, label [[ENTRY_SPLIT_US:%.*]]
+; CHECK-NEXT:    i32 1, label [[ENTRY_SPLIT_US]]
+; CHECK-NEXT:    i32 2, label [[ENTRY_SPLIT_US1:%.*]]
+; CHECK-NEXT:    i32 3, label [[ENTRY_SPLIT]]
+; CHECK-NEXT:    ]
+; CHECK:       entry.split.us:
+; CHECK-NEXT:    br label [[HEADER_US:%.*]]
+; CHECK:       header.us:
+; CHECK-NEXT:    [[TMP_US:%.*]] = call i32 @d()
+; CHECK-NEXT:    [[CMP1_US:%.*]] = icmp eq i32 [[TMP_US]], 0
+; CHECK-NEXT:    br i1 [[CMP1_US]], label [[BODY_A_US:%.*]], label [[DISPATCH_US:%.*]]
+; CHECK:       dispatch.us:
+; CHECK-NEXT:    br label [[BODY_A_US]]
+; CHECK:       body.a.us:
+; CHECK-NEXT:    [[TMP_A_PHI_US:%.*]] = phi i32 [ 0, [[HEADER_US]] ], [ [[TMP_US]], [[DISPATCH_US]] ]
+; CHECK-NEXT:    [[TMP_A_US:%.*]] = call i32 @a()
+; CHECK-NEXT:    [[TMP_A_SUM_US:%.*]] = add i32 [[TMP_A_PHI_US]], [[TMP_A_US]]
+; CHECK-NEXT:    br label [[BODY_B_US:%.*]]
+; CHECK:       body.b.us:
+; CHECK-NEXT:    [[TMP_B_PHI_US:%.*]] = phi i32 [ [[TMP_A_SUM_US]], [[BODY_A_US]] ]
+; CHECK-NEXT:    [[TMP_B_US:%.*]] = call i32 @b()
+; CHECK-NEXT:    [[TMP_B_SUM_US:%.*]] = add i32 [[TMP_B_PHI_US]], [[TMP_B_US]]
+; CHECK-NEXT:    br label [[BODY_C_US:%.*]]
+; CHECK:       body.c.us:
+; CHECK-NEXT:    [[TMP_C_PHI_US:%.*]] = phi i32 [ [[TMP_B_SUM_US]], [[BODY_B_US]] ]
+; CHECK-NEXT:    [[TMP_C_US:%.*]] = call i32 @c()
+; CHECK-NEXT:    [[TMP_C_SUM_US:%.*]] = add i32 [[TMP_C_PHI_US]], [[TMP_C_US]]
+; CHECK-NEXT:    br label [[LATCH_US:%.*]]
+; CHECK:       latch.us:
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp slt i32 [[TMP_C_SUM_US]], 42
+; CHECK-NEXT:    br i1 [[CMP2_US]], label [[HEADER_US]], label [[EXIT_SPLIT_US:%.*]]
+; CHECK:       exit.split.us:
+; CHECK-NEXT:    [[LCSSA_PHI_US:%.*]] = phi i32 [ [[TMP_C_SUM_US]], [[LATCH_US]] ]
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       entry.split.us1:
+; CHECK-NEXT:    br label [[HEADER_US2:%.*]]
+; CHECK:       header.us2:
+; CHECK-NEXT:    [[TMP_US3:%.*]] = call i32 @d()
+; CHECK-NEXT:    [[CMP1_US4:%.*]] = icmp eq i32 [[TMP_US3]], 0
+; CHECK-NEXT:    br i1 [[CMP1_US4]], label [[BODY_A_US6:%.*]], label [[DISPATCH_US5:%.*]]
+; CHECK:       dispatch.us5:
+; CHECK-NEXT:    br label [[BODY_B_US10:%.*]]
+; CHECK:       body.a.us6:
+; CHECK-NEXT:    [[TMP_A_PHI_US7:%.*]] = phi i32 [ 0, [[HEADER_US2]] ]
+; CHECK-NEXT:    [[TMP_A_US8:%.*]] = call i32 @a()
+; CHECK-NEXT:    [[TMP_A_SUM_US9:%.*]] = add i32 [[TMP_A_PHI_US7]], [[TMP_A_US8]]
+; CHECK-NEXT:    br label [[BODY_B_US10]]
+; CHECK:       body.b.us10:
+; CHECK-NEXT:    [[TMP_B_PHI_US11:%.*]] = phi i32 [ [[TMP_US3]], [[DISPATCH_US5]] ], [ [[TMP_A_SUM_US9]], [[BODY_A_US6]] ]
+; CHECK-NEXT:    [[TMP_B_US12:%.*]] = call i32 @b()
+; CHECK-NEXT:    [[TMP_B_SUM_US13:%.*]] = add i32 [[TMP_B_PHI_US11]], [[TMP_B_US12]]
+; CHECK-NEXT:    br label [[BODY_C_US14:%.*]]
+; CHECK:       body.c.us14:
+; CHECK-NEXT:    [[TMP_C_PHI_US15:%.*]] = phi i32 [ [[TMP_B_SUM_US13]], [[BODY_B_US10]] ]
+; CHECK-NEXT:    [[TMP_C_US16:%.*]] = call i32 @c()
+; CHECK-NEXT:    [[TMP_C_SUM_US17:%.*]] = add i32 [[TMP_C_PHI_US15]], [[TMP_C_US16]]
+; CHECK-NEXT:    br label [[LATCH_US18:%.*]]
+; CHECK:       latch.us18:
+; CHECK-NEXT:    [[CMP2_US19:%.*]] = icmp slt i32 [[TMP_C_SUM_US17]], 42
+; CHECK-NEXT:    br i1 [[CMP2_US19]], label [[HEADER_US2]], label [[EXIT_SPLIT_SPLIT_US:%.*]]
+; CHECK:       exit.split.split.us:
+; CHECK-NEXT:    [[LCSSA_PHI_US20:%.*]] = phi i32 [ [[TMP_C_SUM_US17]], [[LATCH_US18]] ]
+; CHECK-NEXT:    br label [[EXIT_SPLIT:%.*]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[TMP:%.*]] = call i32 @d()
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[TMP]], 0
+; CHECK-NEXT:    br i1 [[CMP1]], label [[BODY_A:%.*]], label [[DISPATCH:%.*]]
+; CHECK:       dispatch:
+; CHECK-NEXT:    br label [[BODY_C:%.*]]
+; CHECK:       body.a:
+; CHECK-NEXT:    [[TMP_A_PHI:%.*]] = phi i32 [ 0, [[HEADER]] ]
+; CHECK-NEXT:    [[TMP_A:%.*]] = call i32 @a()
+; CHECK-NEXT:    [[TMP_A_SUM:%.*]] = add i32 [[TMP_A_PHI]], [[TMP_A]]
+; CHECK-NEXT:    br label [[BODY_B:%.*]]
+; CHECK:       body.b:
+; CHECK-NEXT:    [[TMP_B_PHI:%.*]] = phi i32 [ [[TMP_A_SUM]], [[BODY_A]] ]
+; CHECK-NEXT:    [[TMP_B:%.*]] = call i32 @b()
+; CHECK-NEXT:    [[TMP_B_SUM:%.*]] = add i32 [[TMP_B_PHI]], [[TMP_B]]
+; CHECK-NEXT:    br label [[BODY_C]]
+; CHECK:       body.c:
+; CHECK-NEXT:    [[TMP_C_PHI:%.*]] = phi i32 [ [[TMP]], [[DISPATCH]] ], [ [[TMP_B_SUM]], [[BODY_B]] ]
+; CHECK-NEXT:    [[TMP_C:%.*]] = call i32 @c()
+; CHECK-NEXT:    [[TMP_C_SUM:%.*]] = add i32 [[TMP_C_PHI]], [[TMP_C]]
+; CHECK-NEXT:    br label [[LATCH:%.*]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[TMP_C_SUM]], 42
+; CHECK-NEXT:    br i1 [[CMP2]], label [[HEADER]], label [[EXIT_SPLIT_SPLIT:%.*]]
+; CHECK:       exit.split.split:
+; CHECK-NEXT:    [[LCSSA_PHI:%.*]] = phi i32 [ [[TMP_C_SUM]], [[LATCH]] ]
+; CHECK-NEXT:    br label [[EXIT_SPLIT]]
+; CHECK:       exit.split:
+; CHECK-NEXT:    [[DOTUS_PHI21:%.*]] = phi i32 [ [[LCSSA_PHI]], [[EXIT_SPLIT_SPLIT]] ], [ [[LCSSA_PHI_US20]], [[EXIT_SPLIT_SPLIT_US]] ]
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[DOTUS_PHI:%.*]] = phi i32 [ [[DOTUS_PHI21]], [[EXIT_SPLIT]] ], [ [[LCSSA_PHI_US]], [[EXIT_SPLIT_US]] ]
+; CHECK-NEXT:    ret i32 [[DOTUS_PHI]]
+;
+entry:
+  br label %header
+
+header:
+  %tmp = call i32 @d()
+  %cmp1 = icmp eq i32 %tmp, 0
+  ; We set up a chain through all the successors of the switch that doesn't
+  ; involve the switch so that we can have interesting PHI nodes in them.
+  br i1 %cmp1, label %body.a, label %dispatch
+
+dispatch:
+  ; Switch with multiple successors. We arrange the last successor to be the
+  ; default to make the test case easier to read. This has a duplicate edge
+  ; both to the default destination (which is completely superfluous but
+  ; technically valid IR) and to a regular successor.
+  switch i32 %arg, label %body.c [
+  i32 0, label %body.a
+  i32 1, label %body.a
+  i32 2, label %body.b
+  i32 3, label %body.c
+  ]
+
+body.a:
+  %tmp.a.phi = phi i32 [ 0, %header ], [ %tmp, %dispatch ], [ %tmp, %dispatch ]
+  %tmp.a = call i32 @a()
+  %tmp.a.sum = add i32 %tmp.a.phi, %tmp.a
+  br label %body.b
+; Unswitched 'a' loop: the `entry.split.us` CHECK lines (cases 0 and 1).
+
+body.b:
+  %tmp.b.phi = phi i32 [ %tmp, %dispatch ], [ %tmp.a.sum, %body.a ]
+  %tmp.b = call i32 @b()
+  %tmp.b.sum = add i32 %tmp.b.phi, %tmp.b
+  br label %body.c
+; Unswitched 'b' loop: the `entry.split.us1` CHECK lines (case 2).
+
+body.c:
+  %tmp.c.phi = phi i32 [ %tmp, %dispatch ], [ %tmp, %dispatch ], [ %tmp.b.sum, %body.b ]
+  %tmp.c = call i32 @c()
+  %tmp.c.sum = add i32 %tmp.c.phi, %tmp.c
+  br label %latch
+; Unswitched 'c' loop: the `entry.split` CHECK lines (default and case 3).
+
+latch:
+  %cmp2 = icmp slt i32 %tmp.c.sum, 42
+  br i1 %cmp2, label %header, label %exit
+
+exit:
+  %lcssa.phi = phi i32 [ %tmp.c.sum, %latch ]
+  ret i32 %lcssa.phi
+
+}
+
+; Similar to @test29 but designed to have one of the duplicate edges be
+; a loop exit edge as those can in some cases be special. Among other things,
+; this includes an LCSSA phi with multiple entries despite being a dedicated
+; exit block.
+define i32 @test30(i32 %arg) {
+; CHECK-LABEL: @test30(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ARG_FR:%.*]] = freeze i32 [[ARG:%.*]]
+; CHECK-NEXT:    switch i32 [[ARG_FR]], label [[ENTRY_SPLIT:%.*]] [
+; CHECK-NEXT:    i32 -1, label [[ENTRY_SPLIT]]
+; CHECK-NEXT:    i32 0, label [[ENTRY_SPLIT_US:%.*]]
+; CHECK-NEXT:    i32 1, label [[ENTRY_SPLIT_US1:%.*]]
+; CHECK-NEXT:    i32 2, label [[ENTRY_SPLIT_US1]]
+; CHECK-NEXT:    ]
+; CHECK:       entry.split.us:
+; CHECK-NEXT:    br label [[HEADER_US:%.*]]
+; CHECK:       header.us:
+; CHECK-NEXT:    [[TMP_US:%.*]] = call i32 @d()
+; CHECK-NEXT:    [[CMP1_US:%.*]] = icmp eq i32 [[TMP_US]], 0
+; CHECK-NEXT:    br i1 [[CMP1_US]], label [[BODY_A_US:%.*]], label [[DISPATCH_US:%.*]]
+; CHECK:       dispatch.us:
+; CHECK-NEXT:    br label [[BODY_A_US]]
+; CHECK:       body.a.us:
+; CHECK-NEXT:    [[TMP_A_PHI_US:%.*]] = phi i32 [ 0, [[HEADER_US]] ], [ [[TMP_US]], [[DISPATCH_US]] ]
+; CHECK-NEXT:    [[TMP_A_US:%.*]] = call i32 @a()
+; CHECK-NEXT:    [[TMP_A_SUM_US:%.*]] = add i32 [[TMP_A_PHI_US]], [[TMP_A_US]]
+; CHECK-NEXT:    br label [[BODY_B_US:%.*]]
+; CHECK:       body.b.us:
+; CHECK-NEXT:    [[TMP_B_PHI_US:%.*]] = phi i32 [ [[TMP_A_SUM_US]], [[BODY_A_US]] ]
+; CHECK-NEXT:    [[TMP_B_US:%.*]] = call i32 @b()
+; CHECK-NEXT:    [[TMP_B_SUM_US:%.*]] = add i32 [[TMP_B_PHI_US]], [[TMP_B_US]]
+; CHECK-NEXT:    br label [[LATCH_US:%.*]]
+; CHECK:       latch.us:
+; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp slt i32 [[TMP_B_SUM_US]], 42
+; CHECK-NEXT:    br i1 [[CMP2_US]], label [[HEADER_US]], label [[LOOP_EXIT2_SPLIT_US:%.*]]
+; CHECK:       loop.exit2.split.us:
+; CHECK-NEXT:    [[L2_PHI_US:%.*]] = phi i32 [ [[TMP_B_SUM_US]], [[LATCH_US]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT2:%.*]]
+; CHECK:       entry.split.us1:
+; CHECK-NEXT:    br label [[HEADER_US2:%.*]]
+; CHECK:       header.us2:
+; CHECK-NEXT:    [[TMP_US3:%.*]] = call i32 @d()
+; CHECK-NEXT:    [[CMP1_US4:%.*]] = icmp eq i32 [[TMP_US3]], 0
+; CHECK-NEXT:    br i1 [[CMP1_US4]], label [[BODY_A_US6:%.*]], label [[DISPATCH_US5:%.*]]
+; CHECK:       dispatch.us5:
+; CHECK-NEXT:    br label [[BODY_B_US10:%.*]]
+; CHECK:       body.a.us6:
+; CHECK-NEXT:    [[TMP_A_PHI_US7:%.*]] = phi i32 [ 0, [[HEADER_US2]] ]
+; CHECK-NEXT:    [[TMP_A_US8:%.*]] = call i32 @a()
+; CHECK-NEXT:    [[TMP_A_SUM_US9:%.*]] = add i32 [[TMP_A_PHI_US7]], [[TMP_A_US8]]
+; CHECK-NEXT:    br label [[BODY_B_US10]]
+; CHECK:       body.b.us10:
+; CHECK-NEXT:    [[TMP_B_PHI_US11:%.*]] = phi i32 [ [[TMP_US3]], [[DISPATCH_US5]] ], [ [[TMP_A_SUM_US9]], [[BODY_A_US6]] ]
+; CHECK-NEXT:    [[TMP_B_US12:%.*]] = call i32 @b()
+; CHECK-NEXT:    [[TMP_B_SUM_US13:%.*]] = add i32 [[TMP_B_PHI_US11]], [[TMP_B_US12]]
+; CHECK-NEXT:    br label [[LATCH_US14:%.*]]
+; CHECK:       latch.us14:
+; CHECK-NEXT:    [[CMP2_US15:%.*]] = icmp slt i32 [[TMP_B_SUM_US13]], 42
+; CHECK-NEXT:    br i1 [[CMP2_US15]], label [[HEADER_US2]], label [[LOOP_EXIT2_SPLIT_SPLIT_US:%.*]]
+; CHECK:       loop.exit2.split.split.us:
+; CHECK-NEXT:    [[L2_PHI_US16:%.*]] = phi i32 [ [[TMP_B_SUM_US13]], [[LATCH_US14]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT2_SPLIT:%.*]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    br label [[HEADER:%.*]]
+; CHECK:       header:
+; CHECK-NEXT:    [[TMP:%.*]] = call i32 @d()
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[TMP]], 0
+; CHECK-NEXT:    br i1 [[CMP1]], label [[BODY_A:%.*]], label [[DISPATCH:%.*]]
+; CHECK:       dispatch:
+; CHECK-NEXT:    [[TMP_LCSSA:%.*]] = phi i32 [ [[TMP]], [[HEADER]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT1:%.*]]
+; CHECK:       body.a:
+; CHECK-NEXT:    [[TMP_A_PHI:%.*]] = phi i32 [ 0, [[HEADER]] ]
+; CHECK-NEXT:    [[TMP_A:%.*]] = call i32 @a()
+; CHECK-NEXT:    [[TMP_A_SUM:%.*]] = add i32 [[TMP_A_PHI]], [[TMP_A]]
+; CHECK-NEXT:    br label [[BODY_B:%.*]]
+; CHECK:       body.b:
+; CHECK-NEXT:    [[TMP_B_PHI:%.*]] = phi i32 [ [[TMP_A_SUM]], [[BODY_A]] ]
+; CHECK-NEXT:    [[TMP_B:%.*]] = call i32 @b()
+; CHECK-NEXT:    [[TMP_B_SUM:%.*]] = add i32 [[TMP_B_PHI]], [[TMP_B]]
+; CHECK-NEXT:    br label [[LATCH:%.*]]
+; CHECK:       latch:
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp slt i32 [[TMP_B_SUM]], 42
+; CHECK-NEXT:    br i1 [[CMP2]], label [[HEADER]], label [[LOOP_EXIT2_SPLIT_SPLIT:%.*]]
+; CHECK:       loop.exit1:
+; CHECK-NEXT:    [[L1_PHI:%.*]] = phi i32 [ [[TMP_LCSSA]], [[DISPATCH]] ]
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       loop.exit2.split.split:
+; CHECK-NEXT:    [[L2_PHI:%.*]] = phi i32 [ [[TMP_B_SUM]], [[LATCH]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT2_SPLIT]]
+; CHECK:       loop.exit2.split:
+; CHECK-NEXT:    [[DOTUS_PHI17:%.*]] = phi i32 [ [[L2_PHI]], [[LOOP_EXIT2_SPLIT_SPLIT]] ], [ [[L2_PHI_US16]], [[LOOP_EXIT2_SPLIT_SPLIT_US]] ]
+; CHECK-NEXT:    br label [[LOOP_EXIT2]]
+; CHECK:       loop.exit2:
+; CHECK-NEXT:    [[DOTUS_PHI:%.*]] = phi i32 [ [[DOTUS_PHI17]], [[LOOP_EXIT2_SPLIT]] ], [ [[L2_PHI_US]], [[LOOP_EXIT2_SPLIT_US]] ]
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[L_PHI:%.*]] = phi i32 [ [[L1_PHI]], [[LOOP_EXIT1]] ], [ [[DOTUS_PHI]], [[LOOP_EXIT2]] ]
+; CHECK-NEXT:    ret i32 [[L_PHI]]
+;
+entry:
+  br label %header
+
+header:
+  %tmp = call i32 @d()
+  %cmp1 = icmp eq i32 %tmp, 0
+  br i1 %cmp1, label %body.a, label %dispatch
+
+dispatch:
+  switch i32 %arg, label %loop.exit1 [
+  i32 -1, label %loop.exit1
+  i32 0, label %body.a
+  i32 1, label %body.b
+  i32 2, label %body.b
+  ]
+
+body.a:
+  %tmp.a.phi = phi i32 [ 0, %header ], [ %tmp, %dispatch ]
+  %tmp.a = call i32 @a()
+  %tmp.a.sum = add i32 %tmp.a.phi, %tmp.a
+  br label %body.b
+; Unswitched 'a' loop: the `entry.split.us` CHECK lines (case 0).
+
+body.b:
+  %tmp.b.phi = phi i32 [ %tmp, %dispatch ], [ %tmp, %dispatch ], [ %tmp.a.sum, %body.a ]
+  %tmp.b = call i32 @b()
+  %tmp.b.sum = add i32 %tmp.b.phi, %tmp.b
+  br label %latch
+; Unswitched 'b' loop: the `entry.split.us1` CHECK lines (cases 1 and 2).
+
+latch:
+  %cmp2 = icmp slt i32 %tmp.b.sum, 42
+  br i1 %cmp2, label %header, label %loop.exit2
+
+loop.exit1:
+  %l1.phi = phi i32 [ %tmp, %dispatch ], [ %tmp, %dispatch ]
+  br label %exit
+; Unswitched 'exit' loop: the `entry.split` CHECK lines (default and case -1).
+
+loop.exit2:
+  %l2.phi = phi i32 [ %tmp.b.sum, %latch ]
+  br label %exit
+
+exit:
+  %l.phi = phi i32 [ %l1.phi, %loop.exit1 ], [ %l2.phi, %loop.exit2 ]
+  ret i32 %l.phi
+}
+
+; Unswitching the `%v1` branch in c.header will not change the loop nest from:
+;   A < B < C
+define void @hoist_inner_loop0() {
+; CHECK-LABEL: @hoist_inner_loop0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[A_HEADER:%.*]]
+; CHECK:       a.header:
+; CHECK-NEXT:    br label [[B_HEADER:%.*]]
+; CHECK:       b.header:
+; CHECK-NEXT:    [[V1:%.*]] = call i1 @cond()
+; CHECK-NEXT:    [[V1_FR:%.*]] = freeze i1 [[V1]]
+; CHECK-NEXT:    br i1 [[V1_FR]], label [[B_HEADER_SPLIT_US:%.*]], label [[B_HEADER_SPLIT:%.*]]
+; CHECK:       b.header.split.us:
+; CHECK-NEXT:    br label [[C_HEADER_US:%.*]]
+; CHECK:       c.header.us:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @c()
+; CHECK-NEXT:    br label [[B_LATCH_SPLIT_US:%.*]]
+; CHECK:       b.latch.split.us:
+; CHECK-NEXT:    br label [[B_LATCH:%.*]]
+; CHECK:       b.header.split:
+; CHECK-NEXT:    br label [[C_HEADER:%.*]]
+; CHECK:       c.header:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @c()
+; CHECK-NEXT:    br label [[C_LATCH:%.*]]
+; CHECK:       c.latch:
+; CHECK-NEXT:    [[V2:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[V2]], label [[C_HEADER]], label [[B_LATCH_SPLIT:%.*]]
+; CHECK:       b.latch.split:
+; CHECK-NEXT:    br label [[B_LATCH]]
+; CHECK:       b.latch:
+; CHECK-NEXT:    [[V3:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[V3]], label [[B_HEADER]], label [[A_LATCH:%.*]]
+; CHECK:       a.latch:
+; CHECK-NEXT:    br label [[A_HEADER]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %a.header
+
+a.header:
+  br label %b.header
+
+b.header:
+  %v1 = call i1 @cond()
+  br label %c.header
+
+c.header:
+  call i32 @c()
+  br i1 %v1, label %b.latch, label %c.latch
+
+c.latch:
+  %v2 = call i1 @cond()
+  br i1 %v2, label %c.header, label %b.latch
+
+b.latch:
+  %v3 = call i1 @cond()
+  br i1 %v3, label %b.header, label %a.latch
+
+a.latch:
+  br label %a.header
+
+exit:
+  ret void
+}
+
+; Unswitch will transform the loop nest from:
+;   A < B < C
+; into
+;   A < (B, C)
+define void @hoist_inner_loop1(i32* %ptr) {
+; CHECK-LABEL: @hoist_inner_loop1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[A_HEADER:%.*]]
+; CHECK:       a.header:
+; CHECK-NEXT:    [[X_A:%.*]] = load i32, i32* [[PTR:%.*]], align 4
+; CHECK-NEXT:    br label [[B_HEADER:%.*]]
+; CHECK:       b.header:
+; CHECK-NEXT:    [[X_B:%.*]] = load i32, i32* [[PTR]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = call i1 @cond()
+; CHECK-NEXT:    [[V1_FR:%.*]] = freeze i1 [[V1]]
+; CHECK-NEXT:    br i1 [[V1_FR]], label [[B_HEADER_SPLIT_US:%.*]], label [[B_HEADER_SPLIT:%.*]]
+; CHECK:       b.header.split.us:
+; CHECK-NEXT:    br label [[C_HEADER_US:%.*]]
+; CHECK:       c.header.us:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @c()
+; CHECK-NEXT:    br label [[B_LATCH_SPLIT_US:%.*]]
+; CHECK:       b.latch.split.us:
+; CHECK-NEXT:    br label [[B_LATCH:%.*]]
+; CHECK:       b.header.split:
+; CHECK-NEXT:    [[X_B_LCSSA:%.*]] = phi i32 [ [[X_B]], [[B_HEADER]] ]
+; CHECK-NEXT:    br label [[C_HEADER:%.*]]
+; CHECK:       c.header:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @c()
+; CHECK-NEXT:    br label [[C_LATCH:%.*]]
+; CHECK:       c.latch:
+; CHECK-NEXT:    store i32 [[X_A]], i32* [[PTR]], align 4
+; CHECK-NEXT:    store i32 [[X_B_LCSSA]], i32* [[PTR]], align 4
+; CHECK-NEXT:    [[V2:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[V2]], label [[C_HEADER]], label [[A_EXIT_C:%.*]]
+; CHECK:       b.latch:
+; CHECK-NEXT:    [[V3:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[V3]], label [[B_HEADER]], label [[A_EXIT_B:%.*]]
+; CHECK:       a.exit.c:
+; CHECK-NEXT:    br label [[A_LATCH:%.*]]
+; CHECK:       a.exit.b:
+; CHECK-NEXT:    br label [[A_LATCH]]
+; CHECK:       a.latch:
+; CHECK-NEXT:    br label [[A_HEADER]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %a.header
+
+a.header:
+  %x.a = load i32, i32* %ptr
+  br label %b.header
+
+b.header:
+  %x.b = load i32, i32* %ptr
+  %v1 = call i1 @cond()
+  br label %c.header
+
+c.header:
+  call i32 @c()
+  br i1 %v1, label %b.latch, label %c.latch
+
+c.latch:
+  ; Use values from the outer loops (%x.a, %x.b) to check LCSSA form.
+  store i32 %x.a, i32* %ptr
+  store i32 %x.b, i32* %ptr
+  %v2 = call i1 @cond()
+  br i1 %v2, label %c.header, label %a.exit.c
+
+b.latch:
+  %v3 = call i1 @cond()
+  br i1 %v3, label %b.header, label %a.exit.b
+
+a.exit.c:
+  br label %a.latch
+
+a.exit.b:
+  br label %a.latch
+
+a.latch:
+  br label %a.header
+
+exit:
+  ret void
+}
+
+; Unswitch will transform the loop nest from:
+;   A < B < C
+; into
+;   (A < B), C
+define void @hoist_inner_loop2(i32* %ptr) {
+; CHECK-LABEL: @hoist_inner_loop2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[A_HEADER:%.*]]
+; CHECK:       a.header:
+; CHECK-NEXT:    [[X_A:%.*]] = load i32, i32* [[PTR:%.*]], align 4
+; CHECK-NEXT:    br label [[B_HEADER:%.*]]
+; CHECK:       b.header:
+; CHECK-NEXT:    [[X_B:%.*]] = load i32, i32* [[PTR]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = call i1 @cond()
+; CHECK-NEXT:    [[V1_FR:%.*]] = freeze i1 [[V1]]
+; CHECK-NEXT:    br i1 [[V1_FR]], label [[B_HEADER_SPLIT_US:%.*]], label [[B_HEADER_SPLIT:%.*]]
+; CHECK:       b.header.split.us:
+; CHECK-NEXT:    br label [[C_HEADER_US:%.*]]
+; CHECK:       c.header.us:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @c()
+; CHECK-NEXT:    br label [[B_LATCH_SPLIT_US:%.*]]
+; CHECK:       b.latch.split.us:
+; CHECK-NEXT:    br label [[B_LATCH:%.*]]
+; CHECK:       b.header.split:
+; CHECK-NEXT:    [[X_A_LCSSA:%.*]] = phi i32 [ [[X_A]], [[B_HEADER]] ]
+; CHECK-NEXT:    [[X_B_LCSSA:%.*]] = phi i32 [ [[X_B]], [[B_HEADER]] ]
+; CHECK-NEXT:    br label [[C_HEADER:%.*]]
+; CHECK:       c.header:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @c()
+; CHECK-NEXT:    br label [[C_LATCH:%.*]]
+; CHECK:       c.latch:
+; CHECK-NEXT:    store i32 [[X_A_LCSSA]], i32* [[PTR]], align 4
+; CHECK-NEXT:    store i32 [[X_B_LCSSA]], i32* [[PTR]], align 4
+; CHECK-NEXT:    [[V2:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[V2]], label [[C_HEADER]], label [[EXIT:%.*]]
+; CHECK:       b.latch:
+; CHECK-NEXT:    [[V3:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[V3]], label [[B_HEADER]], label [[A_LATCH:%.*]]
+; CHECK:       a.latch:
+; CHECK-NEXT:    br label [[A_HEADER]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %a.header
+
+a.header:
+  %x.a = load i32, i32* %ptr
+  br label %b.header
+
+b.header:
+  %x.b = load i32, i32* %ptr
+  %v1 = call i1 @cond()
+  br label %c.header
+
+c.header:
+  call i32 @c()
+  br i1 %v1, label %b.latch, label %c.latch
+
+c.latch:
+  ; Use values from the outer loops (%x.a, %x.b) to check LCSSA form.
+  store i32 %x.a, i32* %ptr
+  store i32 %x.b, i32* %ptr
+  %v2 = call i1 @cond()
+  br i1 %v2, label %c.header, label %exit
+
+b.latch:
+  %v3 = call i1 @cond()
+  br i1 %v3, label %b.header, label %a.latch
+
+a.latch:
+  br label %a.header
+
+exit:
+  ret void
+}
+
+; Same as @hoist_inner_loop2 but with a nested loop inside the hoisted loop.
+; Unswitch will transform the loop nest from:
+;   A < B < C < D
+; into
+;   (A < B), (C < D)
+define void @hoist_inner_loop3(i32* %ptr) {
+; CHECK-LABEL: @hoist_inner_loop3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[A_HEADER:%.*]]
+; CHECK:       a.header:
+; CHECK-NEXT:    [[X_A:%.*]] = load i32, i32* [[PTR:%.*]], align 4
+; CHECK-NEXT:    br label [[B_HEADER:%.*]]
+; CHECK:       b.header:
+; CHECK-NEXT:    [[X_B:%.*]] = load i32, i32* [[PTR]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = call i1 @cond()
+; CHECK-NEXT:    [[V1_FR:%.*]] = freeze i1 [[V1]]
+; CHECK-NEXT:    br i1 [[V1_FR]], label [[B_HEADER_SPLIT_US:%.*]], label [[B_HEADER_SPLIT:%.*]]
+; CHECK:       b.header.split.us:
+; CHECK-NEXT:    br label [[C_HEADER_US:%.*]]
+; CHECK:       c.header.us:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @c()
+; CHECK-NEXT:    br label [[B_LATCH_SPLIT_US:%.*]]
+; CHECK:       b.latch.split.us:
+; CHECK-NEXT:    br label [[B_LATCH:%.*]]
+; CHECK:       b.header.split:
+; CHECK-NEXT:    [[X_A_LCSSA:%.*]] = phi i32 [ [[X_A]], [[B_HEADER]] ]
+; CHECK-NEXT:    [[X_B_LCSSA:%.*]] = phi i32 [ [[X_B]], [[B_HEADER]] ]
+; CHECK-NEXT:    br label [[C_HEADER:%.*]]
+; CHECK:       c.header:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @c()
+; CHECK-NEXT:    br label [[C_BODY:%.*]]
+; CHECK:       c.body:
+; CHECK-NEXT:    [[X_C:%.*]] = load i32, i32* [[PTR]], align 4
+; CHECK-NEXT:    br label [[D_HEADER:%.*]]
+; CHECK:       d.header:
+; CHECK-NEXT:    store i32 [[X_A_LCSSA]], i32* [[PTR]], align 4
+; CHECK-NEXT:    store i32 [[X_B_LCSSA]], i32* [[PTR]], align 4
+; CHECK-NEXT:    store i32 [[X_C]], i32* [[PTR]], align 4
+; CHECK-NEXT:    [[V2:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[V2]], label [[D_HEADER]], label [[C_LATCH:%.*]]
+; CHECK:       c.latch:
+; CHECK-NEXT:    [[V3:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[V3]], label [[C_HEADER]], label [[EXIT:%.*]]
+; CHECK:       b.latch:
+; CHECK-NEXT:    [[V4:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[V4]], label [[B_HEADER]], label [[A_LATCH:%.*]]
+; CHECK:       a.latch:
+; CHECK-NEXT:    br label [[A_HEADER]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %a.header
+
+a.header:
+  %x.a = load i32, i32* %ptr
+  br label %b.header
+
+b.header:
+  %x.b = load i32, i32* %ptr
+  %v1 = call i1 @cond()
+  br label %c.header
+
+c.header:
+  call i32 @c()
+  br i1 %v1, label %b.latch, label %c.body
+
+c.body:
+  %x.c = load i32, i32* %ptr
+  br label %d.header
+
+d.header:
+  ; Use values from the outer loops (%x.a, %x.b, %x.c) to check LCSSA form.
+  store i32 %x.a, i32* %ptr
+  store i32 %x.b, i32* %ptr
+  store i32 %x.c, i32* %ptr
+  %v2 = call i1 @cond()
+  br i1 %v2, label %d.header, label %c.latch
+
+c.latch:
+  %v3 = call i1 @cond()
+  br i1 %v3, label %c.header, label %exit
+
+b.latch:
+  %v4 = call i1 @cond()
+  br i1 %v4, label %b.header, label %a.latch
+
+a.latch:
+  br label %a.header
+
+exit:
+  ret void
+}
+
+; This test exercises checking multiple remaining exits from the loop being
+; unswitched: d.exiting1..3 leave loop D to a.latch, loopexit.d and b.latch.
+; Unswitch will transform the loop nest from:
+;   A < B < C < D
+; into
+;   A < B < (C, D)
+define void @hoist_inner_loop4() {
+; CHECK-LABEL: @hoist_inner_loop4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[A_HEADER:%.*]]
+; CHECK:       a.header:
+; CHECK-NEXT:    br label [[B_HEADER:%.*]]
+; CHECK:       b.header:
+; CHECK-NEXT:    br label [[C_HEADER:%.*]]
+; CHECK:       c.header:
+; CHECK-NEXT:    [[V1:%.*]] = call i1 @cond()
+; CHECK-NEXT:    [[V1_FR:%.*]] = freeze i1 [[V1]]
+; CHECK-NEXT:    br i1 [[V1_FR]], label [[C_HEADER_SPLIT_US:%.*]], label [[C_HEADER_SPLIT:%.*]]
+; CHECK:       c.header.split.us:
+; CHECK-NEXT:    br label [[D_HEADER_US:%.*]]
+; CHECK:       d.header.us:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @d()
+; CHECK-NEXT:    br label [[C_LATCH_SPLIT_US:%.*]]
+; CHECK:       c.latch.split.us:
+; CHECK-NEXT:    br label [[C_LATCH:%.*]]
+; CHECK:       c.header.split:
+; CHECK-NEXT:    br label [[D_HEADER:%.*]]
+; CHECK:       d.header:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @d()
+; CHECK-NEXT:    br label [[D_EXITING1:%.*]]
+; CHECK:       d.exiting1:
+; CHECK-NEXT:    [[V2:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[V2]], label [[D_EXITING2:%.*]], label [[A_LATCH:%.*]]
+; CHECK:       d.exiting2:
+; CHECK-NEXT:    [[V3:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[V3]], label [[D_EXITING3:%.*]], label [[LOOPEXIT_D:%.*]]
+; CHECK:       d.exiting3:
+; CHECK-NEXT:    [[V4:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[V4]], label [[D_LATCH:%.*]], label [[B_LATCH:%.*]]
+; CHECK:       d.latch:
+; CHECK-NEXT:    br label [[D_HEADER]]
+; CHECK:       c.latch:
+; CHECK-NEXT:    [[V5:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[V5]], label [[C_HEADER]], label [[LOOPEXIT_C:%.*]]
+; CHECK:       b.latch:
+; CHECK-NEXT:    br label [[B_HEADER]]
+; CHECK:       a.latch:
+; CHECK-NEXT:    br label [[A_HEADER]]
+; CHECK:       loopexit.d:
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       loopexit.c:
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %a.header
+
+a.header:
+  br label %b.header
+
+b.header:
+  br label %c.header
+
+c.header:
+  %v1 = call i1 @cond()
+  br label %d.header
+
+d.header:
+  call i32 @d()
+  br i1 %v1, label %c.latch, label %d.exiting1
+
+d.exiting1:
+  %v2 = call i1 @cond()
+  br i1 %v2, label %d.exiting2, label %a.latch
+
+d.exiting2:
+  %v3 = call i1 @cond()
+  br i1 %v3, label %d.exiting3, label %loopexit.d
+
+d.exiting3:
+  %v4 = call i1 @cond()
+  br i1 %v4, label %d.latch, label %b.latch
+
+d.latch:
+  br label %d.header
+
+c.latch:
+  %v5 = call i1 @cond()
+  br i1 %v5, label %c.header, label %loopexit.c
+
+b.latch:
+  br label %b.header
+
+a.latch:
+  br label %a.header
+
+loopexit.d:
+  br label %exit
+
+loopexit.c:
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Unswitch will transform the loop nest from:
+;   A < B < C < D
+; into
+;   A < ((B < C), D)
+;
+; The branch in %d.header tests %v1, which is computed in %c.header and is
+; therefore invariant in loop D. The expected output hoists that branch into
+; %c.header and branches on a frozen copy of %v1 rather than on %v1 itself.
+; On the side where the original D loop survives, D exits only to %a.latch,
+; so %x.b and %x.c (defined in B and C) reach the stores through LCSSA phis
+; while %x.a (defined in A) is still used directly.
+define void @hoist_inner_loop5(i32* %ptr) {
+; CHECK-LABEL: @hoist_inner_loop5(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[A_HEADER:%.*]]
+; CHECK:       a.header:
+; CHECK-NEXT:    [[X_A:%.*]] = load i32, i32* [[PTR:%.*]], align 4
+; CHECK-NEXT:    br label [[B_HEADER:%.*]]
+; CHECK:       b.header:
+; CHECK-NEXT:    [[X_B:%.*]] = load i32, i32* [[PTR]], align 4
+; CHECK-NEXT:    br label [[C_HEADER:%.*]]
+; CHECK:       c.header:
+; CHECK-NEXT:    [[X_C:%.*]] = load i32, i32* [[PTR]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = call i1 @cond()
+; CHECK-NEXT:    [[V1_FR:%.*]] = freeze i1 [[V1]]
+; CHECK-NEXT:    br i1 [[V1_FR]], label [[C_HEADER_SPLIT_US:%.*]], label [[C_HEADER_SPLIT:%.*]]
+; CHECK:       c.header.split.us:
+; CHECK-NEXT:    br label [[D_HEADER_US:%.*]]
+; CHECK:       d.header.us:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @d()
+; CHECK-NEXT:    br label [[C_LATCH_SPLIT_US:%.*]]
+; CHECK:       c.latch.split.us:
+; CHECK-NEXT:    br label [[C_LATCH:%.*]]
+; CHECK:       c.header.split:
+; CHECK-NEXT:    [[X_B_LCSSA:%.*]] = phi i32 [ [[X_B]], [[C_HEADER]] ]
+; CHECK-NEXT:    [[X_C_LCSSA:%.*]] = phi i32 [ [[X_C]], [[C_HEADER]] ]
+; CHECK-NEXT:    br label [[D_HEADER:%.*]]
+; CHECK:       d.header:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @d()
+; CHECK-NEXT:    br label [[D_LATCH:%.*]]
+; CHECK:       d.latch:
+; CHECK-NEXT:    store i32 [[X_A]], i32* [[PTR]], align 4
+; CHECK-NEXT:    store i32 [[X_B_LCSSA]], i32* [[PTR]], align 4
+; CHECK-NEXT:    store i32 [[X_C_LCSSA]], i32* [[PTR]], align 4
+; CHECK-NEXT:    [[V2:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[V2]], label [[D_HEADER]], label [[A_LATCH:%.*]]
+; CHECK:       c.latch:
+; CHECK-NEXT:    [[V3:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[V3]], label [[C_HEADER]], label [[B_LATCH:%.*]]
+; CHECK:       b.latch:
+; CHECK-NEXT:    br label [[B_HEADER]]
+; CHECK:       a.latch:
+; CHECK-NEXT:    br label [[A_HEADER]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %a.header
+
+a.header:
+  %x.a = load i32, i32* %ptr
+  br label %b.header
+
+b.header:
+  %x.b = load i32, i32* %ptr
+  br label %c.header
+
+c.header:
+  %x.c = load i32, i32* %ptr
+  %v1 = call i1 @cond()
+  br label %d.header
+
+d.header:
+  call i32 @d()
+  ; %v1 is defined in %c.header, outside loop D, so this is the branch that
+  ; gets unswitched.
+  br i1 %v1, label %c.latch, label %d.latch
+
+d.latch:
+  ; Use values from other loops to check LCSSA form.
+  store i32 %x.a, i32* %ptr
+  store i32 %x.b, i32* %ptr
+  store i32 %x.c, i32* %ptr
+  %v2 = call i1 @cond()
+  ; Leaving D here goes straight to A's latch, skipping the latches of C and B.
+  br i1 %v2, label %d.header, label %a.latch
+
+c.latch:
+  %v3 = call i1 @cond()
+  br i1 %v3, label %c.header, label %b.latch
+
+b.latch:
+  br label %b.header
+
+a.latch:
+  br label %a.header
+
+exit:
+  ret void
+}
+
+; Switch variant of the inner-loop hoisting tests, on the nest A < B < C.
+;
+; The switch in %c.header tests %v1, which is computed in %b.header and is
+; therefore invariant in loop C. The expected output moves the switch into
+; %b.header and makes it operate on a frozen copy of %v1. Cases 1, 2 and 3
+; all select the same unswitched successor, and the default selects the copy
+; that keeps loop C. That copy of C exits only to %exit, so both %x.a and
+; %x.b reach the stores through LCSSA phis.
+define void @hoist_inner_loop_switch(i32* %ptr) {
+; CHECK-LABEL: @hoist_inner_loop_switch(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[A_HEADER:%.*]]
+; CHECK:       a.header:
+; CHECK-NEXT:    [[X_A:%.*]] = load i32, i32* [[PTR:%.*]], align 4
+; CHECK-NEXT:    br label [[B_HEADER:%.*]]
+; CHECK:       b.header:
+; CHECK-NEXT:    [[X_B:%.*]] = load i32, i32* [[PTR]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = call i32 @cond.i32()
+; CHECK-NEXT:    [[V1_FR:%.*]] = freeze i32 [[V1]]
+; CHECK-NEXT:    switch i32 [[V1_FR]], label [[B_HEADER_SPLIT:%.*]] [
+; CHECK-NEXT:    i32 1, label [[B_HEADER_SPLIT_US:%.*]]
+; CHECK-NEXT:    i32 2, label [[B_HEADER_SPLIT_US]]
+; CHECK-NEXT:    i32 3, label [[B_HEADER_SPLIT_US]]
+; CHECK-NEXT:    ]
+; CHECK:       b.header.split.us:
+; CHECK-NEXT:    br label [[C_HEADER_US:%.*]]
+; CHECK:       c.header.us:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @c()
+; CHECK-NEXT:    br label [[B_LATCH_SPLIT_US:%.*]]
+; CHECK:       b.latch.split.us:
+; CHECK-NEXT:    br label [[B_LATCH:%.*]]
+; CHECK:       b.header.split:
+; CHECK-NEXT:    [[X_A_LCSSA:%.*]] = phi i32 [ [[X_A]], [[B_HEADER]] ]
+; CHECK-NEXT:    [[X_B_LCSSA:%.*]] = phi i32 [ [[X_B]], [[B_HEADER]] ]
+; CHECK-NEXT:    br label [[C_HEADER:%.*]]
+; CHECK:       c.header:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @c()
+; CHECK-NEXT:    br label [[C_LATCH:%.*]]
+; CHECK:       c.latch:
+; CHECK-NEXT:    store i32 [[X_A_LCSSA]], i32* [[PTR]], align 4
+; CHECK-NEXT:    store i32 [[X_B_LCSSA]], i32* [[PTR]], align 4
+; CHECK-NEXT:    [[V2:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[V2]], label [[C_HEADER]], label [[EXIT:%.*]]
+; CHECK:       b.latch:
+; CHECK-NEXT:    [[V3:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[V3]], label [[B_HEADER]], label [[A_LATCH:%.*]]
+; CHECK:       a.latch:
+; CHECK-NEXT:    br label [[A_HEADER]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %a.header
+
+a.header:
+  %x.a = load i32, i32* %ptr
+  br label %b.header
+
+b.header:
+  %x.b = load i32, i32* %ptr
+  %v1 = call i32 @cond.i32()
+  br label %c.header
+
+c.header:
+  call i32 @c()
+  ; %v1 is defined in %b.header, outside loop C, so this is the switch that
+  ; gets unswitched. All three cases leave C for B's latch; only the default
+  ; stays in C.
+  switch i32 %v1, label %c.latch [
+  i32 1, label %b.latch
+  i32 2, label %b.latch
+  i32 3, label %b.latch
+  ]
+
+c.latch:
+  ; Use values from other loops to check LCSSA form.
+  store i32 %x.a, i32* %ptr
+  store i32 %x.b, i32* %ptr
+  %v2 = call i1 @cond()
+  ; Leaving C here goes straight to %exit, bypassing the latches of B and A.
+  br i1 %v2, label %c.header, label %exit
+
+b.latch:
+  %v3 = call i1 @cond()
+  br i1 %v3, label %b.header, label %a.latch
+
+a.latch:
+  br label %a.header
+
+exit:
+  ret void
+}


        


More information about the llvm-commits mailing list