[llvm] f1b05a0 - [StructurizeCFG] Improve basic block ordering

Wed Jun 22 14:13:29 PDT 2022

Author: Brendon Cahoon
Date: 2022-06-22T16:10:41-05:00
New Revision: f1b05a0a2bbbea160002be709f8a1c59de366761

URL: https://github.com/llvm/llvm-project/commit/f1b05a0a2bbbea160002be709f8a1c59de366761
DIFF: https://github.com/llvm/llvm-project/commit/f1b05a0a2bbbea160002be709f8a1c59de366761.diff

LOG: [StructurizeCFG] Improve basic block ordering

StructurizeCFG linearizes the successors of branching basic block
by adding Flow blocks to record the true/false path for branches
and back edges. This patch reduces the number of Phi values needed
to capture the control flow path by improving the basic block
ordering.

Previously, StructurizeCFG adds loop exit blocks outside of the
loop. StructurizeCFG sets a boolean value to indicate the path
taken, and all exit block live values extend to after the loop.
For loops with a large number of exits blocks, this creates a
huge number of values that are maintained, which increases
compilation time and register pressure. This is problem
especially with ASAN, which adds early exits to blocks with
unreachable instructions for each instrumented check in the loop.

In specific cases, this patch reduces the number of values needed
after the loop by moving the exit block into the loop. This is
done for blocks that have a single predecessor and single successor
by moving the block to appear just after the predecessor.

Differential Revision: https://reviews.llvm.org/D123231

Added: 
    llvm/test/Transforms/StructurizeCFG/improve-order.ll

Modified: 
    llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
    llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index 0b797abefe207..f6525ad7de9bc 100644

--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -68,6 +68,11 @@ static cl::opt<bool>
                           cl::desc("Allow relaxed uniform region checks"),
                           cl::init(true));
 
+static cl::opt<unsigned>
+    ReorderNodeSize("structurizecfg-node-reorder-size",
+                     cl::desc("Limit region size for reordering nodes"),
+                     cl::init(100), cl::Hidden);
+
 // Definition of the complex types used in this pass.
 
 using BBValuePair = std::pair<BasicBlock *, Value *>;
@@ -262,6 +267,8 @@ class StructurizeCFG {
 
   void orderNodes();
 
+  void reorderNodes();
+
   void analyzeLoops(RegionNode *N);
 
   Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert);
@@ -420,6 +427,57 @@ void StructurizeCFG::orderNodes() {
   }
 }
 
+/// Change the node ordering to decrease the range of live values, especially
+/// the values that capture the control flow path for branches. We do this
+/// by moving blocks with a single predecessor and successor to appear after
+/// predecessor. The motivation is to move some loop exit blocks into a loop.
+/// In cases where a loop has a large number of exit blocks, this reduces the
+/// amount of values needed across the loop boundary.
+void StructurizeCFG::reorderNodes() {
+  SmallVector<RegionNode *, 8> NewOrder;
+  DenseMap<BasicBlock *, unsigned> MoveTo;
+  BitVector Moved(Order.size());
+
+  // The benefits of reordering nodes occurs for large regions.
+  if (Order.size() <= ReorderNodeSize)
+    return;
+
+  // The algorithm works with two passes over Order. The first pass identifies
+  // the blocks to move and the position to move them to. The second pass
+  // creates the new order based upon this information. We move blocks with
+  // a single predecessor and successor. If there are multiple candidates then
+  // maintain the original order.
+  BBSet Seen;
+  for (int I = Order.size() - 1; I >= 0; --I) {
+    auto *BB = Order[I]->getEntry();
+    Seen.insert(BB);
+    auto *Pred = BB->getSinglePredecessor();
+    auto *Succ = BB->getSingleSuccessor();
+    // Consider only those basic blocks that have a predecessor in Order and a
+    // successor that exits the region. The region may contain subregions that
+    // have been structurized and are not included in Order.
+    if (Pred && Succ && Seen.count(Pred) && Succ == ParentRegion->getExit() &&
+        !MoveTo.count(Pred)) {
+      MoveTo[Pred] = I;
+      Moved.set(I);
+    }
+  }
+
+  // If no blocks have been moved then the original order is good.
+  if (!Moved.count())
+    return;
+
+  for (size_t I = 0, E = Order.size(); I < E; ++I) {
+    auto *BB = Order[I]->getEntry();
+    if (MoveTo.count(BB))
+      NewOrder.push_back(Order[MoveTo[BB]]);
+    if (!Moved[I])
+      NewOrder.push_back(Order[I]);
+  }
+
+  Order.assign(NewOrder);
+}
+
 /// Determine the end of the loops
 void StructurizeCFG::analyzeLoops(RegionNode *N) {
   if (N->isSubRegion()) {
@@ -1081,6 +1139,7 @@ bool StructurizeCFG::run(Region *R, DominatorTree *DT) {
   ParentRegion = R;
 
   orderNodes();
+  reorderNodes();
   collectInfos();
   createFlow();
   insertConditions(false);

diff  --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
index f3089e79ad378..2869e996ec724 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
@@ -1,5 +1,4 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s
 ; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
@@ -48,9 +47,9 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(i64 addrspace(3)* noca
 ; GCN-NEXT:    s_endpgm
 ; IR-LABEL: @reduced_nested_loop_conditions(
 ; IR-NEXT:  bb:
-; IR-NEXT:    [[MY_TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #4
+; IR-NEXT:    [[MY_TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #[[ATTR4:[0-9]+]]
 ; IR-NEXT:    [[MY_TMP1:%.*]] = getelementptr inbounds i64, i64 addrspace(3)* [[ARG:%.*]], i32 [[MY_TMP]]
-; IR-NEXT:    [[MY_TMP2:%.*]] = load volatile i64, i64 addrspace(3)* [[MY_TMP1]]
+; IR-NEXT:    [[MY_TMP2:%.*]] = load volatile i64, i64 addrspace(3)* [[MY_TMP1]], align 4
 ; IR-NEXT:    br label [[BB5:%.*]]
 ; IR:       bb3:
 ; IR-NEXT:    br i1 true, label [[BB4:%.*]], label [[BB13:%.*]]
@@ -84,7 +83,7 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(i64 addrspace(3)* noca
 ; IR:       bb16:
 ; IR-NEXT:    [[MY_TMP17:%.*]] = extractelement <2 x i32> [[MY_TMP15]], i64 1
 ; IR-NEXT:    [[MY_TMP18:%.*]] = getelementptr inbounds i32, i32 addrspace(3)* undef, i32 [[MY_TMP17]]
-; IR-NEXT:    [[MY_TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[MY_TMP18]]
+; IR-NEXT:    [[MY_TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[MY_TMP18]], align 4
 ; IR-NEXT:    br label [[BB20]]
 ; IR:       bb20:
 ; IR-NEXT:    [[MY_TMP21]] = phi i32 [ [[MY_TMP19]], [[BB16]] ], [ 0, [[BB13]] ]
@@ -93,6 +92,7 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(i64 addrspace(3)* noca
 ; IR:       bb23:
 ; IR-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP6]])
 ; IR-NEXT:    ret void
+;
 bb:
   %my.tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %my.tmp1 = getelementptr inbounds i64, i64 addrspace(3)* %arg, i32 %my.tmp
@@ -190,19 +190,19 @@ define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %a
 ; GCN-NEXT:    s_endpgm
 ; IR-LABEL: @nested_loop_conditions(
 ; IR-NEXT:  bb:
-; IR-NEXT:    [[MY_TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #4
+; IR-NEXT:    [[MY_TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() #[[ATTR4]]
 ; IR-NEXT:    [[MY_TMP1:%.*]] = zext i32 [[MY_TMP]] to i64
 ; IR-NEXT:    [[MY_TMP2:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[ARG:%.*]], i64 [[MY_TMP1]]
 ; IR-NEXT:    [[MY_TMP3:%.*]] = load i64, i64 addrspace(1)* [[MY_TMP2]], align 16
 ; IR-NEXT:    [[MY_TMP932:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* undef, align 16
 ; IR-NEXT:    [[MY_TMP1033:%.*]] = extractelement <4 x i32> [[MY_TMP932]], i64 0
-; IR-NEXT:    [[MY_TMP1134:%.*]] = load volatile i32, i32 addrspace(1)* undef
+; IR-NEXT:    [[MY_TMP1134:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
 ; IR-NEXT:    [[MY_TMP1235:%.*]] = icmp slt i32 [[MY_TMP1134]], 9
 ; IR-NEXT:    br i1 [[MY_TMP1235]], label [[BB14_LR_PH:%.*]], label [[FLOW:%.*]]
 ; IR:       bb14.lr.ph:
 ; IR-NEXT:    br label [[BB14:%.*]]
 ; IR:       Flow3:
-; IR-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP21:%.*]])
+; IR-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP20:%.*]])
 ; IR-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP14:%.*]])
 ; IR-NEXT:    [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0
 ; IR-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1
@@ -244,7 +244,7 @@ define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %a
 ; IR-NEXT:    [[TMP17:%.*]] = call i1 @llvm.amdgcn.loop.i64(i64 [[TMP16]])
 ; IR-NEXT:    br i1 [[TMP17]], label [[FLOW2:%.*]], label [[BB14]]
 ; IR:       bb18:
-; IR-NEXT:    [[MY_TMP19:%.*]] = load volatile i32, i32 addrspace(1)* undef
+; IR-NEXT:    [[MY_TMP19:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
 ; IR-NEXT:    [[MY_TMP20:%.*]] = icmp slt i32 [[MY_TMP19]], 9
 ; IR-NEXT:    br i1 [[MY_TMP20]], label [[BB21]], label [[BB18]]
 ; IR:       bb21:
@@ -261,21 +261,22 @@ define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %a
 ; IR-NEXT:    [[MY_TMP8:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* undef, i64 [[MY_TMP7]]
 ; IR-NEXT:    [[MY_TMP9]] = load <4 x i32>, <4 x i32> addrspace(1)* [[MY_TMP8]], align 16
 ; IR-NEXT:    [[MY_TMP10]] = extractelement <4 x i32> [[MY_TMP9]], i64 0
-; IR-NEXT:    [[MY_TMP11:%.*]] = load volatile i32, i32 addrspace(1)* undef
+; IR-NEXT:    [[MY_TMP11:%.*]] = load volatile i32, i32 addrspace(1)* undef, align 4
 ; IR-NEXT:    [[MY_TMP12]] = icmp sge i32 [[MY_TMP11]], 9
 ; IR-NEXT:    br label [[FLOW1]]
 ; IR:       Flow2:
 ; IR-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP16]])
-; IR-NEXT:    [[TMP19:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP15]])
-; IR-NEXT:    [[TMP20:%.*]] = extractvalue { i1, i64 } [[TMP19]], 0
-; IR-NEXT:    [[TMP21]] = extractvalue { i1, i64 } [[TMP19]], 1
-; IR-NEXT:    br i1 [[TMP20]], label [[BB31_LOOPEXIT:%.*]], label [[FLOW3]]
+; IR-NEXT:    [[TMP18:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP15]])
+; IR-NEXT:    [[TMP19:%.*]] = extractvalue { i1, i64 } [[TMP18]], 0
+; IR-NEXT:    [[TMP20]] = extractvalue { i1, i64 } [[TMP18]], 1
+; IR-NEXT:    br i1 [[TMP19]], label [[BB31_LOOPEXIT:%.*]], label [[FLOW3]]
 ; IR:       bb31.loopexit:
 ; IR-NEXT:    br label [[FLOW3]]
 ; IR:       bb31:
 ; IR-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP7]])
-; IR-NEXT:    store volatile i32 0, i32 addrspace(1)* undef
+; IR-NEXT:    store volatile i32 0, i32 addrspace(1)* undef, align 4
 ; IR-NEXT:    ret void
+;
 bb:
   %my.tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %my.tmp1 = zext i32 %my.tmp to i64

diff  --git a/llvm/test/Transforms/StructurizeCFG/improve-order.ll b/llvm/test/Transforms/StructurizeCFG/improve-order.ll
new file mode 100644
index 0000000000000..e40770cb2aa5b
--- /dev/null
+++ b/llvm/test/Transforms/StructurizeCFG/improve-order.ll
@@ -0,0 +1,511 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -structurizecfg -structurizecfg-node-reorder-size=3 %s -o - | FileCheck %s
+; RUN: opt -S -passes=structurizecfg -structurizecfg-node-reorder-size=3 %s -o - | FileCheck %s
+
+; Test that exit blocks for a loop are reordered so that they
+; appear after their predecessors rather than after the loop.
+; This reduces the number of values needed after the loop
+; to record if the exit blocks are taken/not taken.
+
+define void @reorder_loop(i1 %PredEntry, i1 %PredA, i1 %PredC) {
+; CHECK-LABEL: @reorder_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[PREDENTRY:%.*]], label [[A:%.*]], label [[G:%.*]]
+; CHECK:       A:
+; CHECK-NEXT:    [[INC1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP1:%.*]], [[FLOW1:%.*]] ]
+; CHECK-NEXT:    br i1 [[PREDA:%.*]], label [[B:%.*]], label [[FLOW:%.*]]
+; CHECK:       B:
+; CHECK-NEXT:    tail call fastcc void @check(i32 1) #[[ATTR0:[0-9]+]]
+; CHECK-NEXT:    br label [[FLOW]]
+; CHECK:       Flow:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i1 [ false, [[B]] ], [ true, [[A]] ]
+; CHECK-NEXT:    br i1 [[TMP0]], label [[C:%.*]], label [[FLOW1]]
+; CHECK:       C:
+; CHECK-NEXT:    br i1 [[PREDC:%.*]], label [[D:%.*]], label [[FLOW2:%.*]]
+; CHECK:       Flow1:
+; CHECK-NEXT:    [[TMP1]] = phi i32 [ [[TMP5:%.*]], [[FLOW3:%.*]] ], [ undef, [[FLOW]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i1 [ [[TMP6:%.*]], [[FLOW3]] ], [ false, [[FLOW]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = phi i1 [ [[TMP7:%.*]], [[FLOW3]] ], [ true, [[FLOW]] ]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[LOOP_EXIT_GUARD:%.*]], label [[A]]
+; CHECK:       D:
+; CHECK-NEXT:    tail call fastcc void @check(i32 2) #[[ATTR0]]
+; CHECK-NEXT:    br label [[FLOW2]]
+; CHECK:       Flow2:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i1 [ false, [[D]] ], [ true, [[C]] ]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[E:%.*]], label [[FLOW3]]
+; CHECK:       E:
+; CHECK-NEXT:    [[INC2:%.*]] = add i32 [[INC1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp uge i32 [[INC2]], 10
+; CHECK-NEXT:    br label [[FLOW3]]
+; CHECK:       F:
+; CHECK-NEXT:    unreachable
+; CHECK:       G:
+; CHECK-NEXT:    ret void
+; CHECK:       Flow3:
+; CHECK-NEXT:    [[TMP5]] = phi i32 [ [[INC2]], [[E]] ], [ undef, [[FLOW2]] ]
+; CHECK-NEXT:    [[TMP6]] = phi i1 [ true, [[E]] ], [ false, [[FLOW2]] ]
+; CHECK-NEXT:    [[TMP7]] = phi i1 [ [[CMP]], [[E]] ], [ true, [[FLOW2]] ]
+; CHECK-NEXT:    br label [[FLOW1]]
+; CHECK:       loop.exit.guard:
+; CHECK-NEXT:    br i1 [[TMP2]], label [[G]], label [[F:%.*]]
+;
+entry:
+  br i1 %PredEntry, label %A, label %G
+
+; A is the loop header.
+A:
+  %inc1 = phi i32 [ 0, %entry ], [ %inc2, %E ]
+  br i1 %PredA, label %B, label %C
+
+; B is a loop exit block. After reorderNodes in StructurizeCFG, this block
+; remains after A. Without reorderNodes, B is dependent on the Flow block for
+; the loop back edge. B is executes only when the loop exits from A. It does not
+; execute during the loop iterations.
+B:
+  tail call fastcc void @check(i32 1) #0
+  br label %loop.exit.guard
+
+C:
+  br i1 %PredC, label %D, label %E
+
+; D is a loop exit block. After reorderNodes, this block remains in the loop
+; and is not dependent on the Flow block that contains the back edge to the
+; loop header.
+D:
+  tail call fastcc void @check(i32 2) #0
+  br label %loop.exit.guard
+
+; E is a latch and exiting block.
+E:
+  %inc2 = add i32 %inc1, 1
+  %cmp = icmp ult i32 %inc2, 10
+  br i1 %cmp, label %A, label %loop.exit.guard
+
+; Paths fom the exit blocks come here through loop.exit.gaurd.
+F:
+  unreachable
+
+G:
+  ret void
+
+; loop.exit.guard is added by UnifyLoopExits as a common block for all loop
+; exits. After StructurizeCFG, the predecessor B will remain in the loop.
+loop.exit.guard:
+  %Guard.G = phi i1 [ true, %E ], [ false, %B ], [ false, %D ]
+  br i1 %Guard.G, label %G, label %F
+}
+
+; Test that the exit blocks for a nested loop are reordered so that they are
+; not dependent on the Flow block that contains the loop back edge. That
+; dependence forces the block to appear after the loop, which increases the
+; number of live values exiting the loop.
+
+define void @reorder_inner_loop(i1 %PredEntry, i1 %PredB, i1 %PredD) {
+; CHECK-LABEL: @reorder_inner_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PREDB_INV:%.*]] = xor i1 [[PREDB:%.*]], true
+; CHECK-NEXT:    br i1 [[PREDENTRY:%.*]], label [[A:%.*]], label [[I:%.*]]
+; CHECK:       A:
+; CHECK-NEXT:    [[OUTER1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP5:%.*]], [[FLOW4:%.*]] ]
+; CHECK-NEXT:    br label [[B:%.*]]
+; CHECK:       B:
+; CHECK-NEXT:    [[INNER1:%.*]] = phi i32 [ [[TMP1:%.*]], [[FLOW1:%.*]] ], [ 0, [[A]] ]
+; CHECK-NEXT:    br i1 [[PREDB_INV]], label [[C:%.*]], label [[FLOW:%.*]]
+; CHECK:       C:
+; CHECK-NEXT:    tail call fastcc void @check(i32 1) #[[ATTR0]]
+; CHECK-NEXT:    br label [[FLOW]]
+; CHECK:       Flow:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i1 [ false, [[C]] ], [ true, [[B]] ]
+; CHECK-NEXT:    br i1 [[TMP0]], label [[D:%.*]], label [[FLOW1]]
+; CHECK:       D:
+; CHECK-NEXT:    br i1 [[PREDD:%.*]], label [[E:%.*]], label [[FLOW2:%.*]]
+; CHECK:       Flow1:
+; CHECK-NEXT:    [[TMP1]] = phi i32 [ [[TMP8:%.*]], [[FLOW3:%.*]] ], [ undef, [[FLOW]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i1 [ [[TMP9:%.*]], [[FLOW3]] ], [ false, [[FLOW]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = phi i1 [ [[TMP10:%.*]], [[FLOW3]] ], [ true, [[FLOW]] ]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[LOOP_EXIT_GUARD1:%.*]], label [[B]]
+; CHECK:       E:
+; CHECK-NEXT:    tail call fastcc void @check(i32 2) #[[ATTR0]]
+; CHECK-NEXT:    br label [[FLOW2]]
+; CHECK:       Flow2:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i1 [ false, [[E]] ], [ true, [[D]] ]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[F:%.*]], label [[FLOW3]]
+; CHECK:       F:
+; CHECK-NEXT:    [[INNER2:%.*]] = add i32 [[INNER1]], 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp uge i32 [[INNER2]], 20
+; CHECK-NEXT:    br label [[FLOW3]]
+; CHECK:       G:
+; CHECK-NEXT:    [[OUTER2:%.*]] = add i32 [[OUTER1]], 1
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp uge i32 [[OUTER2]], 10
+; CHECK-NEXT:    br label [[FLOW4]]
+; CHECK:       H:
+; CHECK-NEXT:    unreachable
+; CHECK:       I:
+; CHECK-NEXT:    ret void
+; CHECK:       Flow4:
+; CHECK-NEXT:    [[TMP5]] = phi i32 [ [[OUTER2]], [[G:%.*]] ], [ undef, [[LOOP_EXIT_GUARD1]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = phi i1 [ true, [[G]] ], [ false, [[LOOP_EXIT_GUARD1]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = phi i1 [ [[CMP2]], [[G]] ], [ true, [[LOOP_EXIT_GUARD1]] ]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[LOOP_EXIT_GUARD:%.*]], label [[A]]
+; CHECK:       loop.exit.guard:
+; CHECK-NEXT:    br i1 [[TMP6]], label [[I]], label [[H:%.*]]
+; CHECK:       Flow3:
+; CHECK-NEXT:    [[TMP8]] = phi i32 [ [[INNER2]], [[F]] ], [ undef, [[FLOW2]] ]
+; CHECK-NEXT:    [[TMP9]] = phi i1 [ true, [[F]] ], [ false, [[FLOW2]] ]
+; CHECK-NEXT:    [[TMP10]] = phi i1 [ [[CMP1]], [[F]] ], [ true, [[FLOW2]] ]
+; CHECK-NEXT:    br label [[FLOW1]]
+; CHECK:       loop.exit.guard1:
+; CHECK-NEXT:    br i1 [[TMP2]], label [[G]], label [[FLOW4]]
+;
+entry:
+  br i1 %PredEntry, label %A, label %I
+
+; A is the outer loop header block.
+A:
+  %outer1 = phi i32 [ 0, %entry ], [ %outer2, %G ]
+  br label %B
+
+; B is the inner loop header block.
+B:
+  %inner1 = phi i32 [ 0, %A ], [ %inner2, %F ]
+  br i1 %PredB, label %D, label %C
+
+; C is a loop exit block. After reorderNodes in StructureCFG, this block
+; remains after B rather and is not dependent on the Flow block that contains
+; the back edge to the loop header.
+C:
+  tail call fastcc void @check(i32 1) #0
+  br label %loop.exit.guard1
+
+D:
+  br i1 %PredD, label %E, label %F
+
+; E is a loop exit block. After reorderNodes in StructurizeCFG, this block
+; remains in the loop and is not dependent on the Flow block that contains
+; the back edge to the loop header.
+E:
+  tail call fastcc void @check(i32 2) #0
+  br label %loop.exit.guard1
+
+; F is a latch block for the inner loop.
+F:
+  %inner2 = add i32 %inner1, 1
+  %cmp1 = icmp ult i32 %inner2, 20
+  br i1 %cmp1, label %B, label %loop.exit.guard1
+
+; G is a latch block for the outer loop.
+G:
+  %outer2 = add i32 %outer1, 1
+  %cmp2 = icmp ult i32 %outer2, 10
+  br i1 %cmp2, label %A, label %loop.exit.guard
+
+; Paths from the exit blocks come here through loop.exit.gaurd.
+H:
+  unreachable
+
+I:
+  ret void
+
+; loop.exit.guard is added by UnifyLoopExits as a common block for all loop
+; exits. This instance is for the outer loop.
+loop.exit.guard:
+  %Guard.I = phi i1 [ true, %G ], [ %Guard.I.moved, %loop.exit.guard1 ]
+  br i1 %Guard.I, label %I, label %H
+
+; loop.exit.guard is added for the inner loop.
+loop.exit.guard1:
+  %Guard.G = phi i1 [ true, %F ], [ false, %C ], [ false, %E ]
+  %Guard.I.moved = phi i1 [ undef, %F ], [ false, %C ], [ false, %E ]
+  br i1 %Guard.G, label %G, label %loop.exit.guard
+}
+
+; Test when the common successor is not unreachable. In particular, the common
+; successor is inside the region. In this case, it is still helpful to keep
+; the exit blocks inside the loop to reduce live values that exit the loop.
+
+define void @common_exit(i1 %PredEntry, i1 %PredA, i1 %PredB) {
+; CHECK-LABEL: @common_exit(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PREDB_INV:%.*]] = xor i1 [[PREDB:%.*]], true
+; CHECK-NEXT:    [[PREDA_INV:%.*]] = xor i1 [[PREDA:%.*]], true
+; CHECK-NEXT:    br i1 [[PREDENTRY:%.*]], label [[A:%.*]], label [[G:%.*]]
+; CHECK:       A:
+; CHECK-NEXT:    [[INC1:%.*]] = phi i32 [ [[TMP4:%.*]], [[FLOW:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br i1 [[PREDA_INV]], label [[C:%.*]], label [[FLOW]]
+; CHECK:       Flow3:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i1 [ true, [[D:%.*]] ], [ false, [[FLOW2:%.*]] ]
+; CHECK-NEXT:    br i1 [[TMP6:%.*]], label [[B:%.*]], label [[FLOW4:%.*]]
+; CHECK:       B:
+; CHECK-NEXT:    tail call fastcc void @check1(i32 1) #[[ATTR1:[0-9]+]]
+; CHECK-NEXT:    br label [[FLOW4]]
+; CHECK:       C:
+; CHECK-NEXT:    br i1 [[PREDB_INV]], label [[E:%.*]], label [[FLOW1:%.*]]
+; CHECK:       Flow1:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[INC2:%.*]], [[E]] ], [ undef, [[C]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i1 [ [[CMP:%.*]], [[E]] ], [ true, [[C]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = phi i1 [ false, [[E]] ], [ true, [[C]] ]
+; CHECK-NEXT:    br label [[FLOW]]
+; CHECK:       Flow2:
+; CHECK-NEXT:    br i1 [[TMP7:%.*]], label [[D]], label [[FLOW3:%.*]]
+; CHECK:       D:
+; CHECK-NEXT:    tail call fastcc void @check1(i32 2) #[[ATTR1]]
+; CHECK-NEXT:    br label [[FLOW3]]
+; CHECK:       Flow:
+; CHECK-NEXT:    [[TMP4]] = phi i32 [ [[TMP1]], [[FLOW1]] ], [ undef, [[A]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi i1 [ [[TMP2]], [[FLOW1]] ], [ true, [[A]] ]
+; CHECK-NEXT:    [[TMP6]] = phi i1 [ false, [[FLOW1]] ], [ true, [[A]] ]
+; CHECK-NEXT:    [[TMP7]] = phi i1 [ [[TMP3]], [[FLOW1]] ], [ false, [[A]] ]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[FLOW2]], label [[A]]
+; CHECK:       E:
+; CHECK-NEXT:    [[INC2]] = add i32 [[INC1]], 1
+; CHECK-NEXT:    [[CMP]] = icmp uge i32 [[INC2]], 10
+; CHECK-NEXT:    br label [[FLOW1]]
+; CHECK:       Flow4:
+; CHECK-NEXT:    [[TMP8:%.*]] = phi i1 [ true, [[B]] ], [ [[TMP0]], [[FLOW3]] ]
+; CHECK-NEXT:    br i1 [[TMP8]], label [[F:%.*]], label [[FLOW5:%.*]]
+; CHECK:       F:
+; CHECK-NEXT:    br label [[FLOW5]]
+; CHECK:       Flow5:
+; CHECK-NEXT:    br label [[G]]
+; CHECK:       G:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 %PredEntry, label %A, label %G
+
+; A is the loop header.
+A:
+  %inc1 = phi i32 [ 0, %entry ], [ %inc2, %E ]
+  br i1 %PredA, label %B, label %C
+
+; B is a loop exit block. After reorderNodes in StructurizeCFG, this block
+; remains after A rather than moving to after the loop due to a dependence on
+; the Flow block that contains the loop back edge.
+B:
+  tail call fastcc void @check1(i32 1) #1
+  br label %F
+
+C:
+  br i1 %PredB, label %D, label %E
+
+; D is a loop exit block. After reorderNodes, this block remains in the loop
+; rather than moving to after the loop due to a dependence on the Flow block
+; that contains the loop back edge.
+D:
+  tail call fastcc void @check1(i32 2) #1
+  br label %F
+
+; E is a latch and exiting block.
+E:
+  %inc2 = add i32 %inc1, 1
+  %cmp = icmp ult i32 %inc2, 10
+  br i1 %cmp, label %A, label %G
+
+; F is reached by the loop exit blocks, B and D.
+F:
+  br label %G
+
+G:
+  ret void
+}
+
+; Test when there are many loop exiting blocks. Previous tests have two exiting
+; blocks. This test contains five. Each of the exiting blocks remain in the loop
+; rather than being moved to after the loop.
+
+define void @many_exiting_blocks(i1 %PredEntry, i1 %PredA, i1 %PredC, i1 %PredE, i1 %PredG) {
+; CHECK-LABEL: @many_exiting_blocks(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[PREDENTRY:%.*]], label [[A:%.*]], label [[FLOW11:%.*]]
+; CHECK:       A:
+; CHECK-NEXT:    [[INC1:%.*]] = phi i32 [ [[TMP1:%.*]], [[FLOW4:%.*]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br i1 [[PREDA:%.*]], label [[B:%.*]], label [[FLOW3:%.*]]
+; CHECK:       B:
+; CHECK-NEXT:    tail call fastcc void @check(i32 1) #[[ATTR0]]
+; CHECK-NEXT:    br label [[FLOW3]]
+; CHECK:       Flow3:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i1 [ false, [[B]] ], [ true, [[A]] ]
+; CHECK-NEXT:    br i1 [[TMP0]], label [[C:%.*]], label [[FLOW4]]
+; CHECK:       C:
+; CHECK-NEXT:    br i1 [[PREDC:%.*]], label [[D:%.*]], label [[FLOW5:%.*]]
+; CHECK:       Flow4:
+; CHECK-NEXT:    [[TMP1]] = phi i32 [ [[TMP6:%.*]], [[FLOW6:%.*]] ], [ undef, [[FLOW3]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i1 [ [[TMP7:%.*]], [[FLOW6]] ], [ true, [[FLOW3]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = phi i1 [ [[TMP8:%.*]], [[FLOW6]] ], [ false, [[FLOW3]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i1 [ [[TMP9:%.*]], [[FLOW6]] ], [ true, [[FLOW3]] ]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[LOOP_EXIT_GUARD:%.*]], label [[A]]
+; CHECK:       D:
+; CHECK-NEXT:    tail call fastcc void @check(i32 2) #[[ATTR0]]
+; CHECK-NEXT:    br label [[FLOW5]]
+; CHECK:       Flow5:
+; CHECK-NEXT:    [[TMP5:%.*]] = phi i1 [ false, [[D]] ], [ true, [[C]] ]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[E:%.*]], label [[FLOW6]]
+; CHECK:       E:
+; CHECK-NEXT:    br i1 [[PREDE:%.*]], label [[F:%.*]], label [[FLOW7:%.*]]
+; CHECK:       Flow6:
+; CHECK-NEXT:    [[TMP6]] = phi i32 [ [[TMP12:%.*]], [[FLOW8:%.*]] ], [ undef, [[FLOW5]] ]
+; CHECK-NEXT:    [[TMP7]] = phi i1 [ [[TMP13:%.*]], [[FLOW8]] ], [ true, [[FLOW5]] ]
+; CHECK-NEXT:    [[TMP8]] = phi i1 [ [[TMP14:%.*]], [[FLOW8]] ], [ false, [[FLOW5]] ]
+; CHECK-NEXT:    [[TMP9]] = phi i1 [ [[TMP15:%.*]], [[FLOW8]] ], [ true, [[FLOW5]] ]
+; CHECK-NEXT:    br label [[FLOW4]]
+; CHECK:       F:
+; CHECK-NEXT:    tail call fastcc void @check(i32 3) #[[ATTR0]]
+; CHECK-NEXT:    br label [[FLOW7]]
+; CHECK:       Flow7:
+; CHECK-NEXT:    [[TMP10:%.*]] = phi i1 [ false, [[F]] ], [ true, [[E]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = phi i1 [ false, [[F]] ], [ true, [[E]] ]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[G:%.*]], label [[FLOW8]]
+; CHECK:       G:
+; CHECK-NEXT:    br i1 [[PREDG:%.*]], label [[H:%.*]], label [[FLOW9:%.*]]
+; CHECK:       Flow8:
+; CHECK-NEXT:    [[TMP12]] = phi i32 [ [[TMP19:%.*]], [[FLOW10:%.*]] ], [ undef, [[FLOW7]] ]
+; CHECK-NEXT:    [[TMP13]] = phi i1 [ [[TMP20:%.*]], [[FLOW10]] ], [ [[TMP10]], [[FLOW7]] ]
+; CHECK-NEXT:    [[TMP14]] = phi i1 [ [[TMP21:%.*]], [[FLOW10]] ], [ false, [[FLOW7]] ]
+; CHECK-NEXT:    [[TMP15]] = phi i1 [ [[TMP22:%.*]], [[FLOW10]] ], [ true, [[FLOW7]] ]
+; CHECK-NEXT:    br label [[FLOW6]]
+; CHECK:       H:
+; CHECK-NEXT:    tail call fastcc void @check(i32 4) #[[ATTR0]]
+; CHECK-NEXT:    br label [[FLOW9]]
+; CHECK:       Flow9:
+; CHECK-NEXT:    [[TMP16:%.*]] = phi i1 [ false, [[H]] ], [ [[TMP10]], [[G]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = phi i1 [ false, [[H]] ], [ true, [[G]] ]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[I:%.*]], label [[FLOW10]]
+; CHECK:       I:
+; CHECK-NEXT:    [[INC2:%.*]] = add i32 [[INC1]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp uge i32 [[INC2]], 10
+; CHECK-NEXT:    br label [[FLOW10]]
+; CHECK:       Flow:
+; CHECK-NEXT:    [[TMP18:%.*]] = phi i1 [ false, [[K:%.*]] ], [ true, [[LOOP_EXIT_GUARD1:%.*]] ]
+; CHECK-NEXT:    br i1 [[TMP18]], label [[J:%.*]], label [[FLOW1:%.*]]
+; CHECK:       J:
+; CHECK-NEXT:    br label [[FLOW1]]
+; CHECK:       K:
+; CHECK-NEXT:    br label [[FLOW:%.*]]
+; CHECK:       Flow1:
+; CHECK-NEXT:    br label [[FLOW2:%.*]]
+; CHECK:       Flow2:
+; CHECK-NEXT:    br label [[FLOW11]]
+; CHECK:       L:
+; CHECK-NEXT:    ret void
+; CHECK:       Flow10:
+; CHECK-NEXT:    [[TMP19]] = phi i32 [ [[INC2]], [[I]] ], [ undef, [[FLOW9]] ]
+; CHECK-NEXT:    [[TMP20]] = phi i1 [ false, [[I]] ], [ [[TMP16]], [[FLOW9]] ]
+; CHECK-NEXT:    [[TMP21]] = phi i1 [ true, [[I]] ], [ false, [[FLOW9]] ]
+; CHECK-NEXT:    [[TMP22]] = phi i1 [ [[CMP]], [[I]] ], [ true, [[FLOW9]] ]
+; CHECK-NEXT:    br label [[FLOW8]]
+; CHECK:       Flow11:
+; CHECK-NEXT:    br label [[L:%.*]]
+; CHECK:       loop.exit.guard:
+; CHECK-NEXT:    [[GUARD_L_INV:%.*]] = xor i1 [[TMP3]], true
+; CHECK-NEXT:    [[GUARD_J_INV:%.*]] = xor i1 [[TMP2]], true
+; CHECK-NEXT:    br i1 [[GUARD_L_INV]], label [[LOOP_EXIT_GUARD1]], label [[FLOW2]]
+; CHECK:       loop.exit.guard1:
+; CHECK-NEXT:    br i1 [[GUARD_J_INV]], label [[K]], label [[FLOW]]
+;
+entry:
+  br i1 %PredEntry, label %A, label %L
+
+; A is the loop header block.
+A:
+  %inc1 = phi i32 [ 0, %entry ], [ %inc2, %I ]
+  br i1 %PredA, label %B, label %C
+
+; B is a loop exit block. After reorderNodes in StructurizeCFG, this block
+; remains after A rather than moving to after the loop due to a dependence
+; on the Flow block that contains the loop back edge.
+B:
+  tail call fastcc void @check(i32 1) #0
+  br label %loop.exit.guard
+
+C:
+  br i1 %PredC, label %D, label %E
+
+; D is a loop exit block. After reorderNodes, this block remains in the loop
+; rather than moving to after the loop.
+D:
+  tail call fastcc void @check(i32 2) #0
+  br label %loop.exit.guard
+
+E:
+  br i1 %PredE, label %F, label %G
+
+; F is a loop exit block. After reorderNodes, this block remains in the loop
+; rather than moving to after the loop.
+F:
+  tail call fastcc void @check(i32 3) #0
+  br label %loop.exit.guard
+
+G:
+  br i1 %PredG, label %H, label %I
+
+; H is a loop exit block. After reorderNodes, this block remains in the loop
+; rather than moving to after the loop.
+H:
+  tail call fastcc void @check(i32 4) #0
+  br label %loop.exit.guard
+
+; I is a latch block.
+I:
+  %inc2 = add i32 %inc1, 1
+  %cmp = icmp ult i32 %inc2, 10
+  br i1 %cmp, label %A, label %loop.exit.guard
+
+J:
+  br label %L
+
+K:
+  br label %L
+
+L:
+  ret void
+
+loop.exit.guard:
+  %Guard.L = phi i1 [ true, %I ], [ false, %B ], [ false, %D ], [ false, %F ], [ false, %H ]
+  %Guard.J = phi i1 [ false, %I ], [ true, %B ], [ true, %D ], [ false, %F ], [ false, %H ]
+  br i1 %Guard.L, label %L, label %loop.exit.guard1
+
+loop.exit.guard1:
+  br i1 %Guard.J, label %J, label %K
+}
+
+declare void @check(i32 noundef) #0
+declare void @check1(i32 noundef) #1
+
+attributes #0 = { noreturn nounwind }
+attributes #1 = { nounwind }
+
+
+; When there are two blocks that can be moved closer to the predecessor,
+; maintain the original, relative order since swapping the order should not
+; provide any benefit. In this case both B and C have a single predecessor
+; and a single successor.
+
+define void @same_predecessor(i1 %PredA) {
+; CHECK-LABEL: @same_predecessor(
+; CHECK-NEXT:  A:
+; CHECK-NEXT:    [[PREDA_INV:%.*]] = xor i1 [[PREDA:%.*]], true
+; CHECK-NEXT:    br i1 [[PREDA_INV]], label [[B:%.*]], label [[FLOW:%.*]]
+; CHECK:       B:
+; CHECK-NEXT:    br label [[FLOW]]
+; CHECK:       Flow:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i1 [ false, [[B]] ], [ true, [[A:%.*]] ]
+; CHECK-NEXT:    br i1 [[TMP0]], label [[C:%.*]], label [[D:%.*]]
+; CHECK:       C:
+; CHECK-NEXT:    br label [[D]]
+; CHECK:       D:
+; CHECK-NEXT:    ret void
+;
+A:
+  br i1 %PredA, label %C, label %B
+
+B:
+  br label %D
+
+C:
+  br label %D
+
+D:
+  ret void
+}
+
+