[llvm] [LoopUnswitch] Improve algorithm of unswitch cost estimation (PR #106544)

Thu Aug 29 04:57:13 PDT 2024

https://github.com/skachkov-sc created https://github.com/llvm/llvm-project/pull/106544

**Motivation**
The current cost calculation algorithm in SimpleLoopUnswitch can overestimate the cost for loops that contain switches with fallthrough statements. Example:
```
void test(int n, int x) {
    for(int i = 0; i < n; ++i)
        switch(x) {
        case 1:
            foo();
            break;
        case 2:
            bar();
            [[fallthrough]];
        case 3:
            baz();
        }
}
```
(godbolt link: https://godbolt.org/z/nEx69fW6M)
The current approach is the following: find the set of blocks that will be present only once after loop cloning (this is done by taking the DominatorTree subtrees of those successors of the unswitch candidate that are dominated by the edge leading to them); let's call their total cost CostOfBlocksNotCloned, and the total cost of the loop body LoopCost. In that case, the unswitching cost is:
```
UnswitchCost = (LoopCost - CostOfBlocksNotCloned) * (NumClones - 1)
```
The idea here is that LoopCost - CostOfBlocksNotCloned gives the cost of the blocks that will be cloned, which is then multiplied by NumClones - 1 (because these blocks are already present in the original loop, we subtract one to find the additional cost of unswitching). However, this approach assumes that each basic block in the loop will be present either exactly once or exactly NumClones times after unswitching, and this is wrong for more complex CFGs with switches and fallthrough statements. In the given example, case 3 (baz) is not dominated by any edge from the switch statement, so the current algorithm estimates it as being cloned 3 times; but it will actually be present only twice (in the unswitched loops for case 2 and case 3). If we take this fact into account, the cost of unswitching can be reduced from 18 to 14, and in some situations this can unblock unswitching of loops with big switch statements (the overestimation can be significant for switches with a large number of cases).

**Proposed solution**
The new algorithm tries to precisely estimate how many times each loop block will be cloned (NumDuplicationsMap contains the number of copies of each BB). For each distinct successor of the unswitch candidate, we perform a DFS traversal of the loop starting from its header, skipping the CFG edges that are not present in the corresponding unswitched loop clone. For each reachable block we increment its entry in NumDuplicationsMap, and then calculate the unswitch cost as the sum, over all blocks, of the block's cost multiplied by the number of its copies minus one.

Results on llvm-test-suite (the only changed test is lencod):
```
Program                                                      simple-loop-unswitch.NumSwitches                     
                                                             before                           after          diff 
test-suite :: MultiSource/Applications/JM/lencod/lencod.test   2.00                             3.00         50.0%
```
It is also worth noting that the simple-loop-unswitch.NumBranches metric doesn't change; this means that the new algorithm is equivalent to the old one for unswitch candidates with 2 successors.


>From 048b08c948debcce9f6799ba8ec8270c0e009251 Mon Sep 17 00:00:00 2001
From: Sergey Kachkov <sergey.kachkov at syntacore.com>
Date: Fri, 2 Aug 2024 15:25:34 +0300
Subject: [PATCH 1/2] [LoopUnswitch][NFC] Add pre-commit test

---
 .../SimpleLoopUnswitch/switch-cost-model.ll   | 72 +++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 llvm/test/Transforms/SimpleLoopUnswitch/switch-cost-model.ll

diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/switch-cost-model.ll b/llvm/test/Transforms/SimpleLoopUnswitch/switch-cost-model.ll
new file mode 100644
index 00000000000000..8f48de614d1cd0
--- /dev/null
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/switch-cost-model.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes='simple-loop-unswitch<nontrivial>' -unswitch-threshold=15 -S < %s | FileCheck %s
+
+declare void @foo1()
+declare void @foo2()
+declare void @foo3()
+
+define void @test(i32 %n, i32 %x) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: i32 [[N:%.*]], i32 [[X:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    switch i32 [[X]], label [[FOR_INC]] [
+; CHECK-NEXT:      i32 1, label [[SW_BB1:%.*]]
+; CHECK-NEXT:      i32 2, label [[SW_BB2:%.*]]
+; CHECK-NEXT:      i32 3, label [[SW_BB3:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       sw.bb1:
+; CHECK-NEXT:    call void @foo1()
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       sw.bb2:
+; CHECK-NEXT:    call void @foo2()
+; CHECK-NEXT:    br label [[SW_BB3]]
+; CHECK:       sw.bb3:
+; CHECK-NEXT:    call void @foo3()
+; CHECK-NEXT:    br label [[FOR_INC]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp sgt i32 %n, 0
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.body:
+  %i = phi i32 [ %inc, %for.inc ], [ 0, %entry ]
+  switch i32 %x, label %for.inc [
+  i32 1, label %sw.bb1
+  i32 2, label %sw.bb2
+  i32 3, label %sw.bb3
+  ]
+
+sw.bb1:
+  call void @foo1()
+  br label %for.inc
+
+sw.bb2:
+  call void @foo2()
+  br label %sw.bb3
+
+sw.bb3:
+  call void @foo3()
+  br label %for.inc
+
+for.inc:
+  %inc = add nuw nsw i32 %i, 1
+  %exitcond.not = icmp eq i32 %inc, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}

>From 385192592af76fd8d368098355c0a94ca4dac3cf Mon Sep 17 00:00:00 2001
From: Sergey Kachkov <sergey.kachkov at syntacore.com>
Date: Thu, 1 Aug 2024 18:02:22 +0300
Subject: [PATCH 2/2] [LoopUnswitch] Improve algorithm of unswitch cost
 estimation

---
 .../Transforms/Scalar/SimpleLoopUnswitch.cpp  | 145 ++++++++----------
 .../SimpleLoopUnswitch/switch-cost-model.ll   |  70 +++++++--
 2 files changed, 120 insertions(+), 95 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index c235d2fb2a5bd4..411f80dfb07d20 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -2651,38 +2651,29 @@ static void unswitchNontrivialInvariants(
     ++NumSwitches;
 }
 
-/// Recursively compute the cost of a dominator subtree based on the per-block
-/// cost map provided.
-///
-/// The recursive computation is memozied into the provided DT-indexed cost map
-/// to allow querying it for most nodes in the domtree without it becoming
-/// quadratic.
-static InstructionCost computeDomSubtreeCost(
-    DomTreeNode &N,
-    const SmallDenseMap<BasicBlock *, InstructionCost, 4> &BBCostMap,
-    SmallDenseMap<DomTreeNode *, InstructionCost, 4> &DTCostMap) {
-  // Don't accumulate cost (or recurse through) blocks not in our block cost
-  // map and thus not part of the duplication cost being considered.
-  auto BBCostIt = BBCostMap.find(N.getBlock());
-  if (BBCostIt == BBCostMap.end())
-    return 0;
-
-  // Lookup this node to see if we already computed its cost.
-  auto DTCostIt = DTCostMap.find(&N);
-  if (DTCostIt != DTCostMap.end())
-    return DTCostIt->second;
-
-  // If not, we have to compute it. We can't use insert above and update
-  // because computing the cost may insert more things into the map.
-  InstructionCost Cost = std::accumulate(
-      N.begin(), N.end(), BBCostIt->second,
-      [&](InstructionCost Sum, DomTreeNode *ChildN) -> InstructionCost {
-        return Sum + computeDomSubtreeCost(*ChildN, BBCostMap, DTCostMap);
-      });
-  bool Inserted = DTCostMap.insert({&N, Cost}).second;
-  (void)Inserted;
-  assert(Inserted && "Should not insert a node while visiting children!");
-  return Cost;
+// Determine which loop blocks are reachable after deletion of all successor
+// edges of BB except the ones enumerated into Succs list.
+static void findDuplicatedBlocks(
+    SmallDenseMap<BasicBlock *, unsigned, 4> &NumDuplicationsMap,
+    BasicBlock *BB, ArrayRef<BasicBlock *> Succs, const Loop &L) {
+  SmallVector<BasicBlock *, 4> Worklist({L.getHeader()});
+  SmallPtrSet<BasicBlock *, 4> Visited;
+
+  while (!Worklist.empty()) {
+    auto *CurBB = Worklist.pop_back_val();
+    if (Visited.contains(CurBB))
+      continue;
+    ++NumDuplicationsMap[CurBB];
+    Visited.insert(CurBB);
+    auto AddLoopBlocks = [&](auto Blocks) {
+      copy_if(Blocks, std::back_inserter(Worklist),
+              [&](auto *Block) { return L.contains(Block); });
+    };
+    if (CurBB == BB)
+      AddLoopBlocks(Succs);
+    else
+      AddLoopBlocks(successors(CurBB));
+  }
 }
 
 /// Turns a select instruction into implicit control flow branch,
@@ -3360,10 +3351,8 @@ static NonTrivialUnswitchCandidate findBestNonTrivialUnswitchCandidate(
   // We prioritize reducing fanout of unswitch candidates provided the cost
   // remains below the threshold because this has a multiplicative effect.
   //
-  // This requires memoizing each dominator subtree to avoid redundant work.
-  //
   // FIXME: Need to actually do the number of candidates part above.
-  SmallDenseMap<DomTreeNode *, InstructionCost, 4> DTCostMap;
+
   // Given a terminator which might be unswitched, computes the non-duplicated
   // cost for that terminator.
   auto ComputeUnswitchedCost = [&](Instruction &TI,
@@ -3372,59 +3361,51 @@ static NonTrivialUnswitchCandidate findBestNonTrivialUnswitchCandidate(
     if (isa<SelectInst>(TI))
       return LoopCost;
 
-    BasicBlock &BB = *TI.getParent();
-    SmallPtrSet<BasicBlock *, 4> Visited;
+    BasicBlock *BB = TI.getParent();
+    SmallDenseMap<BasicBlock *, unsigned, 4> NumDuplicationsMap;
+
+    // If this is a partial unswitch candidate, then it must be a conditional
+    // branch with a condition of either `or`, `and`, their corresponding
+    // select forms or partially invariant instructions. In that case, one of
+    // the successors is necessarily duplicated; remember it so it will be
+    // presented in every unswitched copy of the loop.
+    BasicBlock *AlwaysDuplicated = nullptr;
+    if (!FullUnswitch) {
+      auto &BI = cast<BranchInst>(TI);
+      Value *Cond = skipTrivialSelect(BI.getCondition());
+      if (match(Cond, m_LogicalAnd()))
+        AlwaysDuplicated = BI.getSuccessor(1);
+      else if (match(Cond, m_LogicalOr()))
+        AlwaysDuplicated = BI.getSuccessor(0);
+      else
+        AlwaysDuplicated =
+            BI.getSuccessor(PartialIVInfo.KnownValue->isOneValue() ? 0 : 1);
+    }
 
-    InstructionCost Cost = 0;
-    for (BasicBlock *SuccBB : successors(&BB)) {
+    SmallPtrSet<BasicBlock *, 4> VisitedSuccs;
+    for (BasicBlock *SuccBB : successors(BB)) {
       // Don't count successors more than once.
-      if (!Visited.insert(SuccBB).second)
+      if (!VisitedSuccs.insert(SuccBB).second)
         continue;
+      SmallVector<BasicBlock *, 2> Succs;
+      Succs.push_back(SuccBB);
+      if (AlwaysDuplicated && AlwaysDuplicated != SuccBB)
+        Succs.push_back(AlwaysDuplicated);
 
-      // If this is a partial unswitch candidate, then it must be a conditional
-      // branch with a condition of either `or`, `and`, their corresponding
-      // select forms or partially invariant instructions. In that case, one of
-      // the successors is necessarily duplicated, so don't even try to remove
-      // its cost.
-      if (!FullUnswitch) {
-        auto &BI = cast<BranchInst>(TI);
-        Value *Cond = skipTrivialSelect(BI.getCondition());
-        if (match(Cond, m_LogicalAnd())) {
-          if (SuccBB == BI.getSuccessor(1))
-            continue;
-        } else if (match(Cond, m_LogicalOr())) {
-          if (SuccBB == BI.getSuccessor(0))
-            continue;
-        } else if ((PartialIVInfo.KnownValue->isOneValue() &&
-                    SuccBB == BI.getSuccessor(0)) ||
-                   (!PartialIVInfo.KnownValue->isOneValue() &&
-                    SuccBB == BI.getSuccessor(1)))
-          continue;
-      }
-
-      // This successor's domtree will not need to be duplicated after
-      // unswitching if the edge to the successor dominates it (and thus the
-      // entire tree). This essentially means there is no other path into this
-      // subtree and so it will end up live in only one clone of the loop.
-      if (SuccBB->getUniquePredecessor() ||
-          llvm::all_of(predecessors(SuccBB), [&](BasicBlock *PredBB) {
-            return PredBB == &BB || DT.dominates(SuccBB, PredBB);
-          })) {
-        Cost += computeDomSubtreeCost(*DT[SuccBB], BBCostMap, DTCostMap);
-        assert(Cost <= LoopCost &&
-               "Non-duplicated cost should never exceed total loop cost!");
-      }
+      findDuplicatedBlocks(NumDuplicationsMap, BB, Succs, L);
     }
 
-    // Now scale the cost by the number of unique successors minus one. We
-    // subtract one because there is already at least one copy of the entire
-    // loop. This is computing the new cost of unswitching a condition.
-    // Note that guards always have 2 unique successors that are implicit and
-    // will be materialized if we decide to unswitch it.
-    int SuccessorsCount = isGuard(&TI) ? 2 : Visited.size();
-    assert(SuccessorsCount > 1 &&
-           "Cannot unswitch a condition without multiple distinct successors!");
-    return (LoopCost - Cost) * (SuccessorsCount - 1);
+    // Accumulate the cost of each basic block scaled by the number of its
+    // duplications minus one. We subtract one because there is already at least
+    // one copy of the entire loop. This is computing the new cost of
+    // unswitching a condition. Note that guards always have 2 unique successors
+    // that are implicit and will be materialized if we decide to unswitch it.
+    return std::accumulate(NumDuplicationsMap.begin(), NumDuplicationsMap.end(),
+                           InstructionCost(),
+                           [&](InstructionCost Cost, auto Val) {
+                             auto [BB, Num] = Val;
+                             return Cost + BBCostMap[BB] * (Num - 1);
+                           });
   };
 
   std::optional<NonTrivialUnswitchCandidate> Best;
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/switch-cost-model.ll b/llvm/test/Transforms/SimpleLoopUnswitch/switch-cost-model.ll
index 8f48de614d1cd0..988b17605c16d9 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/switch-cost-model.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/switch-cost-model.ll
@@ -12,27 +12,71 @@ define void @test(i32 %n, i32 %x) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[N]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    switch i32 [[X]], label [[FOR_INC]] [
-; CHECK-NEXT:      i32 1, label [[SW_BB1:%.*]]
-; CHECK-NEXT:      i32 2, label [[SW_BB2:%.*]]
-; CHECK-NEXT:      i32 3, label [[SW_BB3:%.*]]
+; CHECK-NEXT:    switch i32 [[X]], label [[FOR_BODY_PREHEADER_SPLIT:%.*]] [
+; CHECK-NEXT:      i32 1, label [[FOR_BODY_PREHEADER_SPLIT_US:%.*]]
+; CHECK-NEXT:      i32 2, label [[FOR_BODY_PREHEADER_SPLIT_US1:%.*]]
+; CHECK-NEXT:      i32 3, label [[FOR_BODY_PREHEADER_SPLIT_US8:%.*]]
 ; CHECK-NEXT:    ]
-; CHECK:       sw.bb1:
+; CHECK:       for.body.preheader.split.us:
+; CHECK-NEXT:    br label [[FOR_BODY_US:%.*]]
+; CHECK:       for.body.us:
+; CHECK-NEXT:    [[I_US:%.*]] = phi i32 [ [[INC_US:%.*]], [[FOR_INC_US:%.*]] ], [ 0, [[FOR_BODY_PREHEADER_SPLIT_US]] ]
+; CHECK-NEXT:    br label [[SW_BB1_US:%.*]]
+; CHECK:       sw.bb1.us:
 ; CHECK-NEXT:    call void @foo1()
-; CHECK-NEXT:    br label [[FOR_INC]]
-; CHECK:       sw.bb2:
+; CHECK-NEXT:    br label [[FOR_INC_US]]
+; CHECK:       for.inc.us:
+; CHECK-NEXT:    [[INC_US]] = add nuw nsw i32 [[I_US]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT_US:%.*]] = icmp eq i32 [[INC_US]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT_US]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_US:%.*]], label [[FOR_BODY_US]]
+; CHECK:       for.cond.cleanup.loopexit.split.us:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK:       for.body.preheader.split.us1:
+; CHECK-NEXT:    br label [[FOR_BODY_US2:%.*]]
+; CHECK:       for.body.us2:
+; CHECK-NEXT:    [[I_US3:%.*]] = phi i32 [ [[INC_US6:%.*]], [[FOR_INC_US5:%.*]] ], [ 0, [[FOR_BODY_PREHEADER_SPLIT_US1]] ]
+; CHECK-NEXT:    br label [[SW_BB2_US:%.*]]
+; CHECK:       sw.bb2.us:
 ; CHECK-NEXT:    call void @foo2()
-; CHECK-NEXT:    br label [[SW_BB3]]
-; CHECK:       sw.bb3:
+; CHECK-NEXT:    br label [[SW_BB3_US4:%.*]]
+; CHECK:       sw.bb3.us4:
 ; CHECK-NEXT:    call void @foo3()
+; CHECK-NEXT:    br label [[FOR_INC_US5]]
+; CHECK:       for.inc.us5:
+; CHECK-NEXT:    [[INC_US6]] = add nuw nsw i32 [[I_US3]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT_US7:%.*]] = icmp eq i32 [[INC_US6]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT_US7]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_SPLIT_US:%.*]], label [[FOR_BODY_US2]]
+; CHECK:       for.cond.cleanup.loopexit.split.split.us:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT:%.*]]
+; CHECK:       for.body.preheader.split.us8:
+; CHECK-NEXT:    br label [[FOR_BODY_US9:%.*]]
+; CHECK:       for.body.us9:
+; CHECK-NEXT:    [[I_US10:%.*]] = phi i32 [ [[INC_US13:%.*]], [[FOR_INC_US12:%.*]] ], [ 0, [[FOR_BODY_PREHEADER_SPLIT_US8]] ]
+; CHECK-NEXT:    br label [[SW_BB3_US11:%.*]]
+; CHECK:       sw.bb3.us11:
+; CHECK-NEXT:    call void @foo3()
+; CHECK-NEXT:    br label [[FOR_INC_US12]]
+; CHECK:       for.inc.us12:
+; CHECK-NEXT:    [[INC_US13]] = add nuw nsw i32 [[I_US10]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT_US14:%.*]] = icmp eq i32 [[INC_US13]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT_US14]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_SPLIT_SPLIT_US:%.*]], label [[FOR_BODY_US9]]
+; CHECK:       for.cond.cleanup.loopexit.split.split.split.us:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_SPLIT:%.*]]
+; CHECK:       for.body.preheader.split:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY_PREHEADER_SPLIT]] ]
 ; CHECK-NEXT:    br label [[FOR_INC]]
 ; CHECK:       for.inc:
 ; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_SPLIT_SPLIT:%.*]], label [[FOR_BODY]]
+; CHECK:       for.cond.cleanup.loopexit.split.split.split:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_SPLIT]]
+; CHECK:       for.cond.cleanup.loopexit.split.split:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT]]
+; CHECK:       for.cond.cleanup.loopexit.split:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP_LOOPEXIT]]
 ; CHECK:       for.cond.cleanup.loopexit:
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup: