[llvm] r296060 - [CGP] Split some critical edges coming out of indirect branches

Thu Feb 23 16:56:21 PST 2017

Author: mkuper
Date: Thu Feb 23 18:56:21 2017
New Revision: 296060

URL: http://llvm.org/viewvc/llvm-project?rev=296060&view=rev
Log:
[CGP] Split some critical edges coming out of indirect branches

Splitting critical edges when one of the source edges is an indirectbr
is hard in general (because it requires changing the memory the indirectbr
reads). But if a block only has a single indirectbr predecessor (which is
the common case), we can simulate splitting that edge by splitting
the destination block, and retargeting the *direct* branches.

This is motivated by the use of computed gotos in python 2.7: PyEval_EvalFrame()
ends up using an indirect branch with ~100 successors, and passing a constant to
each of those. Since MachineSink can't break indirect critical edges on demand
(and doing this in MIR doesn't look feasible), this causes us to emit about 100
defs of registers containing constants in the predecessor block, even though
only one of those constants is used in each successor. So, at each computed goto,
we needlessly spill about 100 constants to the stack. The end result is that a
clang-compiled python interpreter can be about 2.5x slower on a simple python
reduction loop than a gcc-compiled interpreter.

Differential Revision: https://reviews.llvm.org/D29916

Added:
    llvm/trunk/test/Transforms/CodeGenPrepare/computedgoto.ll
Modified:
    llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
    llvm/trunk/test/CodeGen/ARM/indirectbr.ll
    llvm/trunk/test/CodeGen/MSP430/indirectbr2.ll
    llvm/trunk/test/CodeGen/PowerPC/indirectbr.ll

Modified: llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp?rev=296060&r1=296059&r2=296060&view=diff
==============================================================================

--- llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp (original)
+++ llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp Thu Feb 23 18:56:21 2017
@@ -15,10 +15,12 @@
 
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
@@ -53,8 +55,10 @@
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/BuildLibCalls.h"
 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
 using namespace llvm;
 using namespace llvm::PatternMatch;
 
@@ -222,6 +226,7 @@ class TypePromotionTransaction;
                         unsigned CreatedInstCost);
     bool splitBranchCondition(Function &F);
     bool simplifyOffsetableRelocate(Instruction &I);
+    bool splitIndirectCriticalEdges(Function &F);
   };
 }
 
@@ -296,6 +301,10 @@ bool CodeGenPrepare::runOnFunction(Funct
   if (!DisableBranchOpts)
     EverMadeChange |= splitBranchCondition(F);
 
+  // Split some critical edges where one of the sources is an indirect branch,
+  // to help generate sane code for PHIs involving such edges.
+  EverMadeChange |= splitIndirectCriticalEdges(F);
+
   bool MadeChange = true;
   while (MadeChange) {
     MadeChange = false;
@@ -429,6 +438,152 @@ BasicBlock *CodeGenPrepare::findDestBloc
   return DestBB;
 }
 
+// Return the unique indirectbr predecessor of BB, or null if BB has no PHIs,
+// has zero or several indirectbr incoming edges, or has a predecessor that is
+// not a br/switch/indirectbr. Direct (br/switch) predecessors are appended to
+// OtherPreds, once per PHI incoming edge; it may be partially filled on null.
+static BasicBlock *
+findIBRPredecessor(BasicBlock *BB, SmallVectorImpl<BasicBlock *> &OtherPreds) {
+  // If the block doesn't have any PHIs, we don't care about it, since there's
+  // no point in splitting it.
+  PHINode *PN = dyn_cast<PHINode>(BB->begin());
+  if (!PN)
+    return nullptr;
+
+  // Walk the predecessors via the first PHI's incoming blocks, requiring
+  // exactly one indirectbr edge. Conservatively bail out if one of the other
+  // predecessors is not a "regular" terminator (that is, not a switch or a br).
+  BasicBlock *IBB = nullptr;
+  for (unsigned Pred = 0, E = PN->getNumIncomingValues(); Pred != E; ++Pred) {
+    BasicBlock *PredBB = PN->getIncomingBlock(Pred);
+    TerminatorInst *PredTerm = PredBB->getTerminator();
+    switch (PredTerm->getOpcode()) {
+    case Instruction::IndirectBr:
+      if (IBB)
+        return nullptr;
+      IBB = PredBB;
+      break;
+    case Instruction::Br:
+    case Instruction::Switch:
+      OtherPreds.push_back(PredBB);
+      continue;
+    default:
+      return nullptr;
+    }
+  }
+
+  return IBB;
+}
+
+// Split critical edges where the source of the edge is an indirectbr
+// instruction. This isn't always possible, but we can handle some easy cases.
+// This is useful because MI is unable to split such critical edges,
+// which means it will not be able to sink instructions along those edges.
+// This is especially painful for indirect branches with many successors, where
+// we end up having to prepare all outgoing values in the origin block.
+//
+// Our normal algorithm for splitting critical edges requires us to update
+// the outgoing edges of the edge origin block, but for an indirectbr this
+// is hard, since it would require finding and updating the block addresses
+// the indirect branch uses. But if a block only has a single indirectbr
+// predecessor, with the others being regular branches, we can do it in a
+// different way.
+// Say we have A -> D, B -> D, I -> D where only I -> D is an indirectbr.
+// We can split D into D0 and D1, where D0 contains only the PHIs from D,
+// and D1 is the D block body. We can then duplicate D0 as D0A and D0B, and
+// create the following structure:
+// A -> D0A, B -> D0A, I -> D0B, D0A -> D1, D0B -> D1
+bool CodeGenPrepare::splitIndirectCriticalEdges(Function &F) {
+  // Collect every block some indirectbr may jump to. Most functions have no
+  // indirectbr, so the common case costs O(Blocks) instead of O(Edges).
+  SmallSetVector<BasicBlock *, 16> Targets;
+  for (auto &BB : F) {
+    auto *IBI = dyn_cast<IndirectBrInst>(BB.getTerminator());
+    if (!IBI)
+      continue;
+
+    for (unsigned Succ = 0, E = IBI->getNumSuccessors(); Succ != E; ++Succ)
+      Targets.insert(IBI->getSuccessor(Succ));
+  }
+
+  if (Targets.empty())
+    return false;
+
+  bool Changed = false;
+  for (BasicBlock *Target : Targets) {
+    SmallVector<BasicBlock *, 16> OtherPreds;
+    BasicBlock *IBRPred = findIBRPredecessor(Target, OtherPreds);
+    // With no direct predecessors there is nothing to split off, and emptying
+    // the cloned PHIs below would erase them while they are still in use.
+    if (!IBRPred || OtherPreds.empty())
+      continue;
+
+    // Don't even think about ehpads/landingpads.
+    Instruction *FirstNonPHI = Target->getFirstNonPHI();
+    if (FirstNonPHI->isEHPad() || Target->isLandingPad())
+      continue;
+
+    BasicBlock *BodyBlock = Target->splitBasicBlock(FirstNonPHI, ".split");
+    // It's possible Target was its own successor through an indirectbr.
+    // In this case, the indirectbr now comes from BodyBlock.
+    if (IBRPred == Target)
+      IBRPred = BodyBlock;
+
+    // Target now holds only the PHIs and BodyBlock the rest of the body.
+    // Create a copy of Target that will be used by the "direct" preds.
+    ValueToValueMapTy VMap;
+    BasicBlock *DirectSucc = CloneBasicBlock(Target, VMap, ".clone", &F);
+
+    for (BasicBlock *Pred : OtherPreds) {
+      // If Target branched directly to itself, that branch is now BodyBlock's.
+      BasicBlock *Src = Pred != Target ? Pred : BodyBlock;
+      Src->getTerminator()->replaceUsesOfWith(Target, DirectSucc);
+    }
+
+    // Ok, now fix up the PHIs. We know the two blocks only have PHIs, and that
+    // they are clones, so the number of PHIs are the same.
+    // (a) Remove the edge coming from IBRPred from the "Direct" PHI
+    // (b) Leave that as the only edge in the "Indirect" PHI.
+    // (c) Merge the two in the body block.
+    BasicBlock::iterator Indirect = Target->begin(),
+                         End = Target->getFirstNonPHI()->getIterator();
+    BasicBlock::iterator Direct = DirectSucc->begin();
+    BasicBlock::iterator MergeInsert = BodyBlock->getFirstInsertionPt();
+
+    assert(&*End == Target->getTerminator() &&
+           "Block was expected to only contain PHIs");
+
+    while (Indirect != End) {
+      PHINode *DirPHI = cast<PHINode>(Direct);
+      PHINode *IndPHI = cast<PHINode>(Indirect);
+
+      // The direct block must not receive the indirect value, and vice versa.
+      DirPHI->removeIncomingValue(IBRPred);
+      ++Direct;
+
+      // Advance before IndPHI is erased below, so the iterator stays valid.
+      ++Indirect;
+
+      PHINode *NewIndPHI = PHINode::Create(IndPHI->getType(), 1, "ind", IndPHI);
+      NewIndPHI->addIncoming(IndPHI->getIncomingValueForBlock(IBRPred),
+                             IBRPred);
+
+      // Merge the direct and indirect values with a PHI in the body block.
+      PHINode *MergePHI =
+          PHINode::Create(IndPHI->getType(), 2, "merge", &*MergeInsert);
+      MergePHI->addIncoming(NewIndPHI, Target);
+      MergePHI->addIncoming(DirPHI, DirectSucc);
+
+      IndPHI->replaceAllUsesWith(MergePHI);
+      IndPHI->eraseFromParent();
+    }
+
+    Changed = true;
+  }
+
+  return Changed;
+}
+
 /// Eliminate blocks that contain only PHI nodes, debug info directives, and an
 /// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split
 /// edges in ways that are non-optimal for isel. Start by eliminating these

Modified: llvm/trunk/test/CodeGen/ARM/indirectbr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/indirectbr.ll?rev=296060&r1=296059&r2=296060&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/indirectbr.ll (original)
+++ llvm/trunk/test/CodeGen/ARM/indirectbr.ll Thu Feb 23 18:56:21 2017
@@ -47,6 +47,7 @@ L3:
   br label %L2
 
 L2:                                               ; preds = %L3, %bb2
+; THUMB-LABEL: %L1.clone
 ; THUMB: muls
   %res.2 = phi i32 [ %res.1, %L3 ], [ 1, %bb2 ]   ; <i32> [#uses=1]
   %phitmp = mul i32 %res.2, 6                     ; <i32> [#uses=1]

Modified: llvm/trunk/test/CodeGen/MSP430/indirectbr2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/MSP430/indirectbr2.ll?rev=296060&r1=296059&r2=296060&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/MSP430/indirectbr2.ll (original)
+++ llvm/trunk/test/CodeGen/MSP430/indirectbr2.ll Thu Feb 23 18:56:21 2017
@@ -5,7 +5,7 @@ define internal i16 @foo(i16 %i) nounwin
 entry:
   %tmp1 = getelementptr inbounds [5 x i8*], [5 x i8*]* @C.0.2070, i16 0, i16 %i ; <i8**> [#uses=1]
   %gotovar.4.0 = load i8*, i8** %tmp1, align 4        ; <i8*> [#uses=1]
-; CHECK: br .LC.0.2070(r12)
+; CHECK: br .LC.0.2070(r15)
   indirectbr i8* %gotovar.4.0, [label %L5, label %L4, label %L3, label %L2, label %L1]
 
 L5:                                               ; preds = %bb2

Modified: llvm/trunk/test/CodeGen/PowerPC/indirectbr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/PowerPC/indirectbr.ll?rev=296060&r1=296059&r2=296060&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/PowerPC/indirectbr.ll (original)
+++ llvm/trunk/test/CodeGen/PowerPC/indirectbr.ll Thu Feb 23 18:56:21 2017
@@ -17,23 +17,35 @@ entry:
 bb2:                                              ; preds = %entry, %bb3
   %gotovar.4.0 = phi i8* [ %gotovar.4.0.pre, %bb3 ], [ %0, %entry ] ; <i8*> [#uses=1]
 ; PIC: mtctr
-; PIC-NEXT: li
-; PIC-NEXT: li
-; PIC-NEXT: li
-; PIC-NEXT: li
 ; PIC-NEXT: bctr
+; PIC: li
+; PIC: b LBB
+; PIC: li
+; PIC: b LBB
+; PIC: li
+; PIC: b LBB
+; PIC: li
+; PIC: b LBB
 ; STATIC: mtctr
-; STATIC-NEXT: li
-; STATIC-NEXT: li
-; STATIC-NEXT: li
-; STATIC-NEXT: li
 ; STATIC-NEXT: bctr
+; STATIC: li
+; STATIC: b LBB
+; STATIC: li
+; STATIC: b LBB
+; STATIC: li
+; STATIC: b LBB
+; STATIC: li
+; STATIC: b LBB
 ; PPC64: mtctr
-; PPC64-NEXT: li
-; PPC64-NEXT: li
-; PPC64-NEXT: li
-; PPC64-NEXT: li
 ; PPC64-NEXT: bctr
+; PPC64: li
+; PPC64: b LBB
+; PPC64: li
+; PPC64: b LBB
+; PPC64: li
+; PPC64: b LBB
+; PPC64: li
+; PPC64: b LBB
   indirectbr i8* %gotovar.4.0, [label %L5, label %L4, label %L3, label %L2, label %L1]
 
 bb3:                                              ; preds = %entry

Added: llvm/trunk/test/Transforms/CodeGenPrepare/computedgoto.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/CodeGenPrepare/computedgoto.ll?rev=296060&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/CodeGenPrepare/computedgoto.ll (added)
+++ llvm/trunk/test/Transforms/CodeGenPrepare/computedgoto.ll Thu Feb 23 18:56:21 2017
@@ -0,0 +1,254 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -codegenprepare -S < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @use(i32) local_unnamed_addr
+declare void @useptr([2 x i8*]*) local_unnamed_addr
+
+; CHECK: @simple.targets = constant [2 x i8*] [i8* blockaddress(@simple, %bb0), i8* blockaddress(@simple, %bb1)], align 16
+@simple.targets = constant [2 x i8*] [i8* blockaddress(@simple, %bb0), i8* blockaddress(@simple, %bb1)], align 16
+
+; CHECK: @multi.targets = constant [2 x i8*] [i8* blockaddress(@multi, %bb0), i8* blockaddress(@multi, %bb1)], align 16
+@multi.targets = constant [2 x i8*] [i8* blockaddress(@multi, %bb0), i8* blockaddress(@multi, %bb1)], align 16
+
+; CHECK: @loop.targets = constant [2 x i8*] [i8* blockaddress(@loop, %bb0), i8* blockaddress(@loop, %bb1)], align 16
+@loop.targets = constant [2 x i8*] [i8* blockaddress(@loop, %bb0), i8* blockaddress(@loop, %bb1)], align 16
+
+; CHECK: @nophi.targets = constant [2 x i8*] [i8* blockaddress(@nophi, %bb0), i8* blockaddress(@nophi, %bb1)], align 16
+@nophi.targets = constant [2 x i8*] [i8* blockaddress(@nophi, %bb0), i8* blockaddress(@nophi, %bb1)], align 16
+
+; Check that we break the critical edge when a jump table has only one use.
+define void @simple(i32* nocapture readonly %p) {
+; CHECK-LABEL: @simple(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; CHECK-NEXT:    [[INITVAL:%.*]] = load i32, i32* [[P]], align 4
+; CHECK-NEXT:    [[INITOP:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    switch i32 [[INITOP]], label [[EXIT:%.*]] [
+; CHECK-NEXT:    i32 0, label [[BB0_CLONE:%.*]]
+; CHECK-NEXT:    i32 1, label [[BB1_CLONE:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       bb0:
+; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
+; CHECK:       .split:
+; CHECK-NEXT:    [[MERGE:%.*]] = phi i32* [ [[PTR:%.*]], [[BB0:%.*]] ], [ [[INCDEC_PTR]], [[BB0_CLONE]] ]
+; CHECK-NEXT:    [[MERGE2:%.*]] = phi i32 [ 0, [[BB0]] ], [ [[INITVAL]], [[BB0_CLONE]] ]
+; CHECK-NEXT:    tail call void @use(i32 [[MERGE2]])
+; CHECK-NEXT:    br label [[INDIRECTGOTO:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    br label [[DOTSPLIT3:%.*]]
+; CHECK:       .split3:
+; CHECK-NEXT:    [[MERGE5:%.*]] = phi i32* [ [[PTR]], [[BB1:%.*]] ], [ [[INCDEC_PTR]], [[BB1_CLONE]] ]
+; CHECK-NEXT:    [[MERGE7:%.*]] = phi i32 [ 1, [[BB1]] ], [ [[INITVAL]], [[BB1_CLONE]] ]
+; CHECK-NEXT:    tail call void @use(i32 [[MERGE7]])
+; CHECK-NEXT:    br label [[INDIRECTGOTO]]
+; CHECK:       indirectgoto:
+; CHECK-NEXT:    [[P_ADDR_SINK:%.*]] = phi i32* [ [[MERGE5]], [[DOTSPLIT3]] ], [ [[MERGE]], [[DOTSPLIT]] ]
+; CHECK-NEXT:    [[PTR]] = getelementptr inbounds i32, i32* [[P_ADDR_SINK]], i64 1
+; CHECK-NEXT:    [[NEWP:%.*]] = load i32, i32* [[P_ADDR_SINK]], align 4
+; CHECK-NEXT:    [[IDX:%.*]] = sext i32 [[NEWP]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* @simple.targets, i64 0, i64 [[IDX]]
+; CHECK-NEXT:    [[NEWOP:%.*]] = load i8*, i8** [[ARRAYIDX]], align 8
+; CHECK-NEXT:    indirectbr i8* [[NEWOP]], [label [[BB0]], label %bb1]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+; CHECK:       bb0.clone:
+; CHECK-NEXT:    br label [[DOTSPLIT]]
+; CHECK:       bb1.clone:
+; CHECK-NEXT:    br label [[DOTSPLIT3]]
+;
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %p, i64 1
+  %initval = load i32, i32* %p, align 4
+  %initop = load i32, i32* %incdec.ptr, align 4
+  switch i32 %initop, label %exit [
+  i32 0, label %bb0
+  i32 1, label %bb1
+  ]
+
+bb0:
+  %p.addr.0 = phi i32* [ %incdec.ptr, %entry ], [ %ptr, %indirectgoto ]
+  %opcode.0 = phi i32 [ %initval, %entry ], [ 0, %indirectgoto ]
+  tail call void @use(i32 %opcode.0)
+  br label %indirectgoto
+
+bb1:
+  %p.addr.1 = phi i32* [ %incdec.ptr, %entry ], [ %ptr, %indirectgoto ]
+  %opcode.1 = phi i32 [ %initval, %entry ], [ 1, %indirectgoto ]
+  tail call void @use(i32 %opcode.1)
+  br label %indirectgoto
+
+indirectgoto:
+  %p.addr.sink = phi i32* [ %p.addr.1, %bb1 ], [ %p.addr.0, %bb0 ]
+  %ptr = getelementptr inbounds i32, i32* %p.addr.sink, i64 1
+  %newp = load i32, i32* %p.addr.sink, align 4
+  %idx = sext i32 %newp to i64
+  %arrayidx = getelementptr inbounds [2 x i8*], [2 x i8*]* @simple.targets, i64 0, i64 %idx
+  %newop = load i8*, i8** %arrayidx, align 8
+  indirectbr i8* %newop, [label %bb0, label %bb1]
+
+exit:
+  ret void
+}
+
+; Don't try to break critical edges when several indirectbr point to a single block
+define void @multi(i32* nocapture readonly %p) {
+; CHECK-LABEL: @multi(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; CHECK-NEXT:    [[INITVAL:%.*]] = load i32, i32* [[P]], align 4
+; CHECK-NEXT:    [[INITOP:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    switch i32 [[INITOP]], label [[EXIT:%.*]] [
+; CHECK-NEXT:    i32 0, label [[BB0:%.*]]
+; CHECK-NEXT:    i32 1, label [[BB1:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       bb0:
+; CHECK-NEXT:    [[P_ADDR_0:%.*]] = phi i32* [ [[INCDEC_PTR]], [[ENTRY:%.*]] ], [ [[NEXT0:%.*]], [[BB0]] ], [ [[NEXT1:%.*]], [[BB1]] ]
+; CHECK-NEXT:    [[OPCODE_0:%.*]] = phi i32 [ [[INITVAL]], [[ENTRY]] ], [ 0, [[BB0]] ], [ 1, [[BB1]] ]
+; CHECK-NEXT:    tail call void @use(i32 [[OPCODE_0]])
+; CHECK-NEXT:    [[NEXT0]] = getelementptr inbounds i32, i32* [[P_ADDR_0]], i64 1
+; CHECK-NEXT:    [[NEWP0:%.*]] = load i32, i32* [[P_ADDR_0]], align 4
+; CHECK-NEXT:    [[IDX0:%.*]] = sext i32 [[NEWP0]] to i64
+; CHECK-NEXT:    [[ARRAYIDX0:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* @multi.targets, i64 0, i64 [[IDX0]]
+; CHECK-NEXT:    [[NEWOP0:%.*]] = load i8*, i8** [[ARRAYIDX0]], align 8
+; CHECK-NEXT:    indirectbr i8* [[NEWOP0]], [label [[BB0]], label %bb1]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[P_ADDR_1:%.*]] = phi i32* [ [[INCDEC_PTR]], [[ENTRY]] ], [ [[NEXT0]], [[BB0]] ], [ [[NEXT1]], [[BB1]] ]
+; CHECK-NEXT:    [[OPCODE_1:%.*]] = phi i32 [ [[INITVAL]], [[ENTRY]] ], [ 0, [[BB0]] ], [ 1, [[BB1]] ]
+; CHECK-NEXT:    tail call void @use(i32 [[OPCODE_1]])
+; CHECK-NEXT:    [[NEXT1]] = getelementptr inbounds i32, i32* [[P_ADDR_1]], i64 1
+; CHECK-NEXT:    [[NEWP1:%.*]] = load i32, i32* [[P_ADDR_1]], align 4
+; CHECK-NEXT:    [[IDX1:%.*]] = sext i32 [[NEWP1]] to i64
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* @multi.targets, i64 0, i64 [[IDX1]]
+; CHECK-NEXT:    [[NEWOP1:%.*]] = load i8*, i8** [[ARRAYIDX1]], align 8
+; CHECK-NEXT:    indirectbr i8* [[NEWOP1]], [label [[BB0]], label %bb1]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %p, i64 1
+  %initval = load i32, i32* %p, align 4
+  %initop = load i32, i32* %incdec.ptr, align 4
+  switch i32 %initop, label %exit [
+  i32 0, label %bb0
+  i32 1, label %bb1
+  ]
+
+bb0:
+  %p.addr.0 = phi i32* [ %incdec.ptr, %entry ], [ %next0, %bb0 ], [ %next1, %bb1 ]
+  %opcode.0 = phi i32 [ %initval, %entry ], [ 0, %bb0 ], [ 1, %bb1 ]
+  tail call void @use(i32 %opcode.0)
+  %next0 = getelementptr inbounds i32, i32* %p.addr.0, i64 1
+  %newp0 = load i32, i32* %p.addr.0, align 4
+  %idx0 = sext i32 %newp0 to i64
+  %arrayidx0 = getelementptr inbounds [2 x i8*], [2 x i8*]* @multi.targets, i64 0, i64 %idx0
+  %newop0 = load i8*, i8** %arrayidx0, align 8
+  indirectbr i8* %newop0, [label %bb0, label %bb1]
+
+bb1:
+  %p.addr.1 = phi i32* [ %incdec.ptr, %entry ], [ %next0, %bb0 ], [ %next1, %bb1 ]
+  %opcode.1 = phi i32 [ %initval, %entry ], [ 0, %bb0 ], [ 1, %bb1 ]
+  tail call void @use(i32 %opcode.1)
+  %next1 = getelementptr inbounds i32, i32* %p.addr.1, i64 1
+  %newp1 = load i32, i32* %p.addr.1, align 4
+  %idx1 = sext i32 %newp1 to i64
+  %arrayidx1 = getelementptr inbounds [2 x i8*], [2 x i8*]* @multi.targets, i64 0, i64 %idx1
+  %newop1 = load i8*, i8** %arrayidx1, align 8
+  indirectbr i8* %newop1, [label %bb0, label %bb1]
+
+exit:
+  ret void
+}
+
+; Make sure we do the right thing for cases where the indirectbr branches to
+; the block it terminates.
+define void @loop(i64* nocapture readonly %p) {
+; CHECK-LABEL: @loop(
+; CHECK-NEXT:  bb0.clone:
+; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
+; CHECK:       bb0:
+; CHECK-NEXT:    br label [[DOTSPLIT]]
+; CHECK:       .split:
+; CHECK-NEXT:    [[MERGE:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[BB0:%.*]] ], [ 0, [[BB0_CLONE:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[MERGE]]
+; CHECK-NEXT:    store i64 [[MERGE]], i64* [[TMP0]], align 4
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[MERGE]], 1
+; CHECK-NEXT:    [[IDX:%.*]] = srem i64 [[MERGE]], 2
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* @loop.targets, i64 0, i64 [[IDX]]
+; CHECK-NEXT:    [[TARGET:%.*]] = load i8*, i8** [[ARRAYIDX]], align 8
+; CHECK-NEXT:    indirectbr i8* [[TARGET]], [label [[BB0]], label %bb1]
+; CHECK:       bb1:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %bb0
+
+bb0:
+  %i = phi i64 [ %i.next, %bb0 ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds i64, i64* %p, i64 %i
+  store i64 %i, i64* %tmp0, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %idx = srem i64 %i, 2
+  %arrayidx = getelementptr inbounds [2 x i8*], [2 x i8*]* @loop.targets, i64 0, i64 %idx
+  %target = load i8*, i8** %arrayidx, align 8
+  indirectbr i8* %target, [label %bb0, label %bb1]
+
+bb1:
+  ret void
+}
+
+; Don't do anything for cases that contain no phis.
+define void @nophi(i32* %p) {
+; CHECK-LABEL: @nophi(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; CHECK-NEXT:    [[INITOP:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    switch i32 [[INITOP]], label [[EXIT:%.*]] [
+; CHECK-NEXT:    i32 0, label [[BB0:%.*]]
+; CHECK-NEXT:    i32 1, label [[BB1:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       bb0:
+; CHECK-NEXT:    tail call void @use(i32 0)
+; CHECK-NEXT:    br label [[INDIRECTGOTO:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    tail call void @use(i32 1)
+; CHECK-NEXT:    br label [[INDIRECTGOTO]]
+; CHECK:       indirectgoto:
+; CHECK-NEXT:    [[SUNKADDR:%.*]] = ptrtoint i32* [[P]] to i64
+; CHECK-NEXT:    [[SUNKADDR1:%.*]] = add i64 [[SUNKADDR]], 4
+; CHECK-NEXT:    [[SUNKADDR2:%.*]] = inttoptr i64 [[SUNKADDR1]] to i32*
+; CHECK-NEXT:    [[NEWP:%.*]] = load i32, i32* [[SUNKADDR2]], align 4
+; CHECK-NEXT:    [[IDX:%.*]] = sext i32 [[NEWP]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* @nophi.targets, i64 0, i64 [[IDX]]
+; CHECK-NEXT:    [[NEWOP:%.*]] = load i8*, i8** [[ARRAYIDX]], align 8
+; CHECK-NEXT:    indirectbr i8* [[NEWOP]], [label [[BB0]], label %bb1]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %p, i64 1
+  %initop = load i32, i32* %incdec.ptr, align 4
+  switch i32 %initop, label %exit [
+  i32 0, label %bb0
+  i32 1, label %bb1
+  ]
+
+bb0:
+  tail call void @use(i32 0)
+  br label %indirectgoto
+
+bb1:
+  tail call void @use(i32 1)
+  br label %indirectgoto
+
+indirectgoto:
+  %newp = load i32, i32* %incdec.ptr, align 4
+  %idx = sext i32 %newp to i64
+  %arrayidx = getelementptr inbounds [2 x i8*], [2 x i8*]* @nophi.targets, i64 0, i64 %idx
+  %newop = load i8*, i8** %arrayidx, align 8
+  indirectbr i8* %newop, [label %bb0, label %bb1]
+
+exit:
+  ret void
+}