[llvm] c172f1a - [IROutliner] Adding supports for multiple exits

Wed Sep 8 09:08:43 PDT 2021

Author: Andrew Litteken
Date: 2021-09-08T08:58:07-07:00
New Revision: c172f1ad39cbc1f4b3aa2d57c18dbe4c986151b9

URL: https://github.com/llvm/llvm-project/commit/c172f1ad39cbc1f4b3aa2d57c18dbe4c986151b9
DIFF: https://github.com/llvm/llvm-project/commit/c172f1ad39cbc1f4b3aa2d57c18dbe4c986151b9.diff

LOG: [IROutliner] Adding supports for multiple exits

When we start outlining across branches, there is the possibility that we will have two different blocks with different output locations, or a single branch that goes to two blocks outside of the region that is being outlined. While the CodeExtractor provides most of the mechanisms by using the return value of the extracted function as the input to a switch statement to correctly branch to the correct location, we need special handling for different output schemas to each location.

This is done by repeating the existing storing scheme for each different exit block. We have a map from the return values used, to the basic block that is used to store the outputs for that particular exit block within the outlined function. Then if needed, we create a switch statement for each return block to branch to the correct set of stored outputs.

Reviewers: paquette

Differential Revision: https://reviews.llvm.org/D106993

Added: 
    llvm/test/Transforms/IROutliner/outlining-exits-to-phi-node.ll
    llvm/test/Transforms/IROutliner/outlining-multiple-exits-diff-outputs.ll
    llvm/test/Transforms/IROutliner/outlining-multiple-exits-one-output-set.ll

Modified: 
    llvm/lib/Transforms/IPO/IROutliner.cpp
    llvm/test/Transforms/IROutliner/outlining-multiple-exits.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp
index 5bb4816c16237..9b563d4a7cba3 100644

--- a/llvm/lib/Transforms/IPO/IROutliner.cpp
+++ b/llvm/lib/Transforms/IPO/IROutliner.cpp
@@ -18,6 +18,7 @@
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Mangler.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/InitializePasses.h"
@@ -75,8 +76,12 @@ struct OutlinableGroup {
   /// for extraction.
   bool IgnoreGroup = false;
 
-  /// The return block for the overall function.
-  BasicBlock *EndBB = nullptr;
+  /// The return blocks for the overall function.
+  DenseMap<Value *, BasicBlock *> EndBBs;
+
+  /// The PHIBlocks with their corresponding return block based on the return
+  /// value as the key.
+  DenseMap<Value *, BasicBlock *> PHIBlocks;
 
   /// A set containing the 
diff erent GVN store sets needed. Each array contains
   /// a sorted list of the 
diff erent values that need to be stored into output
@@ -138,6 +143,26 @@ static void moveBBContents(BasicBlock &SourceBB, BasicBlock &TargetBB) {
   }
 }
 
+/// A function to sort the keys of \p Map, which must be a mapping of constant
+/// values to basic blocks and return it in \p SortedKeys
+///
+/// \param SortedKeys - The vector the keys will be return in and sorted.
+/// \param Map - The DenseMap containing keys to sort.
+static void getSortedConstantKeys(std::vector<Value *> &SortedKeys,
+                                  DenseMap<Value *, BasicBlock *> &Map) {
+  for (auto &VtoBB : Map)
+    SortedKeys.push_back(VtoBB.first);
+
+  stable_sort(SortedKeys, [](const Value *LHS, const Value *RHS) {
+    const ConstantInt *LHSC = dyn_cast<ConstantInt>(LHS);
+    const ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS);
+    assert(RHSC && "Not a constant integer in return value?");
+    assert(LHSC && "Not a constant integer in return value?");
+
+    return LHSC->getLimitedValue() < RHSC->getLimitedValue();
+  });
+}
+
 void OutlinableRegion::splitCandidate() {
   assert(!CandidateSplit && "Candidate already split!");
 
@@ -416,8 +441,24 @@ Function *IROutliner::createFunction(Module &M, OutlinableGroup &Group,
                                      unsigned FunctionNameSuffix) {
   assert(!Group.OutlinedFunction && "Function is already defined!");
 
+  Type *RetTy = Type::getVoidTy(M.getContext());
+  // All extracted functions _should_ have the same return type at this point
+  // since the similarity identifier ensures that all branches outside of the
+  // region occur in the same place.
+
+  // NOTE: Should we ever move to the model that uses a switch at every point
+  // needed, meaning that we could branch within the region or out, it is
+  // possible that we will need to switch to using the most general case all of
+  // the time.
+  for (OutlinableRegion *R : Group.Regions) {
+    Type *ExtractedFuncType = R->ExtractedFunction->getReturnType();
+    if ((RetTy->isVoidTy() && !ExtractedFuncType->isVoidTy()) ||
+        (RetTy->isIntegerTy(1) && ExtractedFuncType->isIntegerTy(16)))
+      RetTy = ExtractedFuncType;
+  }
+
   Group.OutlinedFunctionType = FunctionType::get(
-      Type::getVoidTy(M.getContext()), Group.ArgumentTypes, false);
+      RetTy, Group.ArgumentTypes, false);
 
   // These functions will only be called from within the same module, so
   // we can set an internal linkage.
@@ -473,18 +514,23 @@ Function *IROutliner::createFunction(Module &M, OutlinableGroup &Group,
 ///
 /// \param [in] Old - The function to move the basic blocks from.
 /// \param [in] New - The function to move the basic blocks to.
+/// \param [out] NewEnds - The return blocks of the new overall function.
 /// \returns the first return block for the function in New.
-static BasicBlock *moveFunctionData(Function &Old, Function &New) {
+static void moveFunctionData(Function &Old, Function &New,
+                             DenseMap<Value *, BasicBlock *> &NewEnds) {
   Function::iterator CurrBB, NextBB, FinalBB;
-  BasicBlock *NewEnd = nullptr;
   for (CurrBB = Old.begin(), FinalBB = Old.end(); CurrBB != FinalBB;
        CurrBB = NextBB) {
     NextBB = std::next(CurrBB);
     CurrBB->removeFromParent();
     CurrBB->insertInto(&New);
     Instruction *I = CurrBB->getTerminator();
-    if (isa<ReturnInst>(I))
-      NewEnd = &(*CurrBB);
+
+    // For each block we find a return instruction is, it is a potential exit
+    // path for the function.  We keep track of each block based on the return
+    // value here.
+    if (ReturnInst *RI = dyn_cast<ReturnInst>(I))
+      NewEnds.insert(std::make_pair(RI->getReturnValue(), &(*CurrBB)));
 
     std::vector<Instruction *> DebugInsts;
 
@@ -520,8 +566,7 @@ static BasicBlock *moveFunctionData(Function &Old, Function &New) {
       I->eraseFromParent();
   }
 
-  assert(NewEnd && "No return instruction for new function?");
-  return NewEnd;
+  assert(NewEnds.size() > 0 && "No return instruction for new function?");
 }
 
 /// Find the the constants that will need to be lifted into arguments
@@ -805,17 +850,9 @@ findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region,
       if (!BBSet.contains(Succ))
         Exits.insert(Succ);
 
-  // For now, we check whether we have more than one exit, if we do, we
-  // ignore this region.
-  if (Exits.size() > 1) {
-    Region.IgnoreRegion = true;
-    return;
-  }
-
   // After determining which blocks exit to PHINodes, we add these PHINodes to
   // the set of outputs to be processed.  We also check the incoming values of
   // the PHINodes for whether they should no longer be considered outputs.
-  DenseSet<Value *> PHIWrapped;
   for (BasicBlock *ExitBB : Exits) {
     for (PHINode &PN : ExitBB->phis()) {
       // Find all incoming values from the outlining region.
@@ -1007,6 +1044,9 @@ CallInst *replaceCalledFunction(Module &M, OutlinableRegion &Region) {
 
   // Transfer any debug information.
   Call->setDebugLoc(Region.Call->getDebugLoc());
+  // Since our output may determine which branch we go to, we make sure to
+  // propogate this new call value through the module.
+  OldCall->replaceAllUsesWith(Call);
 
   // Remove the old instruction.
   OldCall->eraseFromParent();
@@ -1027,11 +1067,21 @@ CallInst *replaceCalledFunction(Module &M, OutlinableRegion &Region) {
 /// \param [in] Region - The region of extracted code to be changed.
 /// \param [in,out] OutputBB - The BasicBlock for the output stores for this
 /// region.
-static void replaceArgumentUses(OutlinableRegion &Region,
-                                BasicBlock *OutputBB) {
+/// \param [in] FirstFunction - A flag to indicate whether we are using this
+/// function to define the overall outlined function for all the regions, or
+/// if we are operating on one of the following regions.
+static void
+replaceArgumentUses(OutlinableRegion &Region,
+                    DenseMap<Value *, BasicBlock *> &OutputBBs,
+                    bool FirstFunction = false) {
   OutlinableGroup &Group = *Region.Parent;
   assert(Region.ExtractedFunction && "Region has no extracted function?");
 
+  Function *DominatingFunction = Region.ExtractedFunction;
+  if (FirstFunction)
+    DominatingFunction = Group.OutlinedFunction;
+  DominatorTree DT(*DominatingFunction);
+
   for (unsigned ArgIdx = 0; ArgIdx < Region.ExtractedFunction->arg_size();
        ArgIdx++) {
     assert(Region.ExtractedArgToAgg.find(ArgIdx) !=
@@ -1058,11 +1108,41 @@ static void replaceArgumentUses(OutlinableRegion &Region,
     assert(InstAsUser && "User is nullptr!");
 
     Instruction *I = cast<Instruction>(InstAsUser);
-    I->setDebugLoc(DebugLoc());
-    LLVM_DEBUG(dbgs() << "Move store for instruction " << *I << " to "
-                      << *OutputBB << "\n");
+    BasicBlock *BB = I->getParent();
+    SmallVector<BasicBlock *, 4> Descendants;
+    DT.getDescendants(BB, Descendants);
+    bool EdgeAdded = false;
+    if (Descendants.size() == 0) {
+      EdgeAdded = true;
+      DT.insertEdge(&DominatingFunction->getEntryBlock(), BB);
+      DT.getDescendants(BB, Descendants);
+    }
 
-    I->moveBefore(*OutputBB, OutputBB->end());
+    // Iterate over the following blocks, looking for return instructions,
+    // if we find one, find the corresponding output block for the return value
+    // and move our store instruction there.
+    for (BasicBlock *DescendBB : Descendants) {
+      ReturnInst *RI = dyn_cast<ReturnInst>(DescendBB->getTerminator());
+      if (!RI)
+        continue;
+      Value *RetVal = RI->getReturnValue();
+      DenseMap<Value *, BasicBlock *>::iterator VBBIt, PHIVBBIt;
+      VBBIt = OutputBBs.find(RetVal);
+      assert(VBBIt != OutputBBs.end() && "Could not find output value!");
+
+      Instruction *NewI = I->clone();
+      NewI->setDebugLoc(DebugLoc());
+      BasicBlock *OutputBB = VBBIt->second;
+      OutputBB->getInstList().push_back(NewI);
+      LLVM_DEBUG(dbgs() << "Move store for instruction " << *I << " to "
+                        << *OutputBB << "\n");
+    }
+
+    // If we added an edge for basic blocks without a predecessor, we remove it
+    // here.
+    if (EdgeAdded)
+      DT.deleteEdge(&DominatingFunction->getEntryBlock(), BB);
+    I->eraseFromParent();
 
     LLVM_DEBUG(dbgs() << "Replacing uses of output " << *Arg << " in function "
                       << *Region.ExtractedFunction << " with " << *AggArg
@@ -1136,35 +1216,47 @@ collectRelevantInstructions(Function &F,
 /// \param OutputBB [in] the block we are looking for a duplicate of.
 /// \param OutputStoreBBs [in] The existing output blocks.
 /// \returns an optional value with the number output block if there is a match.
-Optional<unsigned>
-findDuplicateOutputBlock(BasicBlock *OutputBB,
-                         ArrayRef<BasicBlock *> OutputStoreBBs) {
+Optional<unsigned> findDuplicateOutputBlock(
+    DenseMap<Value *, BasicBlock *> &OutputBBs,
+    std::vector<DenseMap<Value *, BasicBlock *>> &OutputStoreBBs) {
 
-  bool WrongInst = false;
-  bool WrongSize = false;
+  bool Mismatch = false;
   unsigned MatchingNum = 0;
-  for (BasicBlock *CompBB : OutputStoreBBs) {
-    WrongInst = false;
-    if (CompBB->size() - 1 != OutputBB->size()) {
-      WrongSize = true;
-      MatchingNum++;
-      continue;
-    }
-
-    WrongSize = false;
-    BasicBlock::iterator NIt = OutputBB->begin();
-    for (Instruction &I : *CompBB) {
-      if (isa<BranchInst>(&I))
-        continue;
+  // We compare the new set output blocks to the other sets of output blocks.
+  // If they are the same number, and have identical instructions, they are
+  // considered to be the same.
+  for (DenseMap<Value *, BasicBlock *> &CompBBs : OutputStoreBBs) {
+    Mismatch = false;
+    for (std::pair<Value *, BasicBlock *> &VToB : CompBBs) {
+      DenseMap<Value *, BasicBlock *>::iterator OutputBBIt =
+          OutputBBs.find(VToB.first);
+      if (OutputBBIt == OutputBBs.end()) {
+        Mismatch = true;
+        break;
+      }
 
-      if (!I.isIdenticalTo(&(*NIt))) {
-        WrongInst = true;
+      BasicBlock *CompBB = VToB.second;
+      BasicBlock *OutputBB = OutputBBIt->second;
+      if (CompBB->size() - 1 != OutputBB->size()) {
+        Mismatch = true;
         break;
       }
 
-      NIt++;
+      BasicBlock::iterator NIt = OutputBB->begin();
+      for (Instruction &I : *CompBB) {
+        if (isa<BranchInst>(&I))
+          continue;
+
+        if (!I.isIdenticalTo(&(*NIt))) {
+          Mismatch = true;
+          break;
+        }
+
+        NIt++;
+      }
     }
-    if (!WrongInst && !WrongSize)
+
+    if (!Mismatch)
       return MatchingNum;
 
     MatchingNum++;
@@ -1173,6 +1265,47 @@ findDuplicateOutputBlock(BasicBlock *OutputBB,
   return None;
 }
 
+/// Remove empty output blocks from the outlined region.
+///
+/// \param BlocksToPrune - Mapping of return values output blocks for the \p
+/// Region.
+/// \param Region - The OutlinableRegion we are analyzing.
+static bool
+analyzeAndPruneOutputBlocks(DenseMap<Value *, BasicBlock *> &BlocksToPrune,
+                            OutlinableRegion &Region) {
+  bool AllRemoved = true;
+  Value *RetValueForBB;
+  BasicBlock *NewBB;
+  SmallVector<Value *, 4> ToRemove;
+  // Iterate over the output blocks created in the outlined section.
+  for (std::pair<Value *, BasicBlock *> &VtoBB : BlocksToPrune) {
+    RetValueForBB = VtoBB.first;
+    NewBB = VtoBB.second;
+  
+    // If there are no instructions, we remove it from the module, and also
+    // mark the value for removal from the return value to output block mapping.
+    if (NewBB->size() == 0) {
+      NewBB->eraseFromParent();
+      ToRemove.push_back(RetValueForBB);
+      continue;
+    }
+    
+    // Mark that we could not remove all the blocks since they were not all
+    // empty.
+    AllRemoved = false;
+  }
+
+  // Remove the return value from the mapping.
+  for (Value *V : ToRemove)
+    BlocksToPrune.erase(V);
+
+  // Mark the region as having the no output scheme.
+  if (AllRemoved)
+    Region.OutputBlockNum = -1;
+  
+  return AllRemoved;
+}
+
 /// For the outlined section, move needed the StoreInsts for the output
 /// registers into their own block. Then, determine if there is a duplicate
 /// output block already created.
@@ -1185,11 +1318,12 @@ findDuplicateOutputBlock(BasicBlock *OutputBB,
 /// \param [in] OutputMappings - OutputMappings the mapping of values that have
 /// been replaced by a new output value.
 /// \param [in,out] OutputStoreBBs - The existing output blocks.
-static void
-alignOutputBlockWithAggFunc(OutlinableGroup &OG, OutlinableRegion &Region,
-                            BasicBlock *OutputBB, BasicBlock *EndBB,
-                            const DenseMap<Value *, Value *> &OutputMappings,
-                            std::vector<BasicBlock *> &OutputStoreBBs) {
+static void alignOutputBlockWithAggFunc(
+    OutlinableGroup &OG, OutlinableRegion &Region,
+    DenseMap<Value *, BasicBlock *> &OutputBBs,
+    DenseMap<Value *, BasicBlock *> &EndBBs,
+    const DenseMap<Value *, Value *> &OutputMappings,
+    std::vector<DenseMap<Value *, BasicBlock *>> &OutputStoreBBs) {
   DenseSet<unsigned> ValuesToFind(Region.GVNStores.begin(),
                                   Region.GVNStores.end());
 
@@ -1198,9 +1332,13 @@ alignOutputBlockWithAggFunc(OutlinableGroup &OG, OutlinableRegion &Region,
   // be contained in a store, we replace the uses of the value with the value
   // from the overall function, so that the store is storing the correct
   // value from the overall function.
-  DenseSet<BasicBlock *> ExcludeBBs(OutputStoreBBs.begin(),
-                                    OutputStoreBBs.end());
-  ExcludeBBs.insert(OutputBB);
+  DenseSet<BasicBlock *> ExcludeBBs;
+  for (DenseMap<Value *, BasicBlock *> &BBMap : OutputStoreBBs)
+    for (std::pair<Value *, BasicBlock *> &VBPair : BBMap)
+      ExcludeBBs.insert(VBPair.second);
+  for (std::pair<Value *, BasicBlock *> &VBPair : OutputBBs)
+    ExcludeBBs.insert(VBPair.second);
+
   std::vector<Instruction *> ExtractedFunctionInsts =
       collectRelevantInstructions(*(Region.ExtractedFunction), ExcludeBBs);
   std::vector<Instruction *> OverallFunctionInsts =
@@ -1231,37 +1369,71 @@ alignOutputBlockWithAggFunc(OutlinableGroup &OG, OutlinableRegion &Region,
 
   assert(ValuesToFind.size() == 0 && "Not all store values were handled!");
 
-  // If the size of the block is 0, then there are no stores, and we do not
-  // need to save this block.
-  if (OutputBB->size() == 0) {
-    Region.OutputBlockNum = -1;
-    OutputBB->eraseFromParent();
+  // If none of the output blocks have any instructions, this means that we do
+  // not have to determine if it matches any of the other output schemes, and we
+  // don't have to do anything else.
+  if (analyzeAndPruneOutputBlocks(OutputBBs, Region))
     return;
-  }
 
-  // Determine is there is a duplicate block.
+  // Determine is there is a duplicate set of blocks.
   Optional<unsigned> MatchingBB =
-      findDuplicateOutputBlock(OutputBB, OutputStoreBBs);
+      findDuplicateOutputBlock(OutputBBs, OutputStoreBBs);
 
-  // If there is, we remove the new output block.  If it does not,
-  // we add it to our list of output blocks.
+  // If there is, we remove the new output blocks.  If it does not,
+  // we add it to our list of sets of output blocks.
   if (MatchingBB.hasValue()) {
     LLVM_DEBUG(dbgs() << "Set output block for region in function"
                       << Region.ExtractedFunction << " to "
                       << MatchingBB.getValue());
 
     Region.OutputBlockNum = MatchingBB.getValue();
-    OutputBB->eraseFromParent();
+    for (std::pair<Value *, BasicBlock *> &VtoBB : OutputBBs)
+      VtoBB.second->eraseFromParent();
     return;
   }
 
   Region.OutputBlockNum = OutputStoreBBs.size();
 
-  LLVM_DEBUG(dbgs() << "Create output block for region in"
-                    << Region.ExtractedFunction << " to "
-                    << *OutputBB);
-  OutputStoreBBs.push_back(OutputBB);
-  BranchInst::Create(EndBB, OutputBB);
+  Value *RetValueForBB;
+  BasicBlock *NewBB;
+  OutputStoreBBs.push_back(DenseMap<Value *, BasicBlock *>());
+  for (std::pair<Value *, BasicBlock *> &VtoBB : OutputBBs) {
+    RetValueForBB = VtoBB.first;
+    NewBB = VtoBB.second;
+    DenseMap<Value *, BasicBlock *>::iterator VBBIt =
+        EndBBs.find(RetValueForBB);
+    LLVM_DEBUG(dbgs() << "Create output block for region in"
+                      << Region.ExtractedFunction << " to "
+                      << *NewBB);
+    BranchInst::Create(VBBIt->second, NewBB);
+    OutputStoreBBs.back().insert(std::make_pair(RetValueForBB, NewBB));
+  }
+}
+
+/// Takes in a mapping, \p OldMap of ConstantValues to BasicBlocks, sorts keys,
+/// before creating a basic block for each \p NewMap, and inserting into the new
+/// block. Each BasicBlock is named with the scheme "<basename>_<key_idx>".
+///
+/// \param OldMap [in] - The mapping to base the new mapping off of.
+/// \param NewMap [out] - The output mapping using the keys of \p OldMap.
+/// \param ParentFunc [in] - The function to put the new basic block in.
+/// \param BaseName [in] - The start of the BasicBlock names to be appended to
+/// by an index value.
+static void createAndInsertBasicBlocks(DenseMap<Value *, BasicBlock *> &OldMap,
+                                       DenseMap<Value *, BasicBlock *> &NewMap,
+                                       Function *ParentFunc, Twine BaseName) {
+  unsigned Idx = 0;
+  std::vector<Value *> SortedKeys;
+  
+  getSortedConstantKeys(SortedKeys, OldMap);
+
+  for (Value *RetVal : SortedKeys) {
+    BasicBlock *NewBB = BasicBlock::Create(
+        ParentFunc->getContext(),
+        Twine(BaseName) + Twine("_") + Twine(static_cast<unsigned>(Idx++)),
+        ParentFunc);
+    NewMap.insert(std::make_pair(RetVal, NewBB));
+  }
 }
 
 /// Create the switch statement for outlined function to 
diff erentiate between
@@ -1273,48 +1445,72 @@ alignOutputBlockWithAggFunc(OutlinableGroup &OG, OutlinableRegion &Region,
 /// \param [in] OG - The group of regions to be outlined.
 /// \param [in] EndBB - The final block of the extracted function.
 /// \param [in,out] OutputStoreBBs - The existing output blocks.
-void createSwitchStatement(Module &M, OutlinableGroup &OG, BasicBlock *EndBB,
-                           ArrayRef<BasicBlock *> OutputStoreBBs) {
+void createSwitchStatement(
+    Module &M, OutlinableGroup &OG, DenseMap<Value *, BasicBlock *> &EndBBs,
+    std::vector<DenseMap<Value *, BasicBlock *>> &OutputStoreBBs) {
   // We only need the switch statement if there is more than one store
   // combination.
   if (OG.OutputGVNCombinations.size() > 1) {
     Function *AggFunc = OG.OutlinedFunction;
-    // Create a final block
-    BasicBlock *ReturnBlock =
-        BasicBlock::Create(M.getContext(), "final_block", AggFunc);
-    Instruction *Term = EndBB->getTerminator();
-    Term->moveBefore(*ReturnBlock, ReturnBlock->end());
-    // Put the switch statement in the old end basic block for the function with
-    // a fall through to the new return block
-    LLVM_DEBUG(dbgs() << "Create switch statement in " << *AggFunc << " for "
-                      << OutputStoreBBs.size() << "\n");
-    SwitchInst *SwitchI =
-        SwitchInst::Create(AggFunc->getArg(AggFunc->arg_size() - 1),
-                           ReturnBlock, OutputStoreBBs.size(), EndBB);
-
-    unsigned Idx = 0;
-    for (BasicBlock *BB : OutputStoreBBs) {
-      SwitchI->addCase(ConstantInt::get(Type::getInt32Ty(M.getContext()), Idx),
-                       BB);
-      Term = BB->getTerminator();
-      Term->setSuccessor(0, ReturnBlock);
-      Idx++;
+    // Create a final block for each 
diff erent return block.
+    DenseMap<Value *, BasicBlock *> ReturnBBs;
+    createAndInsertBasicBlocks(OG.EndBBs, ReturnBBs, AggFunc, "final_block");
+
+    for (std::pair<Value *, BasicBlock *> &RetBlockPair : ReturnBBs) {
+      std::pair<Value *, BasicBlock *> &OutputBlock =
+          *OG.EndBBs.find(RetBlockPair.first);
+      BasicBlock *ReturnBlock = RetBlockPair.second;
+      BasicBlock *EndBB = OutputBlock.second;
+      Instruction *Term = EndBB->getTerminator();
+      // Move the return value to the final block instead of the original exit
+      // stub.
+      Term->moveBefore(*ReturnBlock, ReturnBlock->end());
+      // Put the switch statement in the old end basic block for the function
+      // with a fall through to the new return block.
+      LLVM_DEBUG(dbgs() << "Create switch statement in " << *AggFunc << " for "
+                        << OutputStoreBBs.size() << "\n");
+      SwitchInst *SwitchI =
+          SwitchInst::Create(AggFunc->getArg(AggFunc->arg_size() - 1),
+                             ReturnBlock, OutputStoreBBs.size(), EndBB);
+
+      unsigned Idx = 0;
+      for (DenseMap<Value *, BasicBlock *> &OutputStoreBB : OutputStoreBBs) {
+        DenseMap<Value *, BasicBlock *>::iterator OSBBIt =
+            OutputStoreBB.find(OutputBlock.first);
+
+        if (OSBBIt == OutputStoreBB.end())
+          continue;
+
+        BasicBlock *BB = OSBBIt->second;
+        SwitchI->addCase(
+            ConstantInt::get(Type::getInt32Ty(M.getContext()), Idx), BB);
+        Term = BB->getTerminator();
+        Term->setSuccessor(0, ReturnBlock);
+        Idx++;
+      }
     }
     return;
   }
 
-  // If there needs to be stores, move them from the output block to the end
-  // block to save on branching instructions.
+  // If there needs to be stores, move them from the output blocks to their
+  // corresponding ending block.
   if (OutputStoreBBs.size() == 1) {
     LLVM_DEBUG(dbgs() << "Move store instructions to the end block in "
                       << *OG.OutlinedFunction << "\n");
-    BasicBlock *OutputBlock = OutputStoreBBs[0];
-    Instruction *Term = OutputBlock->getTerminator();
-    Term->eraseFromParent();
-    Term = EndBB->getTerminator();
-    moveBBContents(*OutputBlock, *EndBB);
-    Term->moveBefore(*EndBB, EndBB->end());
-    OutputBlock->eraseFromParent();
+    DenseMap<Value *, BasicBlock *> OutputBlocks = OutputStoreBBs[0];
+    for (std::pair<Value *, BasicBlock *> &VBPair : OutputBlocks) {
+      DenseMap<Value *, BasicBlock *>::iterator EndBBIt =
+          EndBBs.find(VBPair.first);
+      assert(EndBBIt != EndBBs.end() && "Could not find end block");
+      BasicBlock *EndBB = EndBBIt->second;
+      BasicBlock *OutputBB = VBPair.second;
+      Instruction *Term = OutputBB->getTerminator();
+      Term->eraseFromParent();
+      Term = EndBB->getTerminator();
+      moveBBContents(*OutputBB, *EndBB);
+      Term->moveBefore(*EndBB, EndBB->end());
+      OutputBB->eraseFromParent();
+    }
   }
 }
 
@@ -1329,41 +1525,44 @@ void createSwitchStatement(Module &M, OutlinableGroup &OG, BasicBlock *EndBB,
 /// set of stores needed for the 
diff erent functions.
 /// \param [in,out] FuncsToRemove - Extracted functions to erase from module
 /// once outlining is complete.
-static void fillOverallFunction(Module &M, OutlinableGroup &CurrentGroup,
-                                std::vector<BasicBlock *> &OutputStoreBBs,
-                                std::vector<Function *> &FuncsToRemove) {
+static void fillOverallFunction(
+    Module &M, OutlinableGroup &CurrentGroup,
+    std::vector<DenseMap<Value *, BasicBlock *>> &OutputStoreBBs,
+    std::vector<Function *> &FuncsToRemove) {
   OutlinableRegion *CurrentOS = CurrentGroup.Regions[0];
 
   // Move first extracted function's instructions into new function.
   LLVM_DEBUG(dbgs() << "Move instructions from "
                     << *CurrentOS->ExtractedFunction << " to instruction "
                     << *CurrentGroup.OutlinedFunction << "\n");
-
-  CurrentGroup.EndBB = moveFunctionData(*CurrentOS->ExtractedFunction,
-                                        *CurrentGroup.OutlinedFunction);
+  moveFunctionData(*CurrentOS->ExtractedFunction,
+                   *CurrentGroup.OutlinedFunction, CurrentGroup.EndBBs);
 
   // Transfer the attributes from the function to the new function.
   for (Attribute A : CurrentOS->ExtractedFunction->getAttributes().getFnAttrs())
     CurrentGroup.OutlinedFunction->addFnAttr(A);
 
-  // Create an output block for the first extracted function.
-  BasicBlock *NewBB = BasicBlock::Create(
-      M.getContext(), Twine("output_block_") + Twine(static_cast<unsigned>(0)),
-      CurrentGroup.OutlinedFunction);
+  // Create a new set of output blocks for the first extracted function.
+  DenseMap<Value *, BasicBlock *> NewBBs;
+  createAndInsertBasicBlocks(CurrentGroup.EndBBs, NewBBs,
+                             CurrentGroup.OutlinedFunction, "output_block_0");
   CurrentOS->OutputBlockNum = 0;
 
-  replaceArgumentUses(*CurrentOS, NewBB);
+  replaceArgumentUses(*CurrentOS, NewBBs, true);
   replaceConstants(*CurrentOS);
 
-  // If the new basic block has no new stores, we can erase it from the module.
-  // It it does, we create a branch instruction to the last basic block from the
-  // new one.
-  if (NewBB->size() == 0) {
-    CurrentOS->OutputBlockNum = -1;
-    NewBB->eraseFromParent();
-  } else {
-    BranchInst::Create(CurrentGroup.EndBB, NewBB);
-    OutputStoreBBs.push_back(NewBB);
+  // We first identify if any output blocks are empty, if they are we remove
+  // them. We then create a branch instruction to the basic block to the return
+  // block for the function for each non empty output block.
+  if (!analyzeAndPruneOutputBlocks(NewBBs, *CurrentOS)) {
+    OutputStoreBBs.push_back(DenseMap<Value *, BasicBlock *>());
+    for (std::pair<Value *, BasicBlock *> &VToBB : NewBBs) {
+      DenseMap<Value *, BasicBlock *>::iterator VBBIt =
+          CurrentGroup.EndBBs.find(VToBB.first);
+      BasicBlock *EndBB = VBBIt->second;
+      BranchInst::Create(EndBB, VToBB.second);
+      OutputStoreBBs.back().insert(VToBB);
+    }
   }
 
   // Replace the call to the extracted function with the outlined function.
@@ -1379,25 +1578,28 @@ void IROutliner::deduplicateExtractedSections(
     std::vector<Function *> &FuncsToRemove, unsigned &OutlinedFunctionNum) {
   createFunction(M, CurrentGroup, OutlinedFunctionNum);
 
-  std::vector<BasicBlock *> OutputStoreBBs;
+  std::vector<DenseMap<Value *, BasicBlock *>> OutputStoreBBs;
 
   OutlinableRegion *CurrentOS;
 
   fillOverallFunction(M, CurrentGroup, OutputStoreBBs, FuncsToRemove);
 
+  std::vector<Value *> SortedKeys;
   for (unsigned Idx = 1; Idx < CurrentGroup.Regions.size(); Idx++) {
     CurrentOS = CurrentGroup.Regions[Idx];
     AttributeFuncs::mergeAttributesForOutlining(*CurrentGroup.OutlinedFunction,
                                                *CurrentOS->ExtractedFunction);
 
-    // Create a new BasicBlock to hold the needed store instructions.
-    BasicBlock *NewBB = BasicBlock::Create(
-        M.getContext(), "output_block_" + std::to_string(Idx),
-        CurrentGroup.OutlinedFunction);
-    replaceArgumentUses(*CurrentOS, NewBB);
+    // Create a set of BasicBlocks, one for each return block, to hold the
+    // needed store instructions.
+    DenseMap<Value *, BasicBlock *> NewBBs;
+    createAndInsertBasicBlocks(
+        CurrentGroup.EndBBs, NewBBs, CurrentGroup.OutlinedFunction,
+        "output_block_" + Twine(static_cast<unsigned>(Idx)));
 
-    alignOutputBlockWithAggFunc(CurrentGroup, *CurrentOS, NewBB,
-                                CurrentGroup.EndBB, OutputMappings,
+    replaceArgumentUses(*CurrentOS, NewBBs);
+    alignOutputBlockWithAggFunc(CurrentGroup, *CurrentOS, NewBBs,
+                                CurrentGroup.EndBBs, OutputMappings,
                                 OutputStoreBBs);
 
     CurrentOS->Call = replaceCalledFunction(M, *CurrentOS);
@@ -1405,7 +1607,7 @@ void IROutliner::deduplicateExtractedSections(
   }
 
   // Create a switch statement to handle the 
diff erent output schemes.
-  createSwitchStatement(M, CurrentGroup, CurrentGroup.EndBB, OutputStoreBBs);
+  createSwitchStatement(M, CurrentGroup, CurrentGroup.EndBBs, OutputStoreBBs);
 
   OutlinedFunctionNum++;
 }

diff  --git a/llvm/test/Transforms/IROutliner/outlining-exits-to-phi-node.ll b/llvm/test/Transforms/IROutliner/outlining-exits-to-phi-node.ll
new file mode 100644
index 0000000000000..00c99488e231d
--- /dev/null
+++ b/llvm/test/Transforms/IROutliner/outlining-exits-to-phi-node.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s
+
+; Show that we do not extract similar regions that would involve the splitting
+; of phi nodes on exit.
+
+define void @function1(i32* %a, i32* %b) {
+entry:
+  %0 = alloca i32, align 4
+  %c = load i32, i32* %0, align 4
+  br label %test1
+test1:
+  %e = load i32, i32* %0, align 4
+  br label %first
+test:
+  %d = load i32, i32* %0, align 4
+  br label %first
+first:
+  %1 = phi i32 [ %c, %test ], [ %e, %test1 ]
+  ret void
+}
+
+define void @function2(i32* %a, i32* %b) {
+entry:
+  %0 = alloca i32, align 4
+  %c = load i32, i32* %0, align 4
+  br label %test1
+test1:
+  %e = load i32, i32* %0, align 4
+  br label %first
+test:
+  %d = load i32, i32* %0, align 4
+  br label %first
+first:
+  %1 = phi i32 [ %c, %test ], [ %e, %test1 ]
+  ret void
+}
+; CHECK-LABEL: @function1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[C:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br label [[TEST1:%.*]]
+; CHECK:       test1:
+; CHECK-NEXT:    [[E:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br label [[FIRST:%.*]]
+; CHECK:       test:
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[TMP0]])
+; CHECK-NEXT:    br label [[FIRST]]
+; CHECK:       first:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[C]], [[TEST:%.*]] ], [ [[E]], [[TEST1]] ]
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: @function2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[C:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br label [[TEST1:%.*]]
+; CHECK:       test1:
+; CHECK-NEXT:    [[E:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br label [[FIRST:%.*]]
+; CHECK:       test:
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[TMP0]])
+; CHECK-NEXT:    br label [[FIRST]]
+; CHECK:       first:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[C]], [[TEST:%.*]] ], [ [[E]], [[TEST1]] ]
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK: define internal void @outlined_ir_func_0(
+; CHECK-NEXT:  newFuncRoot:
+; CHECK-NEXT:    br label [[TEST_TO_OUTLINE:%.*]]
+; CHECK:       test_to_outline:
+; CHECK-NEXT:    [[D:%.*]] = load i32, i32* [[TMP0:%.*]], align 4
+; CHECK-NEXT:    br label [[FIRST_EXITSTUB:%.*]]
+; CHECK:       first.exitStub:
+; CHECK-NEXT:    ret void
+;

diff  --git a/llvm/test/Transforms/IROutliner/outlining-multiple-exits-
diff -outputs.ll b/llvm/test/Transforms/IROutliner/outlining-multiple-exits-
diff -outputs.ll
new file mode 100644
index 0000000000000..aa9b7f2fc5933
--- /dev/null
+++ b/llvm/test/Transforms/IROutliner/outlining-multiple-exits-
diff -outputs.ll
@@ -0,0 +1,229 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s
+
+; Here we have multiple exits, but the 
diff erent sources, 
diff erent outputs are
+; needed, this checks that they are handled by separate switch statements.
+
+define void @outline_outputs1() #0 {
+entry:
+  %output = alloca i32, align 4
+  %result = alloca i32, align 4
+  %output2 = alloca i32, align 4
+  %result2 = alloca i32, align 4
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  br label %block_2
+block_1:
+  %a2 = alloca i32, align 4
+  %b2 = alloca i32, align 4
+  br label %block_2
+block_2:
+  %a2val = load i32, i32* %a
+  %b2val = load i32, i32* %b
+  %add2 = add i32 2, %a2val
+  %mul2 = mul i32 2, %b2val
+  br label %block_5
+block_3:
+  %aval = load i32, i32* %a
+  %bval = load i32, i32* %b
+  %add = add i32 2, %aval
+  %mul = mul i32 2, %bval
+  br label %block_4
+block_4:
+  store i32 %add, i32* %output, align 4
+  store i32 %mul, i32* %result, align 4
+  br label %block_6
+block_5:
+  store i32 %add2, i32* %output, align 4
+  store i32 %mul2, i32* %result, align 4
+  br label %block_7
+block_6:
+  %div = udiv i32 %aval, %bval
+  ret void
+block_7:
+  %sub = sub i32 %a2val, %b2val
+  ret void
+}
+
+define void @outline_outputs2() #0 {
+entry:
+  %output = alloca i32, align 4
+  %result = alloca i32, align 4
+  %output2 = alloca i32, align 4
+  %result2 = alloca i32, align 4
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  br label %block_2
+block_1:
+  %a2 = alloca i32, align 4
+  %b2 = alloca i32, align 4
+  br label %block_2
+block_2:
+  %a2val = load i32, i32* %a
+  %b2val = load i32, i32* %b
+  %add2 = add i32 2, %a2val
+  %mul2 = mul i32 2, %b2val
+  br label %block_5
+block_3:
+  %aval = load i32, i32* %a
+  %bval = load i32, i32* %b
+  %add = add i32 2, %aval
+  %mul = mul i32 2, %bval
+  br label %block_4
+block_4:
+  store i32 %add, i32* %output, align 4
+  store i32 %mul, i32* %result, align 4
+  br label %block_7
+block_5:
+  store i32 %add2, i32* %output, align 4
+  store i32 %mul2, i32* %result, align 4
+  br label %block_6
+block_6:
+  %
diff  = sub i32 %a2val, %b2val
+  ret void
+block_7:
+  %quot = udiv i32 %add, %mul
+  ret void
+}
+; CHECK-LABEL: @outline_outputs1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[BVAL_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[AVAL_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B2VAL_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A2VAL_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    br label [[BLOCK_2:%.*]]
+; CHECK:       block_1:
+; CHECK-NEXT:    [[A2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    br label [[BLOCK_2]]
+; CHECK:       block_2:
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[A2VAL_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[LT_CAST1:%.*]] = bitcast i32* [[B2VAL_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    [[LT_CAST2:%.*]] = bitcast i32* [[AVAL_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST2]])
+; CHECK-NEXT:    [[LT_CAST3:%.*]] = bitcast i32* [[BVAL_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST3]])
+; CHECK-NEXT:    [[TMP0:%.*]] = call i1 @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[RESULT]], i32* [[A2VAL_LOC]], i32* [[B2VAL_LOC]], i32* [[AVAL_LOC]], i32* [[BVAL_LOC]], i32 0)
+; CHECK-NEXT:    [[A2VAL_RELOAD:%.*]] = load i32, i32* [[A2VAL_LOC]], align 4
+; CHECK-NEXT:    [[B2VAL_RELOAD:%.*]] = load i32, i32* [[B2VAL_LOC]], align 4
+; CHECK-NEXT:    [[AVAL_RELOAD:%.*]] = load i32, i32* [[AVAL_LOC]], align 4
+; CHECK-NEXT:    [[BVAL_RELOAD:%.*]] = load i32, i32* [[BVAL_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST2]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST3]])
+; CHECK-NEXT:    br i1 [[TMP0]], label [[BLOCK_6:%.*]], label [[BLOCK_7:%.*]]
+; CHECK:       block_6:
+; CHECK-NEXT:    [[DIV:%.*]] = udiv i32 [[AVAL_RELOAD]], [[BVAL_RELOAD]]
+; CHECK-NEXT:    ret void
+; CHECK:       block_7:
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[A2VAL_RELOAD]], [[B2VAL_RELOAD]]
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: @outline_outputs2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MUL_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[ADD_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B2VAL_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A2VAL_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    br label [[BLOCK_2:%.*]]
+; CHECK:       block_1:
+; CHECK-NEXT:    [[A2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    br label [[BLOCK_2]]
+; CHECK:       block_2:
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[A2VAL_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[LT_CAST1:%.*]] = bitcast i32* [[B2VAL_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    [[LT_CAST2:%.*]] = bitcast i32* [[ADD_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST2]])
+; CHECK-NEXT:    [[LT_CAST3:%.*]] = bitcast i32* [[MUL_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST3]])
+; CHECK-NEXT:    [[TMP0:%.*]] = call i1 @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[RESULT]], i32* [[A2VAL_LOC]], i32* [[B2VAL_LOC]], i32* [[ADD_LOC]], i32* [[MUL_LOC]], i32 1)
+; CHECK-NEXT:    [[A2VAL_RELOAD:%.*]] = load i32, i32* [[A2VAL_LOC]], align 4
+; CHECK-NEXT:    [[B2VAL_RELOAD:%.*]] = load i32, i32* [[B2VAL_LOC]], align 4
+; CHECK-NEXT:    [[ADD_RELOAD:%.*]] = load i32, i32* [[ADD_LOC]], align 4
+; CHECK-NEXT:    [[MUL_RELOAD:%.*]] = load i32, i32* [[MUL_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST2]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST3]])
+; CHECK-NEXT:    br i1 [[TMP0]], label [[BLOCK_7:%.*]], label [[BLOCK_6:%.*]]
+; CHECK:       block_6:
+; CHECK-NEXT:    [[DIFF:%.*]] = sub i32 [[A2VAL_RELOAD]], [[B2VAL_RELOAD]]
+; CHECK-NEXT:    ret void
+; CHECK:       block_7:
+; CHECK-NEXT:    [[QUOT:%.*]] = udiv i32 [[ADD_RELOAD]], [[MUL_RELOAD]]
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK: define internal i1 @outlined_ir_func_0(
+; CHECK-NEXT:  newFuncRoot:
+; CHECK-NEXT:    br label [[BLOCK_2_TO_OUTLINE:%.*]]
+; CHECK:       block_2_to_outline:
+; CHECK-NEXT:    [[A2VAL:%.*]] = load i32, i32* [[TMP0:%.*]], align 4
+; CHECK-NEXT:    [[B2VAL:%.*]] = load i32, i32* [[TMP1:%.*]], align 4
+; CHECK-NEXT:    [[ADD2:%.*]] = add i32 2, [[A2VAL]]
+; CHECK-NEXT:    [[MUL2:%.*]] = mul i32 2, [[B2VAL]]
+; CHECK-NEXT:    br label [[BLOCK_5:%.*]]
+; CHECK:       block_3:
+; CHECK-NEXT:    [[AVAL:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    [[BVAL:%.*]] = load i32, i32* [[TMP1]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 2, [[AVAL]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 2, [[BVAL]]
+; CHECK-NEXT:    br label [[BLOCK_4:%.*]]
+; CHECK:       block_4:
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[TMP2:%.*]], align 4
+; CHECK-NEXT:    store i32 [[MUL]], i32* [[TMP3:%.*]], align 4
+; CHECK-NEXT:    br label [[BLOCK_6_EXITSTUB:%.*]]
+; CHECK:       block_5:
+; CHECK-NEXT:    store i32 [[ADD2]], i32* [[TMP2]], align 4
+; CHECK-NEXT:    store i32 [[MUL2]], i32* [[TMP3]], align 4
+; CHECK-NEXT:    br label [[BLOCK_7_EXITSTUB:%.*]]
+; CHECK:       block_6.exitStub:
+; CHECK-NEXT:    switch i32 [[TMP8:%.*]], label [[FINAL_BLOCK_1:%.*]] [
+; CHECK-NEXT:    i32 0, label [[OUTPUT_BLOCK_0_1:%.*]]
+; CHECK-NEXT:    i32 1, label [[OUTPUT_BLOCK_1_1:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       block_7.exitStub:
+; CHECK-NEXT:    switch i32 [[TMP8]], label [[FINAL_BLOCK_0:%.*]] [
+; CHECK-NEXT:    i32 0, label [[OUTPUT_BLOCK_0_0:%.*]]
+; CHECK-NEXT:    i32 1, label [[OUTPUT_BLOCK_1_0:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       output_block_0_0:
+; CHECK-NEXT:    store i32 [[A2VAL]], i32* [[TMP4:%.*]], align 4
+; CHECK-NEXT:    store i32 [[B2VAL]], i32* [[TMP5:%.*]], align 4
+; CHECK-NEXT:    br label [[FINAL_BLOCK_0]]
+; CHECK:       output_block_0_1:
+; CHECK-NEXT:    store i32 [[AVAL]], i32* [[TMP6:%.*]], align 4
+; CHECK-NEXT:    store i32 [[BVAL]], i32* [[TMP7:%.*]], align 4
+; CHECK-NEXT:    br label [[FINAL_BLOCK_1]]
+; CHECK:       output_block_1_0:
+; CHECK-NEXT:    store i32 [[A2VAL]], i32* [[TMP4]], align 4
+; CHECK-NEXT:    store i32 [[B2VAL]], i32* [[TMP5]], align 4
+; CHECK-NEXT:    br label [[FINAL_BLOCK_0]]
+; CHECK:       output_block_1_1:
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[TMP6]], align 4
+; CHECK-NEXT:    store i32 [[MUL]], i32* [[TMP7]], align 4
+; CHECK-NEXT:    br label [[FINAL_BLOCK_1]]
+; CHECK:       final_block_0:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       final_block_1:
+; CHECK-NEXT:    ret i1 true
+;

diff  --git a/llvm/test/Transforms/IROutliner/outlining-multiple-exits-one-output-set.ll b/llvm/test/Transforms/IROutliner/outlining-multiple-exits-one-output-set.ll
new file mode 100644
index 0000000000000..92ad123d9ce22
--- /dev/null
+++ b/llvm/test/Transforms/IROutliner/outlining-multiple-exits-one-output-set.ll
@@ -0,0 +1,196 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s
+
+; Here we have multiple exits, but 
diff erent sources, and only one has an
+; output set.  We check to make sure that we do not generated extra output
+; blocks or entries in the switch statement.
+
+define void @outline_outputs1() #0 {
+entry:
+  %output = alloca i32, align 4
+  %result = alloca i32, align 4
+  %output2 = alloca i32, align 4
+  %result2 = alloca i32, align 4
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  br label %block_2
+block_1:
+  %a2 = alloca i32, align 4
+  %b2 = alloca i32, align 4
+  br label %block_2
+block_2:
+  %a2val = load i32, i32* %a
+  %b2val = load i32, i32* %b
+  %add2 = add i32 2, %a2val
+  %mul2 = mul i32 2, %b2val
+  br label %block_5
+block_3:
+  %aval = load i32, i32* %a
+  %bval = load i32, i32* %b
+  %add = add i32 2, %aval
+  %mul = mul i32 2, %bval
+  br label %block_4
+block_4:
+  store i32 %add, i32* %output, align 4
+  store i32 %mul, i32* %result, align 4
+  br label %block_6
+block_5:
+  store i32 %add2, i32* %output, align 4
+  store i32 %mul2, i32* %result, align 4
+  br label %block_7
+block_6:
+  ret void
+block_7:
+  ret void
+}
+
+define void @outline_outputs2() #0 {
+entry:
+  %output = alloca i32, align 4
+  %result = alloca i32, align 4
+  %output2 = alloca i32, align 4
+  %result2 = alloca i32, align 4
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  br label %block_2
+block_1:
+  %a2 = alloca i32, align 4
+  %b2 = alloca i32, align 4
+  br label %block_2
+block_2:
+  %a2val = load i32, i32* %a
+  %b2val = load i32, i32* %b
+  %add2 = add i32 2, %a2val
+  %mul2 = mul i32 2, %b2val
+  br label %block_5
+block_3:
+  %aval = load i32, i32* %a
+  %bval = load i32, i32* %b
+  %add = add i32 2, %aval
+  %mul = mul i32 2, %bval
+  br label %block_4
+block_4:
+  store i32 %add, i32* %output, align 4
+  store i32 %mul, i32* %result, align 4
+  br label %block_7
+block_5:
+  store i32 %add2, i32* %output, align 4
+  store i32 %mul2, i32* %result, align 4
+  br label %block_6
+block_6:
+  %
diff  = sub i32 %a2val, %b2val
+  ret void
+block_7:
+  %quot = udiv i32 %add, %mul
+  ret void
+}
+; CHECK-LABEL: @outline_outputs1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[OUTPUT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    br label [[BLOCK_2:%.*]]
+; CHECK:       block_1:
+; CHECK-NEXT:    [[A2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    br label [[BLOCK_2]]
+; CHECK:       block_2:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i1 @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[RESULT]], i32* null, i32* null, i32* null, i32* null, i32 -1)
+; CHECK-NEXT:    br i1 [[TMP0]], label [[BLOCK_6:%.*]], label [[BLOCK_7:%.*]]
+; CHECK:       block_6:
+; CHECK-NEXT:    ret void
+; CHECK:       block_7:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: @outline_outputs2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MUL_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[ADD_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B2VAL_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A2VAL_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    br label [[BLOCK_2:%.*]]
+; CHECK:       block_1:
+; CHECK-NEXT:    [[A2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    br label [[BLOCK_2]]
+; CHECK:       block_2:
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[A2VAL_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[LT_CAST1:%.*]] = bitcast i32* [[B2VAL_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    [[LT_CAST2:%.*]] = bitcast i32* [[ADD_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST2]])
+; CHECK-NEXT:    [[LT_CAST3:%.*]] = bitcast i32* [[MUL_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST3]])
+; CHECK-NEXT:    [[TMP0:%.*]] = call i1 @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[RESULT]], i32* [[A2VAL_LOC]], i32* [[B2VAL_LOC]], i32* [[ADD_LOC]], i32* [[MUL_LOC]], i32 0)
+; CHECK-NEXT:    [[A2VAL_RELOAD:%.*]] = load i32, i32* [[A2VAL_LOC]], align 4
+; CHECK-NEXT:    [[B2VAL_RELOAD:%.*]] = load i32, i32* [[B2VAL_LOC]], align 4
+; CHECK-NEXT:    [[ADD_RELOAD:%.*]] = load i32, i32* [[ADD_LOC]], align 4
+; CHECK-NEXT:    [[MUL_RELOAD:%.*]] = load i32, i32* [[MUL_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST2]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST3]])
+; CHECK-NEXT:    br i1 [[TMP0]], label [[BLOCK_7:%.*]], label [[BLOCK_6:%.*]]
+; CHECK:       block_6:
+; CHECK-NEXT:    [[DIFF:%.*]] = sub i32 [[A2VAL_RELOAD]], [[B2VAL_RELOAD]]
+; CHECK-NEXT:    ret void
+; CHECK:       block_7:
+; CHECK-NEXT:    [[QUOT:%.*]] = udiv i32 [[ADD_RELOAD]], [[MUL_RELOAD]]
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK: define internal i1 @outlined_ir_func_0(
+; CHECK-NEXT:  newFuncRoot:
+; CHECK-NEXT:    br label [[BLOCK_2_TO_OUTLINE:%.*]]
+; CHECK:       block_2_to_outline:
+; CHECK-NEXT:    [[A2VAL:%.*]] = load i32, i32* [[TMP0:%.*]], align 4
+; CHECK-NEXT:    [[B2VAL:%.*]] = load i32, i32* [[TMP1:%.*]], align 4
+; CHECK-NEXT:    [[ADD2:%.*]] = add i32 2, [[A2VAL]]
+; CHECK-NEXT:    [[MUL2:%.*]] = mul i32 2, [[B2VAL]]
+; CHECK-NEXT:    br label [[BLOCK_5:%.*]]
+; CHECK:       block_3:
+; CHECK-NEXT:    [[AVAL:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    [[BVAL:%.*]] = load i32, i32* [[TMP1]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 2, [[AVAL]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 2, [[BVAL]]
+; CHECK-NEXT:    br label [[BLOCK_4:%.*]]
+; CHECK:       block_4:
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[TMP2:%.*]], align 4
+; CHECK-NEXT:    store i32 [[MUL]], i32* [[TMP3:%.*]], align 4
+; CHECK-NEXT:    br label [[BLOCK_6_EXITSTUB:%.*]]
+; CHECK:       block_5:
+; CHECK-NEXT:    store i32 [[ADD2]], i32* [[TMP2]], align 4
+; CHECK-NEXT:    store i32 [[MUL2]], i32* [[TMP3]], align 4
+; CHECK-NEXT:    br label [[BLOCK_7_EXITSTUB:%.*]]
+; CHECK:       block_6.exitStub:
+; CHECK-NEXT:    switch i32 [[TMP8:%.*]], label [[FINAL_BLOCK_1:%.*]] [
+; CHECK-NEXT:    i32 0, label [[OUTPUT_BLOCK_1_1:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       block_7.exitStub:
+; CHECK-NEXT:    switch i32 [[TMP8]], label [[FINAL_BLOCK_0:%.*]] [
+; CHECK-NEXT:    i32 0, label [[OUTPUT_BLOCK_1_0:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       output_block_1_0:
+; CHECK-NEXT:    store i32 [[A2VAL]], i32* [[TMP4:%.*]], align 4
+; CHECK-NEXT:    store i32 [[B2VAL]], i32* [[TMP5:%.*]], align 4
+; CHECK-NEXT:    br label [[FINAL_BLOCK_0]]
+; CHECK:       output_block_1_1:
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[TMP6:%.*]], align 4
+; CHECK-NEXT:    store i32 [[MUL]], i32* [[TMP7:%.*]], align 4
+; CHECK-NEXT:    br label [[FINAL_BLOCK_1]]
+; CHECK:       final_block_0:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       final_block_1:
+; CHECK-NEXT:    ret i1 true
+;

diff  --git a/llvm/test/Transforms/IROutliner/outlining-multiple-exits.ll b/llvm/test/Transforms/IROutliner/outlining-multiple-exits.ll
index fbaf0efb9848d..210da7b042e14 100644
--- a/llvm/test/Transforms/IROutliner/outlining-multiple-exits.ll
+++ b/llvm/test/Transforms/IROutliner/outlining-multiple-exits.ll
@@ -1,8 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs
 ; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s
 
-; Here we have multiple exits, but we can't actually outline anything but
-; single entry and single exits yet, we check to make sure it doesn't happen.
+; Here we have multiple exits, but the 
diff erent sources, same outputs are
+; needed, this checks that they are compressed, and moved into the appropriate
+; output blocks.
 
 define void @outline_outputs1() #0 {
 entry:
@@ -87,6 +88,10 @@ block_7:
 }
 ; CHECK-LABEL: @outline_outputs1(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[BVAL_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[AVAL_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B2VAL_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A2VAL_LOC:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[OUTPUT:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[RESULT:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[OUTPUT2:%.*]] = alloca i32, align 4
@@ -99,34 +104,38 @@ block_7:
 ; CHECK-NEXT:    [[B2:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    br label [[BLOCK_2]]
 ; CHECK:       block_2:
-; CHECK-NEXT:    [[A2VAL:%.*]] = load i32, i32* [[A]], align 4
-; CHECK-NEXT:    [[B2VAL:%.*]] = load i32, i32* [[B]], align 4
-; CHECK-NEXT:    [[ADD2:%.*]] = add i32 2, [[A2VAL]]
-; CHECK-NEXT:    [[MUL2:%.*]] = mul i32 2, [[B2VAL]]
-; CHECK-NEXT:    br label [[BLOCK_5:%.*]]
-; CHECK:       block_3:
-; CHECK-NEXT:    [[AVAL:%.*]] = load i32, i32* [[A]], align 4
-; CHECK-NEXT:    [[BVAL:%.*]] = load i32, i32* [[B]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 2, [[AVAL]]
-; CHECK-NEXT:    [[MUL:%.*]] = mul i32 2, [[BVAL]]
-; CHECK-NEXT:    br label [[BLOCK_4:%.*]]
-; CHECK:       block_4:
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[OUTPUT]], align 4
-; CHECK-NEXT:    store i32 [[MUL]], i32* [[RESULT]], align 4
-; CHECK-NEXT:    br label [[BLOCK_6:%.*]]
-; CHECK:       block_5:
-; CHECK-NEXT:    call void @outlined_ir_func_0(i32 [[ADD2]], i32* [[OUTPUT]], i32 [[MUL2]], i32* [[RESULT]])
-; CHECK-NEXT:    br label [[BLOCK_7:%.*]]
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[A2VAL_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[LT_CAST1:%.*]] = bitcast i32* [[B2VAL_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    [[LT_CAST2:%.*]] = bitcast i32* [[AVAL_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST2]])
+; CHECK-NEXT:    [[LT_CAST3:%.*]] = bitcast i32* [[BVAL_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST3]])
+; CHECK-NEXT:    [[TARGETBLOCK:%.*]] = call i1 @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[RESULT]], i32* [[A2VAL_LOC]], i32* [[B2VAL_LOC]], i32* [[AVAL_LOC]], i32* [[BVAL_LOC]])
+; CHECK-NEXT:    [[A2VAL_RELOAD:%.*]] = load i32, i32* [[A2VAL_LOC]], align 4
+; CHECK-NEXT:    [[B2VAL_RELOAD:%.*]] = load i32, i32* [[B2VAL_LOC]], align 4
+; CHECK-NEXT:    [[AVAL_RELOAD:%.*]] = load i32, i32* [[AVAL_LOC]], align 4
+; CHECK-NEXT:    [[BVAL_RELOAD:%.*]] = load i32, i32* [[BVAL_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST2]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST3]])
+; CHECK-NEXT:    br i1 [[TARGETBLOCK]], label [[BLOCK_6:%.*]], label [[BLOCK_7:%.*]]
 ; CHECK:       block_6:
-; CHECK-NEXT:    [[DIV:%.*]] = udiv i32 [[AVAL]], [[BVAL]]
+; CHECK-NEXT:    [[DIV:%.*]] = udiv i32 [[AVAL_RELOAD]], [[BVAL_RELOAD]]
 ; CHECK-NEXT:    ret void
 ; CHECK:       block_7:
-; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[A2VAL]], [[B2VAL]]
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[A2VAL_RELOAD]], [[B2VAL_RELOAD]]
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: @outline_outputs2(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[BVAL_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[AVAL_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B2VAL_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A2VAL_LOC:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[OUTPUT:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[RESULT:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[OUTPUT2:%.*]] = alloca i32, align 4
@@ -139,39 +148,61 @@ block_7:
 ; CHECK-NEXT:    [[B2:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    br label [[BLOCK_2]]
 ; CHECK:       block_2:
-; CHECK-NEXT:    [[A2VAL:%.*]] = load i32, i32* [[A]], align 4
-; CHECK-NEXT:    [[B2VAL:%.*]] = load i32, i32* [[B]], align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[A2VAL_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[LT_CAST1:%.*]] = bitcast i32* [[B2VAL_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    [[LT_CAST2:%.*]] = bitcast i32* [[AVAL_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST2]])
+; CHECK-NEXT:    [[LT_CAST3:%.*]] = bitcast i32* [[BVAL_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST3]])
+; CHECK-NEXT:    [[TARGETBLOCK:%.*]] = call i1 @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[RESULT]], i32* [[A2VAL_LOC]], i32* [[B2VAL_LOC]], i32* [[AVAL_LOC]], i32* [[BVAL_LOC]])
+; CHECK-NEXT:    [[A2VAL_RELOAD:%.*]] = load i32, i32* [[A2VAL_LOC]], align 4
+; CHECK-NEXT:    [[B2VAL_RELOAD:%.*]] = load i32, i32* [[B2VAL_LOC]], align 4
+; CHECK-NEXT:    [[AVAL_RELOAD:%.*]] = load i32, i32* [[AVAL_LOC]], align 4
+; CHECK-NEXT:    [[BVAL_RELOAD:%.*]] = load i32, i32* [[BVAL_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST2]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST3]])
+; CHECK-NEXT:    br i1 [[TARGETBLOCK]], label [[BLOCK_7:%.*]], label [[BLOCK_6:%.*]]
+; CHECK:       block_6:
+; CHECK-NEXT:    [[DIFF:%.*]] = sub i32 [[A2VAL_RELOAD]], [[B2VAL_RELOAD]]
+; CHECK-NEXT:    ret void
+; CHECK:       block_7:
+; CHECK-NEXT:    [[QUOT:%.*]] = udiv i32 [[AVAL_RELOAD]], [[BVAL_RELOAD]]
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK: define internal i1 @outlined_ir_func_0(
+; CHECK-NEXT:  newFuncRoot:
+; CHECK-NEXT:    br label [[BLOCK_2_TO_OUTLINE:%.*]]
+; CHECK:       block_2_to_outline:
+; CHECK-NEXT:    [[A2VAL:%.*]] = load i32, i32* [[TMP0:%.*]], align 4
+; CHECK-NEXT:    [[B2VAL:%.*]] = load i32, i32* [[TMP1:%.*]], align 4
 ; CHECK-NEXT:    [[ADD2:%.*]] = add i32 2, [[A2VAL]]
 ; CHECK-NEXT:    [[MUL2:%.*]] = mul i32 2, [[B2VAL]]
 ; CHECK-NEXT:    br label [[BLOCK_5:%.*]]
 ; CHECK:       block_3:
-; CHECK-NEXT:    [[AVAL:%.*]] = load i32, i32* [[A]], align 4
-; CHECK-NEXT:    [[BVAL:%.*]] = load i32, i32* [[B]], align 4
+; CHECK-NEXT:    [[AVAL:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    [[BVAL:%.*]] = load i32, i32* [[TMP1]], align 4
 ; CHECK-NEXT:    [[ADD:%.*]] = add i32 2, [[AVAL]]
 ; CHECK-NEXT:    [[MUL:%.*]] = mul i32 2, [[BVAL]]
 ; CHECK-NEXT:    br label [[BLOCK_4:%.*]]
 ; CHECK:       block_4:
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[OUTPUT]], align 4
-; CHECK-NEXT:    store i32 [[MUL]], i32* [[RESULT]], align 4
-; CHECK-NEXT:    br label [[BLOCK_7:%.*]]
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[TMP2:%.*]], align 4
+; CHECK-NEXT:    store i32 [[MUL]], i32* [[TMP3:%.*]], align 4
+; CHECK-NEXT:    br label [[BLOCK_6_EXITSTUB:%.*]]
 ; CHECK:       block_5:
-; CHECK-NEXT:    call void @outlined_ir_func_0(i32 [[ADD2]], i32* [[OUTPUT]], i32 [[MUL2]], i32* [[RESULT]])
-; CHECK-NEXT:    br label [[BLOCK_6:%.*]]
-; CHECK:       block_6:
-; CHECK-NEXT:    [[DIFF:%.*]] = sub i32 [[A2VAL]], [[B2VAL]]
-; CHECK-NEXT:    ret void
-; CHECK:       block_7:
-; CHECK-NEXT:    [[QUOT:%.*]] = udiv i32 [[AVAL]], [[BVAL]]
-; CHECK-NEXT:    ret void
-;
-;
-; CHECK: define internal void @outlined_ir_func_0(
-; CHECK-NEXT:  newFuncRoot:
-; CHECK-NEXT:    br label [[BLOCK_5_TO_OUTLINE:%.*]]
-; CHECK:       block_5_to_outline:
-; CHECK-NEXT:    store i32 [[TMP0:%.*]], i32* [[TMP1:%.*]], align 4
-; CHECK-NEXT:    store i32 [[TMP2:%.*]], i32* [[TMP3:%.*]], align 4
+; CHECK-NEXT:    store i32 [[ADD2]], i32* [[TMP2]], align 4
+; CHECK-NEXT:    store i32 [[MUL2]], i32* [[TMP3]], align 4
 ; CHECK-NEXT:    br label [[BLOCK_7_EXITSTUB:%.*]]
+; CHECK:       block_6.exitStub:
+; CHECK-NEXT:    store i32 [[AVAL]], i32* [[TMP6:%.*]], align 4
+; CHECK-NEXT:    store i32 [[BVAL]], i32* [[TMP7:%.*]], align 4
+; CHECK-NEXT:    ret i1 true
 ; CHECK:       block_7.exitStub:
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    store i32 [[A2VAL]], i32* [[TMP4:%.*]], align 4
+; CHECK-NEXT:    store i32 [[B2VAL]], i32* [[TMP5:%.*]], align 4
+; CHECK-NEXT:    ret i1 false
 ;