[llvm-branch-commits] [llvm] e6ae623 - [IROutliner] Adding support for consolidating functions with different output arguments.

Mon Dec 28 14:21:53 PST 2020

Author: Andrew Litteken
Date: 2020-12-28T16:17:07-06:00
New Revision: e6ae623314bab3ddd983ed941bf63a6d4c63a1f4

URL: https://github.com/llvm/llvm-project/commit/e6ae623314bab3ddd983ed941bf63a6d4c63a1f4
DIFF: https://github.com/llvm/llvm-project/commit/e6ae623314bab3ddd983ed941bf63a6d4c63a1f4.diff

LOG: [IROutliner] Adding support for consolidating functions with different output arguments.

Certain regions can have values introduced inside the region that are
used outside of the region. These may not be the same for each similar
region, so we must create one over arching set of arguments for the
consolidated function.

We do this by iterating over the outputs for each extracted function,
and creating as many different arguments to encapsulate the different
outputs sets. For each output set, we create a different block with the
necessary stores from the value to the output register. There is then
one switch statement, controlled by an argument to the function, to
differentiate which block to use.

Changed Tests for consistency:
llvm/test/Transforms/IROutliner/extraction.ll
llvm/test/Transforms/IROutliner/illegal-assumes.ll
llvm/test/Transforms/IROutliner/illegal-memcpy.ll
llvm/test/Transforms/IROutliner/illegal-memmove.ll
llvm/test/Transforms/IROutliner/illegal-vaarg.ll

Tests to test new functionality:
llvm/test/Transforms/IROutliner/outlining-different-output-blocks.ll
llvm/test/Transforms/IROutliner/outlining-remapped-outputs.ll
llvm/test/Transforms/IROutliner/outlining-same-output-blocks.ll

Reviewers: jroelofs, paquette

Differential Revision: https://reviews.llvm.org/D87296

Added: 
    llvm/test/Transforms/IROutliner/outlining-different-output-blocks.ll
    llvm/test/Transforms/IROutliner/outlining-remapped-outputs.ll
    llvm/test/Transforms/IROutliner/outlining-same-output-blocks.ll

Modified: 
    llvm/include/llvm/Transforms/IPO/IROutliner.h
    llvm/lib/Transforms/IPO/IROutliner.cpp
    llvm/test/Transforms/IROutliner/extraction.ll
    llvm/test/Transforms/IROutliner/illegal-assumes.ll
    llvm/test/Transforms/IROutliner/illegal-memcpy.ll
    llvm/test/Transforms/IROutliner/illegal-memmove.ll
    llvm/test/Transforms/IROutliner/illegal-vaarg.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Transforms/IPO/IROutliner.h b/llvm/include/llvm/Transforms/IPO/IROutliner.h
index 87f276d82df7..2048d6d6d1a1 100644

--- a/llvm/include/llvm/Transforms/IPO/IROutliner.h
+++ b/llvm/include/llvm/Transforms/IPO/IROutliner.h
@@ -73,6 +73,10 @@ struct OutlinableRegion {
   /// The number of extracted inputs from the CodeExtractor.
   unsigned NumExtractedInputs;
 
+  /// The corresponding BasicBlock with the appropriate stores for this
+  /// OutlinableRegion in the overall function.
+  unsigned OutputBlockNum;
+
   /// Mapping the extracted argument number to the argument number in the
   /// overall function.  Since there will be inputs, such as elevated constants
   /// that are not the same in each region in a SimilarityGroup, or values that
@@ -87,6 +91,11 @@ struct OutlinableRegion {
   /// since the CodeExtractor does not recognize constants.
   DenseMap<unsigned, Constant *> AggArgToConstant;
 
+  /// The global value numbers that are used as outputs for this section. Once
+  /// extracted, each output will be stored to an output register.  This
+  /// documents the global value numbers that are used in this pattern.
+  SmallVector<unsigned, 4> GVNStores;
+
   /// Used to create an outlined function.
   CodeExtractor *CE = nullptr;
 
@@ -192,6 +201,15 @@ class IROutliner {
   void findAddInputsOutputs(Module &M, OutlinableRegion &Region,
                             DenseSet<unsigned> &NotSame);
 
+  /// Update the output mapping based on the load instruction, and the outputs
+  /// of the extracted function.
+  ///
+  /// \param Region - The region extracted
+  /// \param Outputs - The outputs from the extracted function.
+  /// \param LI - The load instruction used to update the mapping.
+  void updateOutputMapping(OutlinableRegion &Region,
+                           ArrayRef<Value *> Outputs, LoadInst *LI);
+
   /// Extract \p Region into its own function.
   ///
   /// \param [in] Region - The region to be extracted into its own function.
@@ -218,6 +236,11 @@ class IROutliner {
   /// TargetTransformInfo lambda for target specific information.
   function_ref<TargetTransformInfo &(Function &)> getTTI;
 
+  /// A mapping from newly created reloaded output values to the original value.
+  /// If an value is replace by an output from an outlined region, this maps
+  /// that Value, back to its original Value.
+  DenseMap<Value *, Value *> OutputMappings;
+
   /// IRSimilarityIdentifier lambda to retrieve IRSimilarityIdentifier.
   function_ref<IRSimilarityIdentifier &(Module &)> getIRSI;
 

diff  --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp
index 12a30744a652..ec6bfaef26ec 100644
--- a/llvm/lib/Transforms/IPO/IROutliner.cpp
+++ b/llvm/lib/Transforms/IPO/IROutliner.cpp
@@ -50,6 +50,9 @@ struct OutlinableGroup {
   /// for extraction.
   bool IgnoreGroup = false;
 
+  /// The return block for the overall function.
+  BasicBlock *EndBB = nullptr;
+
   /// Flag for whether the \ref ArgumentTypes have been defined after the
   /// extraction of the first region.
   bool InputTypesSet = false;
@@ -343,18 +346,48 @@ static void findConstants(IRSimilarityCandidate &C, DenseSet<unsigned> &NotSame,
 /// CodeExtractor.
 /// \param [out] EndInputNumbers - The global value numbers for the extracted
 /// arguments.
+/// \param [in] OutputMappings - The mapping of values that have been replaced
+/// by a new output value.
+/// \param [out] EndInputs - The global value numbers for the extracted
+/// arguments.
 static void mapInputsToGVNs(IRSimilarityCandidate &C,
                             SetVector<Value *> &CurrentInputs,
+                            const DenseMap<Value *, Value *> &OutputMappings,
                             std::vector<unsigned> &EndInputNumbers) {
-  // Get the global value number for each input.
+  // Get the Global Value Number for each input.  We check if the Value has been
+  // replaced by a different value at output, and use the original value before
+  // replacement.
   for (Value *Input : CurrentInputs) {
     assert(Input && "Have a nullptr as an input");
+    if (OutputMappings.find(Input) != OutputMappings.end())
+      Input = OutputMappings.find(Input)->second;
     assert(C.getGVN(Input).hasValue() &&
            "Could not find a numbering for the given input");
     EndInputNumbers.push_back(C.getGVN(Input).getValue());
   }
 }
 
+/// Find the original value for the \p ArgInput values if any one of them was
+/// replaced during a previous extraction.
+///
+/// \param [in] ArgInputs - The inputs to be extracted by the code extractor.
+/// \param [in] OutputMappings - The mapping of values that have been replaced
+/// by a new output value.
+/// \param [out] RemappedArgInputs - The remapped values according to
+/// \p OutputMappings that will be extracted.
+static void
+remapExtractedInputs(const ArrayRef<Value *> ArgInputs,
+                     const DenseMap<Value *, Value *> &OutputMappings,
+                     SetVector<Value *> &RemappedArgInputs) {
+  // Get the global value number for each input that will be extracted as an
+  // argument by the code extractor, remapping if needed for reloaded values.
+  for (Value *Input : ArgInputs) {
+    if (OutputMappings.find(Input) != OutputMappings.end())
+      Input = OutputMappings.find(Input)->second;
+    RemappedArgInputs.insert(Input);
+  }
+}
+
 /// Find the input GVNs and the output values for a region of Instructions.
 /// Using the code extractor, we collect the inputs to the extracted function.
 ///
@@ -368,19 +401,25 @@ static void mapInputsToGVNs(IRSimilarityCandidate &C,
 /// \param [in] NotSame - The global value numbers in the region that do not
 /// have the same constant value in the regions structurally similar to
 /// \p Region.
+/// \param [in] OutputMappings - The mapping of values that have been replaced
+/// by a new output value after extraction.
 /// \param [out] ArgInputs - The values of the inputs to the extracted function.
-static void getCodeExtractorArguments(OutlinableRegion &Region,
-                                      std::vector<unsigned> &InputGVNs,
-                                      DenseSet<unsigned> &NotSame,
-                                      SetVector<Value *> &ArgInputs) {
+/// \param [out] Outputs - The set of values extracted by the CodeExtractor
+/// as outputs.
+static void getCodeExtractorArguments(
+    OutlinableRegion &Region, std::vector<unsigned> &InputGVNs,
+    DenseSet<unsigned> &NotSame, DenseMap<Value *, Value *> &OutputMappings,
+    SetVector<Value *> &ArgInputs, SetVector<Value *> &Outputs) {
   IRSimilarityCandidate &C = *Region.Candidate;
 
   // OverallInputs are the inputs to the region found by the CodeExtractor,
   // SinkCands and HoistCands are used by the CodeExtractor to find sunken
   // allocas of values whose lifetimes are contained completely within the
-  // outlined region. Outputs are values used outside of the outlined region
-  // found by the CodeExtractor.
-  SetVector<Value *> OverallInputs, SinkCands, HoistCands, Outputs;
+  // outlined region. PremappedInputs are the arguments found by the
+  // CodeExtractor, removing conditions such as sunken allocas, but that
+  // may need to be remapped due to the extracted output values replacing
+  // the original values.
+  SetVector<Value *> OverallInputs, PremappedInputs, SinkCands, HoistCands;
 
   // Use the code extractor to get the inputs and outputs, without sunken
   // allocas or removing llvm.assumes.
@@ -400,27 +439,24 @@ static void getCodeExtractorArguments(OutlinableRegion &Region,
 
   // Find if any values are going to be sunk into the function when extracted
   CE->findAllocas(CEAC, SinkCands, HoistCands, Dummy);
-  CE->findInputsOutputs(ArgInputs, Outputs, SinkCands);
-
-  // TODO: Support regions with output values.  Outputs add an extra layer of
-  // resolution that adds too much complexity at this stage.
-  if (Outputs.size() > 0) {
-    Region.IgnoreRegion = true;
-    return;
-  }
+  CE->findInputsOutputs(PremappedInputs, Outputs, SinkCands);
 
   // TODO: Support regions with sunken allocas: values whose lifetimes are
   // contained completely within the outlined region.  These are not guaranteed
   // to be the same in every region, so we must elevate them all to arguments
   // when they appear.  If these values are not equal, it means there is some
   // Input in OverallInputs that was removed for ArgInputs.
-  if (ArgInputs.size() != OverallInputs.size()) {
+  if (OverallInputs.size() != PremappedInputs.size()) {
     Region.IgnoreRegion = true;
     return;
   }
 
   findConstants(C, NotSame, InputGVNs);
-  mapInputsToGVNs(C, OverallInputs, InputGVNs);
+
+  mapInputsToGVNs(C, OverallInputs, OutputMappings, InputGVNs);
+
+  remapExtractedInputs(PremappedInputs.getArrayRef(), OutputMappings,
+                       ArgInputs);
 
   // Sort the GVNs, since we now have constants included in the \ref InputGVNs
   // we need to make sure they are in a deterministic order.
@@ -439,7 +475,7 @@ static void getCodeExtractorArguments(OutlinableRegion &Region,
 /// function.
 static void
 findExtractedInputToOverallInputMapping(OutlinableRegion &Region,
-                                        std::vector<unsigned> InputGVNs,
+                                        std::vector<unsigned> &InputGVNs,
                                         SetVector<Value *> &ArgInputs) {
 
   IRSimilarityCandidate &C = *Region.Candidate;
@@ -494,12 +530,82 @@ findExtractedInputToOverallInputMapping(OutlinableRegion &Region,
   Region.NumExtractedInputs = OriginalIndex;
 }
 
+/// Create a mapping of the output arguments for the \p Region to the output
+/// arguments of the overall outlined function.
+///
+/// \param [in,out] Region - The region of code to be analyzed.
+/// \param [in] Outputs - The values found by the code extractor.
+static void
+findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region,
+                                          ArrayRef<Value *> Outputs) {
+  OutlinableGroup &Group = *Region.Parent;
+  IRSimilarityCandidate &C = *Region.Candidate;
+
+  // This counts the argument number in the extracted function.
+  unsigned OriginalIndex = Region.NumExtractedInputs;
+
+  // This counts the argument number in the overall function.
+  unsigned TypeIndex = Group.NumAggregateInputs;
+  bool TypeFound;
+  DenseSet<unsigned> AggArgsUsed;
+
+  // Iterate over the output types and identify if there is an aggregate pointer
+  // type whose base type matches the current output type. If there is, we mark
+  // that we will use this output register for this value. If not we add another
+  // type to the overall argument type list. We also store the GVNs used for
+  // stores to identify which values will need to be moved into an special
+  // block that holds the stores to the output registers.
+  for (Value *Output : Outputs) {
+    TypeFound = false;
+    // We can do this since it is a result value, and will have a number
+    // that is necessarily the same. BUT if in the future, the instructions
+    // do not have to be in same order, but are functionally the same, we will
+    // have to use a different scheme, as one-to-one correspondence is not
+    // guaranteed.
+    unsigned GlobalValue = C.getGVN(Output).getValue();
+    unsigned ArgumentSize = Group.ArgumentTypes.size();
+
+    for (unsigned Jdx = TypeIndex; Jdx < ArgumentSize; Jdx++) {
+      if (Group.ArgumentTypes[Jdx] != PointerType::getUnqual(Output->getType()))
+        continue;
+
+      if (AggArgsUsed.find(Jdx) != AggArgsUsed.end())
+        continue;
+
+      TypeFound = true;
+      AggArgsUsed.insert(Jdx);
+      Region.ExtractedArgToAgg.insert(std::make_pair(OriginalIndex, Jdx));
+      Region.AggArgToExtracted.insert(std::make_pair(Jdx, OriginalIndex));
+      Region.GVNStores.push_back(GlobalValue);
+      break;
+    }
+
+    // We were unable to find an unused type in the output type set that matches
+    // the output, so we add a pointer type to the argument types of the overall
+    // function to handle this output and create a mapping to it.
+    if (!TypeFound) {
+      Group.ArgumentTypes.push_back(PointerType::getUnqual(Output->getType()));
+      AggArgsUsed.insert(Group.ArgumentTypes.size() - 1);
+      Region.ExtractedArgToAgg.insert(
+          std::make_pair(OriginalIndex, Group.ArgumentTypes.size() - 1));
+      Region.AggArgToExtracted.insert(
+          std::make_pair(Group.ArgumentTypes.size() - 1, OriginalIndex));
+      Region.GVNStores.push_back(GlobalValue);
+    }
+
+    stable_sort(Region.GVNStores);
+    OriginalIndex++;
+    TypeIndex++;
+  }
+}
+
 void IROutliner::findAddInputsOutputs(Module &M, OutlinableRegion &Region,
                                       DenseSet<unsigned> &NotSame) {
   std::vector<unsigned> Inputs;
-  SetVector<Value *> ArgInputs;
+  SetVector<Value *> ArgInputs, Outputs;
 
-  getCodeExtractorArguments(Region, Inputs, NotSame, ArgInputs);
+  getCodeExtractorArguments(Region, Inputs, NotSame, OutputMappings, ArgInputs,
+                            Outputs);
 
   if (Region.IgnoreRegion)
     return;
@@ -507,6 +613,10 @@ void IROutliner::findAddInputsOutputs(Module &M, OutlinableRegion &Region,
   // Map the inputs found by the CodeExtractor to the arguments found for
   // the overall function.
   findExtractedInputToOverallInputMapping(Region, Inputs, ArgInputs);
+
+  // Map the outputs found by the CodeExtractor to the arguments found for
+  // the overall function.
+  findExtractedOutputToOverallOutputMapping(Region, Outputs.getArrayRef());
 }
 
 /// Replace the extracted function in the Region with a call to the overall
@@ -544,6 +654,18 @@ CallInst *replaceCalledFunction(Module &M, OutlinableRegion &Region) {
   // new argument list.
   for (unsigned AggArgIdx = 0; AggArgIdx < AggFunc->arg_size(); AggArgIdx++) {
 
+    if (AggArgIdx == AggFunc->arg_size() - 1 &&
+        Group.ArgumentTypes.size() > Group.NumAggregateInputs) {
+      // If we are on the last argument, and we need to differentiate between
+      // output blocks, add an integer to the argument list to determine
+      // what block to take
+      LLVM_DEBUG(dbgs() << "Set switch block argument to "
+                        << Region.OutputBlockNum << "\n");
+      NewCallArgs.push_back(ConstantInt::get(Type::getInt32Ty(M.getContext()),
+                                             Region.OutputBlockNum));
+      continue;
+    }
+
     ArgPair = Region.AggArgToExtracted.find(AggArgIdx);
     if (ArgPair != Region.AggArgToExtracted.end()) {
       Value *ArgumentValue = Call->getArgOperand(ArgPair->second);
@@ -603,8 +725,11 @@ CallInst *replaceCalledFunction(Module &M, OutlinableRegion &Region) {
 // Within an extracted function, replace the argument uses of the extracted
 // region with the arguments of the function for an OutlinableGroup.
 //
-// \param OS [in] - The region of extracted code to be changed.
-static void replaceArgumentUses(OutlinableRegion &Region) {
+/// \param [in] Region - The region of extracted code to be changed.
+/// \param [in,out] OutputBB - The BasicBlock for the output stores for this
+/// region.
+static void replaceArgumentUses(OutlinableRegion &Region,
+                                BasicBlock *OutputBB) {
   OutlinableGroup &Group = *Region.Parent;
   assert(Region.ExtractedFunction && "Region has no extracted function?");
 
@@ -618,7 +743,29 @@ static void replaceArgumentUses(OutlinableRegion &Region) {
     Argument *Arg = Region.ExtractedFunction->getArg(ArgIdx);
     // The argument is an input, so we can simply replace it with the overall
     // argument value
-    LLVM_DEBUG(dbgs() << "Replacing uses of input " << *Arg << " in function "
+    if (ArgIdx < Region.NumExtractedInputs) {
+      LLVM_DEBUG(dbgs() << "Replacing uses of input " << *Arg << " in function "
+                        << *Region.ExtractedFunction << " with " << *AggArg
+                        << " in function " << *Group.OutlinedFunction << "\n");
+      Arg->replaceAllUsesWith(AggArg);
+      continue;
+    }
+
+    // If we are replacing an output, we place the store value in its own
+    // block inside the overall function before replacing the use of the output
+    // in the function.
+    assert(Arg->hasOneUse() && "Output argument can only have one use");
+    User *InstAsUser = Arg->user_back();
+    assert(InstAsUser && "User is nullptr!");
+
+    Instruction *I = cast<Instruction>(InstAsUser);
+    I->setDebugLoc(DebugLoc());
+    LLVM_DEBUG(dbgs() << "Move store for instruction " << *I << " to "
+                      << *OutputBB << "\n");
+
+    I->moveBefore(*OutputBB, OutputBB->end());
+
+    LLVM_DEBUG(dbgs() << "Replacing uses of output " << *Arg << " in function "
                       << *Region.ExtractedFunction << " with " << *AggArg
                       << " in function " << *Group.OutlinedFunction << "\n");
     Arg->replaceAllUsesWith(AggArg);
@@ -656,38 +803,181 @@ void replaceConstants(OutlinableRegion &Region) {
   }
 }
 
+/// For the given function, find all the nondebug or lifetime instructions,
+/// and return them as a vector. Exclude any blocks in \p ExludeBlocks.
+///
+/// \param [in] F - The function we collect the instructions from.
+/// \param [in] ExcludeBlocks - BasicBlocks to ignore.
+/// \returns the list of instructions extracted.
+static std::vector<Instruction *>
+collectRelevantInstructions(Function &F,
+                            DenseSet<BasicBlock *> &ExcludeBlocks) {
+  std::vector<Instruction *> RelevantInstructions;
+
+  for (BasicBlock &BB : F) {
+    if (ExcludeBlocks.find(&BB) != ExcludeBlocks.end())
+      continue;
+
+    for (Instruction &Inst : BB) {
+      if (Inst.isLifetimeStartOrEnd())
+        continue;
+      if (isa<DbgInfoIntrinsic>(Inst))
+        continue;
+
+      RelevantInstructions.push_back(&Inst);
+    }
+  }
+
+  return RelevantInstructions;
+}
+
+/// For the outlined section, move needed the StoreInsts for the output
+/// registers into their own block.  Then, determine if there is a duplicate
+/// output block already created.
+///
+/// \param [in] OG - The OutlinableGroup of regions to be outlined.
+/// \param [in] Region - The OutlinableRegion that is being analyzed.
+/// \param [in,out] OutputBB - the block that stores for this region will be
+/// placed in.
+/// \param [in] EndBB - the final block of the extracted function.
+/// \param [in] OutputMappings - OutputMappings the mapping of values that have
+/// been replaced by a new output value.
+/// \param [in,out] OutputStoreBBs - The existing output blocks.
+static void
+alignOutputBlockWithAggFunc(OutlinableGroup &OG, OutlinableRegion &Region,
+                            BasicBlock *OutputBB, BasicBlock *EndBB,
+                            const DenseMap<Value *, Value *> &OutputMappings,
+                            std::vector<BasicBlock *> &OutputStoreBBs) {
+  DenseSet<unsigned> ValuesToFind(Region.GVNStores.begin(),
+                                  Region.GVNStores.end());
+
+  // We iterate over the instructions in the extracted function, and find the
+  // global value number of the instructions.  If we find a value that should
+  // be contained in a store, we replace the uses of the value with the value
+  // from the overall function, so that the store is storing the correct
+  // value from the overall function.
+
+  DenseSet<BasicBlock *> ExcludeBBs(OutputStoreBBs.begin(),
+                                    OutputStoreBBs.end());
+  std::vector<Instruction *> ExtractedFunctionInsts =
+      collectRelevantInstructions(*(Region.ExtractedFunction), ExcludeBBs);
+  std::vector<Instruction *> OverallFunctionInsts =
+      collectRelevantInstructions(*OG.OutlinedFunction, ExcludeBBs);
+
+  assert(ExtractedFunctionInsts.size() == OverallFunctionInsts.size() &&
+         "Number of relevant instructions not equal!");
+
+  unsigned NumInstructions = ExtractedFunctionInsts.size();
+  for (unsigned Idx = 0; Idx < NumInstructions; Idx++) {
+    Value *V = ExtractedFunctionInsts[Idx];
+
+    if (OutputMappings.find(V) != OutputMappings.end())
+      V = OutputMappings.find(V)->second;
+    Optional<unsigned> GVN = Region.Candidate->getGVN(V);
+
+    // If we have found one of the stored values for output, replace the value
+    // with the corresponding one from the overall function.
+    if (GVN.hasValue() &&
+        ValuesToFind.find(GVN.getValue()) != ValuesToFind.end()) {
+      ValuesToFind.erase(GVN.getValue());
+      V->replaceAllUsesWith(OverallFunctionInsts[Idx]);
+      if (ValuesToFind.size() == 0)
+        break;
+    }
+
+    if (ValuesToFind.size() == 0)
+      break;
+  }
+
+  assert(ValuesToFind.size() == 0 && "Not all store values were handled!");
+}
+
+/// Create the switch statement for outlined function to differentiate between
+/// all the output blocks.
+///
+/// For the outlined section, determine if an outlined block already exists that
+/// matches the needed stores for the extracted section.
+/// \param [in] M - The module we are outlining from.
+/// \param [in] OG - The group of regions to be outlined.
+/// \param [in] OS - The region that is being analyzed.
+/// \param [in] EndBB - The final block of the extracted function.
+/// \param [in,out] OutputStoreBBs - The existing output blocks.
+void createSwitchStatement(Module &M, OutlinableGroup &OG, BasicBlock *EndBB,
+                           ArrayRef<BasicBlock *> OutputStoreBBs) {
+  Function *AggFunc = OG.OutlinedFunction;
+  // Create a final block
+  BasicBlock *ReturnBlock =
+      BasicBlock::Create(M.getContext(), "final_block", AggFunc);
+  Instruction *Term = EndBB->getTerminator();
+  Term->moveBefore(*ReturnBlock, ReturnBlock->end());
+  // Put the switch statement in the old end basic block for the function with
+  // a fall through to the new return block
+  LLVM_DEBUG(dbgs() << "Create switch statement in " << *AggFunc << " for "
+                    << OutputStoreBBs.size() << "\n");
+  SwitchInst *SwitchI =
+      SwitchInst::Create(AggFunc->getArg(AggFunc->arg_size() - 1), ReturnBlock,
+                         OutputStoreBBs.size(), EndBB);
+
+  unsigned Idx = 0;
+  for (BasicBlock *BB : OutputStoreBBs) {
+    SwitchI->addCase(ConstantInt::get(Type::getInt32Ty(M.getContext()), Idx),
+                     BB);
+    Term = BB->getTerminator();
+    Term->setSuccessor(0, ReturnBlock);
+    Idx++;
+  }
+
+  return;
+}
+
 /// Fill the new function that will serve as the replacement function for all of
 /// the extracted regions of a certain structure from the first region in the
 /// list of regions.  Replace this first region's extracted function with the
 /// new overall function.
 ///
-/// \param M [in] - The module we are outlining from.
-/// \param CurrentGroup [in] - The group of regions to be outlined.
-/// \param FuncsToRemove [in,out] - Extracted functions to erase from module
+/// \param [in] M - The module we are outlining from.
+/// \param [in] CurrentGroup - The group of regions to be outlined.
+/// \param [in,out] OutputStoreBBs - The output blocks for each different
+/// set of stores needed for the different functions.
+/// \param [in,out] FuncsToRemove - Extracted functions to erase from module
 /// once outlining is complete.
 static void fillOverallFunction(Module &M, OutlinableGroup &CurrentGroup,
+                                std::vector<BasicBlock *> &OutputStoreBBs,
                                 std::vector<Function *> &FuncsToRemove) {
   OutlinableRegion *CurrentOS = CurrentGroup.Regions[0];
 
-  // Move first extracted function's instructions into new function
+  // Move first extracted function's instructions into new function.
   LLVM_DEBUG(dbgs() << "Move instructions from "
                     << *CurrentOS->ExtractedFunction << " to instruction "
                     << *CurrentGroup.OutlinedFunction << "\n");
-  moveFunctionData(*CurrentOS->ExtractedFunction,
-                   *CurrentGroup.OutlinedFunction);
 
-  // Transfer the attributes
+  CurrentGroup.EndBB = moveFunctionData(*CurrentOS->ExtractedFunction,
+                                        *CurrentGroup.OutlinedFunction);
+
+  // Transfer the attributes from the function to the new function.
   for (Attribute A :
        CurrentOS->ExtractedFunction->getAttributes().getFnAttributes())
     CurrentGroup.OutlinedFunction->addFnAttr(A);
 
-  replaceArgumentUses(*CurrentOS);
+  // Create an output block for the first extracted function.
+  BasicBlock *NewBB = BasicBlock::Create(
+      M.getContext(), Twine("output_block_") + Twine(static_cast<unsigned>(0)),
+      CurrentGroup.OutlinedFunction);
+  CurrentOS->OutputBlockNum = 0;
+
+  replaceArgumentUses(*CurrentOS, NewBB);
   replaceConstants(*CurrentOS);
 
+  if (CurrentGroup.ArgumentTypes.size() > CurrentGroup.NumAggregateInputs) {
+    BranchInst::Create(CurrentGroup.EndBB, NewBB);
+    OutputStoreBBs.push_back(NewBB);
+  } else
+    NewBB->eraseFromParent();
+
   // Replace the call to the extracted function with the outlined function.
   CurrentOS->Call = replaceCalledFunction(M, *CurrentOS);
 
-  // We only delete the extracted funcitons at the end since we may need to
+  // We only delete the extracted functions at the end since we may need to
   // reference instructions contained in them for mapping purposes.
   FuncsToRemove.push_back(CurrentOS->ExtractedFunction);
 }
@@ -701,17 +991,35 @@ void IROutliner::deduplicateExtractedSections(
 
   OutlinableRegion *CurrentOS;
 
-  fillOverallFunction(M, CurrentGroup, FuncsToRemove);
+  fillOverallFunction(M, CurrentGroup, OutputStoreBBs, FuncsToRemove);
 
-  // Do the same for the other extracted functions
   for (unsigned Idx = 1; Idx < CurrentGroup.Regions.size(); Idx++) {
     CurrentOS = CurrentGroup.Regions[Idx];
 
-    replaceArgumentUses(*CurrentOS);
+    // Create a new BasicBlock to hold the needed store instructions.
+    BasicBlock *NewBB = BasicBlock::Create(
+        M.getContext(), "output_block_" + std::to_string(Idx),
+        CurrentGroup.OutlinedFunction);
+    replaceArgumentUses(*CurrentOS, NewBB);
+
+    if (CurrentGroup.ArgumentTypes.size() > CurrentGroup.NumAggregateInputs) {
+      BranchInst::Create(CurrentGroup.EndBB, NewBB);
+      CurrentOS->OutputBlockNum = OutputStoreBBs.size();
+      OutputStoreBBs.push_back(NewBB);
+      alignOutputBlockWithAggFunc(CurrentGroup, *CurrentOS, NewBB,
+                                  CurrentGroup.EndBB, OutputMappings,
+                                  OutputStoreBBs);
+    } else
+      NewBB->eraseFromParent();
+
     CurrentOS->Call = replaceCalledFunction(M, *CurrentOS);
     FuncsToRemove.push_back(CurrentOS->ExtractedFunction);
   }
 
+  // Create a switch statement to handle the different output schemes.
+  if (CurrentGroup.ArgumentTypes.size() > CurrentGroup.NumAggregateInputs)
+    createSwitchStatement(M, CurrentGroup, CurrentGroup.EndBB, OutputStoreBBs);
+
   OutlinedFunctionNum++;
 }
 
@@ -766,11 +1074,45 @@ void IROutliner::pruneIncompatibleRegions(
   }
 }
 
+void IROutliner::updateOutputMapping(OutlinableRegion &Region,
+                                     ArrayRef<Value *> Outputs,
+                                     LoadInst *LI) {
+  // For and load instructions following the call
+  Value *Operand = LI->getPointerOperand();
+  Optional<unsigned> OutputIdx = None;
+  // Find if the operand it is an output register.
+  for (unsigned ArgIdx = Region.NumExtractedInputs;
+       ArgIdx < Region.Call->arg_size(); ArgIdx++) {
+    if (Operand == Region.Call->getArgOperand(ArgIdx)) {
+      OutputIdx = ArgIdx - Region.NumExtractedInputs;
+      break;
+    }
+  }
+
+  // If we found an output register, place a mapping of the new value
+  // to the original in the mapping.
+  if (!OutputIdx.hasValue())
+    return;
+
+  if (OutputMappings.find(Outputs[OutputIdx.getValue()]) ==
+      OutputMappings.end()) {
+    LLVM_DEBUG(dbgs() << "Mapping extracted output " << *LI << " to "
+                      << *Outputs[OutputIdx.getValue()] << "\n");
+    OutputMappings.insert(std::make_pair(LI, Outputs[OutputIdx.getValue()]));
+  } else {
+    Value *Orig = OutputMappings.find(Outputs[OutputIdx.getValue()])->second;
+    LLVM_DEBUG(dbgs() << "Mapping extracted output " << *Orig << " to "
+                      << *Outputs[OutputIdx.getValue()] << "\n");
+    OutputMappings.insert(std::make_pair(LI, Orig));
+  }
+}
+
 bool IROutliner::extractSection(OutlinableRegion &Region) {
-  assert(Region.StartBB != nullptr &&
-         "StartBB for the OutlinableRegion is nullptr!");
-  assert(Region.FollowBB != nullptr &&
-         "StartBB for the OutlinableRegion is nullptr!");
+  SetVector<Value *> ArgInputs, Outputs, SinkCands;
+  Region.CE->findInputsOutputs(ArgInputs, Outputs, SinkCands);
+
+  assert(Region.StartBB && "StartBB for the OutlinableRegion is nullptr!");
+  assert(Region.FollowBB && "FollowBB for the OutlinableRegion is nullptr!");
   Function *OrigF = Region.StartBB->getParent();
   CodeExtractorAnalysisCache CEAC(*OrigF);
   Region.ExtractedFunction = Region.CE->extractCodeRegion(CEAC);
@@ -816,17 +1158,17 @@ bool IROutliner::extractSection(OutlinableRegion &Region) {
   // Iterate over the new set of instructions to find the new call
   // instruction.
   for (Instruction &I : *RewrittenBB)
-    if (CallInst *CI = dyn_cast<CallInst>(&I))
-      if (Region.ExtractedFunction == CI->getCalledFunction()) {
+    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+      if (Region.ExtractedFunction == CI->getCalledFunction())
         Region.Call = CI;
-        break;
-      }
+    } else if (LoadInst *LI = dyn_cast<LoadInst>(&I))
+      updateOutputMapping(Region, Outputs.getArrayRef(), LI);
   Region.reattachCandidate();
   return true;
 }
 
 unsigned IROutliner::doOutline(Module &M) {
-  // Find the possibile similarity sections.
+  // Find the possible similarity sections.
   IRSimilarityIdentifier &Identifier = getIRSI(M);
   SimilarityGroupList &SimilarityCandidates = *Identifier.getSimilarity();
 
@@ -886,6 +1228,15 @@ unsigned IROutliner::doOutline(Module &M) {
 
     CurrentGroup.Regions = std::move(OutlinedRegions);
 
+    if (CurrentGroup.Regions.empty())
+      continue;
+
+    // We are adding an extracted argument to decide between which output path
+    // to use in the basic block.  It is used in a switch statement and only
+    // needs to be an integer.
+    if (CurrentGroup.ArgumentTypes.size() > CurrentGroup.NumAggregateInputs)
+      CurrentGroup.ArgumentTypes.push_back(Type::getInt32Ty(M.getContext()));
+
     // Create functions out of all the sections, and mark them as outlined.
     OutlinedRegions.clear();
     for (OutlinableRegion *OS : CurrentGroup.Regions) {

diff  --git a/llvm/test/Transforms/IROutliner/extraction.ll b/llvm/test/Transforms/IROutliner/extraction.ll
index 2db261ba0845..22d7aa54228b 100644
--- a/llvm/test/Transforms/IROutliner/extraction.ll
+++ b/llvm/test/Transforms/IROutliner/extraction.ll
@@ -10,7 +10,7 @@ define void @extract1() {
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[C:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[C]])
+; CHECK-NEXT:    call void @outlined_ir_func_1(i32* [[A]], i32* [[B]], i32* [[C]])
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -32,7 +32,7 @@ define void @extract2() {
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[C:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[C]])
+; CHECK-NEXT:    call void @outlined_ir_func_1(i32* [[A]], i32* [[B]], i32* [[C]])
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -53,19 +53,23 @@ entry:
 define void @extract_outs1() #0 {
 ; CHECK-LABEL: @extract_outs1(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTLOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[ADD_LOC:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[OUTPUT:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[RESULT:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 2, i32* [[A]], align 4
-; CHECK-NEXT:    store i32 3, i32* [[B]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[B]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[OUTPUT]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[OUTPUT]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[OUTPUT]], align 4
-; CHECK-NEXT:    call void @outlined_ir_func_1(i32 [[TMP2]], i32 [[ADD]], i32* [[RESULT]])
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[ADD_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[LT_CAST1:%.*]] = bitcast i32* [[DOTLOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[ADD_LOC]], i32* [[DOTLOC]], i32 0)
+; CHECK-NEXT:    [[ADD_RELOAD:%.*]] = load i32, i32* [[ADD_LOC]], align 4
+; CHECK-NEXT:    [[DOTRELOAD:%.*]] = load i32, i32* [[DOTLOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[OUTPUT]], align 4
+; CHECK-NEXT:    call void @outlined_ir_func_2(i32 [[DOTRELOAD]], i32 [[ADD_RELOAD]], i32* [[RESULT]])
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -91,18 +95,22 @@ entry:
 define void @extract_outs2() #0 {
 ; CHECK-LABEL: @extract_outs2(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTLOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[ADD_LOC:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[OUTPUT:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[RESULT:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 2, i32* [[A]], align 4
-; CHECK-NEXT:    store i32 3, i32* [[B]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[B]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    store i32 [[ADD]], i32* [[OUTPUT]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[OUTPUT]], align 4
-; CHECK-NEXT:    call void @outlined_ir_func_1(i32 [[TMP2]], i32 [[ADD]], i32* [[RESULT]])
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[ADD_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[LT_CAST1:%.*]] = bitcast i32* [[DOTLOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[ADD_LOC]], i32* [[DOTLOC]], i32 1)
+; CHECK-NEXT:    [[ADD_RELOAD:%.*]] = load i32, i32* [[ADD_LOC]], align 4
+; CHECK-NEXT:    [[DOTRELOAD:%.*]] = load i32, i32* [[DOTLOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    call void @outlined_ir_func_2(i32 [[DOTRELOAD]], i32 [[ADD_RELOAD]], i32* [[RESULT]])
 ; CHECK-NEXT:    ret void
 ;
 entry:

diff  --git a/llvm/test/Transforms/IROutliner/illegal-assumes.ll b/llvm/test/Transforms/IROutliner/illegal-assumes.ll
index a40162216aee..e36d852a91e0 100644
--- a/llvm/test/Transforms/IROutliner/illegal-assumes.ll
+++ b/llvm/test/Transforms/IROutliner/illegal-assumes.ll
@@ -7,15 +7,19 @@
 define void @outline_assumes() {
 ; CHECK-LABEL: @outline_assumes(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DL_LOC:%.*]] = alloca i1, align 1
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[C:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[D:%.*]] = alloca i1, align 4
-; CHECK-NEXT:    store i1 true, i1* [[D]], align 4
-; CHECK-NEXT:    [[DL:%.*]] = load i1, i1* [[D]], align 1
-; CHECK-NEXT:    [[SPLIT_INST:%.*]] = sub i1 [[DL]], [[DL]]
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i1* [[DL_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @outlined_ir_func_3(i1 true, i1* [[D]], i1* [[DL_LOC]], i32 0)
+; CHECK-NEXT:    [[DL_RELOAD:%.*]] = load i1, i1* [[DL_LOC]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[SPLIT_INST:%.*]] = sub i1 [[DL_RELOAD]], [[DL_RELOAD]]
 ; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[C]])
-; CHECK-NEXT:    call void @llvm.assume(i1 [[DL]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[DL_RELOAD]])
 ; CHECK-NEXT:    call void @outlined_ir_func_1(i32* [[A]], i32* [[B]], i32* [[C]])
 ; CHECK-NEXT:    ret void
 ;
@@ -40,14 +44,18 @@ entry:
 define void @outline_assumes2() {
 ; CHECK-LABEL: @outline_assumes2(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DL_LOC:%.*]] = alloca i1, align 1
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[C:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[D:%.*]] = alloca i1, align 4
-; CHECK-NEXT:    store i1 false, i1* [[D]], align 4
-; CHECK-NEXT:    [[DL:%.*]] = load i1, i1* [[D]], align 1
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i1* [[DL_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @outlined_ir_func_3(i1 false, i1* [[D]], i1* [[DL_LOC]], i32 1)
+; CHECK-NEXT:    [[DL_RELOAD:%.*]] = load i1, i1* [[DL_LOC]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
 ; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[C]])
-; CHECK-NEXT:    call void @llvm.assume(i1 [[DL]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[DL_RELOAD]])
 ; CHECK-NEXT:    call void @outlined_ir_func_1(i32* [[A]], i32* [[B]], i32* [[C]])
 ; CHECK-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/IROutliner/illegal-memcpy.ll b/llvm/test/Transforms/IROutliner/illegal-memcpy.ll
index ebae28dee89e..e59de1e5c99b 100644
--- a/llvm/test/Transforms/IROutliner/illegal-memcpy.ll
+++ b/llvm/test/Transforms/IROutliner/illegal-memcpy.ll
@@ -9,12 +9,22 @@ declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture r
 define i8 @function1(i8* noalias %s, i8* noalias %d, i64 %len) {
 ; CHECK-LABEL: @function1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = load i8, i8* [[S:%.*]], align 1
-; CHECK-NEXT:    [[B:%.*]] = load i8, i8* [[D:%.*]], align 1
+; CHECK-NEXT:    [[B_LOC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[A_LOC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[RET_LOC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[A_LOC]])
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[B_LOC]])
+; CHECK-NEXT:    call void @outlined_ir_func_1(i8* [[S:%.*]], i8* [[D:%.*]], i8* [[A_LOC]], i8* [[B_LOC]], i32 0)
+; CHECK-NEXT:    [[A_RELOAD:%.*]] = load i8, i8* [[A_LOC]], align 1
+; CHECK-NEXT:    [[B_RELOAD:%.*]] = load i8, i8* [[B_LOC]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[A_LOC]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[B_LOC]])
 ; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[D]], i8* [[S]], i64 [[LEN:%.*]], i1 false)
-; CHECK-NEXT:    [[C:%.*]] = add i8 [[A]], [[B]]
-; CHECK-NEXT:    [[RET:%.*]] = load i8, i8* [[S]], align 1
-; CHECK-NEXT:    ret i8 [[RET]]
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[RET_LOC]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i8 [[A_RELOAD]], i8 [[B_RELOAD]], i8* [[S]], i8* [[RET_LOC]], i32 0)
+; CHECK-NEXT:    [[RET_RELOAD:%.*]] = load i8, i8* [[RET_LOC]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[RET_LOC]])
+; CHECK-NEXT:    ret i8 [[RET_RELOAD]]
 ;
 entry:
   %a = load i8, i8* %s
@@ -28,12 +38,22 @@ entry:
 define i8 @function2(i8* noalias %s, i8* noalias %d, i64 %len) {
 ; CHECK-LABEL: @function2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = load i8, i8* [[S:%.*]], align 1
-; CHECK-NEXT:    [[B:%.*]] = load i8, i8* [[D:%.*]], align 1
+; CHECK-NEXT:    [[B_LOC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[A_LOC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[RET_LOC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[A_LOC]])
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[B_LOC]])
+; CHECK-NEXT:    call void @outlined_ir_func_1(i8* [[S:%.*]], i8* [[D:%.*]], i8* [[A_LOC]], i8* [[B_LOC]], i32 1)
+; CHECK-NEXT:    [[A_RELOAD:%.*]] = load i8, i8* [[A_LOC]], align 1
+; CHECK-NEXT:    [[B_RELOAD:%.*]] = load i8, i8* [[B_LOC]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[A_LOC]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[B_LOC]])
 ; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[D]], i8* [[S]], i64 [[LEN:%.*]], i1 false)
-; CHECK-NEXT:    [[C:%.*]] = add i8 [[A]], [[B]]
-; CHECK-NEXT:    [[RET:%.*]] = load i8, i8* [[S]], align 1
-; CHECK-NEXT:    ret i8 [[RET]]
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[RET_LOC]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i8 [[A_RELOAD]], i8 [[B_RELOAD]], i8* [[S]], i8* [[RET_LOC]], i32 1)
+; CHECK-NEXT:    [[RET_RELOAD:%.*]] = load i8, i8* [[RET_LOC]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[RET_LOC]])
+; CHECK-NEXT:    ret i8 [[RET_RELOAD]]
 ;
 entry:
   %a = load i8, i8* %s

diff  --git a/llvm/test/Transforms/IROutliner/illegal-memmove.ll b/llvm/test/Transforms/IROutliner/illegal-memmove.ll
index 740fcfbda256..aa2863b24f30 100644
--- a/llvm/test/Transforms/IROutliner/illegal-memmove.ll
+++ b/llvm/test/Transforms/IROutliner/illegal-memmove.ll
@@ -9,12 +9,22 @@ declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture
 define i8 @function1(i8* noalias %s, i8* noalias %d, i64 %len) {
 ; CHECK-LABEL: @function1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = load i8, i8* [[S:%.*]], align 1
-; CHECK-NEXT:    [[B:%.*]] = load i8, i8* [[D:%.*]], align 1
+; CHECK-NEXT:    [[B_LOC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[A_LOC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[RET_LOC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[A_LOC]])
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[B_LOC]])
+; CHECK-NEXT:    call void @outlined_ir_func_1(i8* [[S:%.*]], i8* [[D:%.*]], i8* [[A_LOC]], i8* [[B_LOC]], i32 0)
+; CHECK-NEXT:    [[A_RELOAD:%.*]] = load i8, i8* [[A_LOC]], align 1
+; CHECK-NEXT:    [[B_RELOAD:%.*]] = load i8, i8* [[B_LOC]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[A_LOC]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[B_LOC]])
 ; CHECK-NEXT:    call void @llvm.memmove.p0i8.p0i8.i64(i8* [[D]], i8* [[S]], i64 [[LEN:%.*]], i1 false)
-; CHECK-NEXT:    [[C:%.*]] = add i8 [[A]], [[B]]
-; CHECK-NEXT:    [[RET:%.*]] = load i8, i8* [[S]], align 1
-; CHECK-NEXT:    ret i8 [[RET]]
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[RET_LOC]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i8 [[A_RELOAD]], i8 [[B_RELOAD]], i8* [[S]], i8* [[RET_LOC]], i32 0)
+; CHECK-NEXT:    [[RET_RELOAD:%.*]] = load i8, i8* [[RET_LOC]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[RET_LOC]])
+; CHECK-NEXT:    ret i8 [[RET_RELOAD]]
 ;
 entry:
   %a = load i8, i8* %s
@@ -28,12 +38,22 @@ entry:
 define i8 @function2(i8* noalias %s, i8* noalias %d, i64 %len) {
 ; CHECK-LABEL: @function2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = load i8, i8* [[S:%.*]], align 1
-; CHECK-NEXT:    [[B:%.*]] = load i8, i8* [[D:%.*]], align 1
+; CHECK-NEXT:    [[B_LOC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[A_LOC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    [[RET_LOC:%.*]] = alloca i8, align 1
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[A_LOC]])
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[B_LOC]])
+; CHECK-NEXT:    call void @outlined_ir_func_1(i8* [[S:%.*]], i8* [[D:%.*]], i8* [[A_LOC]], i8* [[B_LOC]], i32 1)
+; CHECK-NEXT:    [[A_RELOAD:%.*]] = load i8, i8* [[A_LOC]], align 1
+; CHECK-NEXT:    [[B_RELOAD:%.*]] = load i8, i8* [[B_LOC]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[A_LOC]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[B_LOC]])
 ; CHECK-NEXT:    call void @llvm.memmove.p0i8.p0i8.i64(i8* [[D]], i8* [[S]], i64 [[LEN:%.*]], i1 false)
-; CHECK-NEXT:    [[C:%.*]] = add i8 [[A]], [[B]]
-; CHECK-NEXT:    [[RET:%.*]] = load i8, i8* [[S]], align 1
-; CHECK-NEXT:    ret i8 [[RET]]
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[RET_LOC]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i8 [[A_RELOAD]], i8 [[B_RELOAD]], i8* [[S]], i8* [[RET_LOC]], i32 1)
+; CHECK-NEXT:    [[RET_RELOAD:%.*]] = load i8, i8* [[RET_LOC]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[RET_LOC]])
+; CHECK-NEXT:    ret i8 [[RET_RELOAD]]
 ;
 entry:
   %a = load i8, i8* %s

diff  --git a/llvm/test/Transforms/IROutliner/illegal-vaarg.ll b/llvm/test/Transforms/IROutliner/illegal-vaarg.ll
index fdf03d41e3fd..100239a61f84 100644
--- a/llvm/test/Transforms/IROutliner/illegal-vaarg.ll
+++ b/llvm/test/Transforms/IROutliner/illegal-vaarg.ll
@@ -11,17 +11,20 @@ declare void @llvm.va_end(i8*)
 define i32 @func1(i32 %a, double %b, i8* %v, ...) nounwind {
 ; CHECK-LABEL: @func1(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AP1_LOC:%.*]] = alloca i8*, align 8
 ; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[B_ADDR:%.*]] = alloca double, align 8
 ; CHECK-NEXT:    [[AP:%.*]] = alloca i8*, align 4
 ; CHECK-NEXT:    [[C:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 [[A:%.*]], i32* [[A_ADDR]], align 4
-; CHECK-NEXT:    store double [[B:%.*]], double* [[B_ADDR]], align 8
-; CHECK-NEXT:    [[AP1:%.*]] = bitcast i8** [[AP]] to i8*
-; CHECK-NEXT:    call void @llvm.va_start(i8* [[AP1]])
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i8** [[AP1_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32 [[A:%.*]], i32* [[A_ADDR]], double [[B:%.*]], double* [[B_ADDR]], i8** [[AP]], i8** [[AP1_LOC]], i32 0)
+; CHECK-NEXT:    [[AP1_RELOAD:%.*]] = load i8*, i8** [[AP1_LOC]], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.va_start(i8* [[AP1_RELOAD]])
 ; CHECK-NEXT:    [[TMP0:%.*]] = va_arg i8** [[AP]], i32
-; CHECK-NEXT:    call void @llvm.va_copy(i8* [[V:%.*]], i8* [[AP1]])
-; CHECK-NEXT:    call void @llvm.va_end(i8* [[AP1]])
+; CHECK-NEXT:    call void @llvm.va_copy(i8* [[V:%.*]], i8* [[AP1_RELOAD]])
+; CHECK-NEXT:    call void @llvm.va_end(i8* [[AP1_RELOAD]])
 ; CHECK-NEXT:    store i32 [[TMP0]], i32* [[C]], align 4
 ; CHECK-NEXT:    [[TMP:%.*]] = load i32, i32* [[C]], align 4
 ; CHECK-NEXT:    ret i32 [[TMP]]
@@ -46,17 +49,20 @@ entry:
 define i32 @func2(i32 %a, double %b, i8* %v, ...) nounwind {
 ; CHECK-LABEL: @func2(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AP1_LOC:%.*]] = alloca i8*, align 8
 ; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[B_ADDR:%.*]] = alloca double, align 8
 ; CHECK-NEXT:    [[AP:%.*]] = alloca i8*, align 4
 ; CHECK-NEXT:    [[C:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store i32 [[A:%.*]], i32* [[A_ADDR]], align 4
-; CHECK-NEXT:    store double [[B:%.*]], double* [[B_ADDR]], align 8
-; CHECK-NEXT:    [[AP1:%.*]] = bitcast i8** [[AP]] to i8*
-; CHECK-NEXT:    call void @llvm.va_start(i8* [[AP1]])
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i8** [[AP1_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32 [[A:%.*]], i32* [[A_ADDR]], double [[B:%.*]], double* [[B_ADDR]], i8** [[AP]], i8** [[AP1_LOC]], i32 1)
+; CHECK-NEXT:    [[AP1_RELOAD:%.*]] = load i8*, i8** [[AP1_LOC]], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.va_start(i8* [[AP1_RELOAD]])
 ; CHECK-NEXT:    [[TMP0:%.*]] = va_arg i8** [[AP]], i32
-; CHECK-NEXT:    call void @llvm.va_copy(i8* [[V:%.*]], i8* [[AP1]])
-; CHECK-NEXT:    call void @llvm.va_end(i8* [[AP1]])
+; CHECK-NEXT:    call void @llvm.va_copy(i8* [[V:%.*]], i8* [[AP1_RELOAD]])
+; CHECK-NEXT:    call void @llvm.va_end(i8* [[AP1_RELOAD]])
 ; CHECK-NEXT:    store i32 [[TMP0]], i32* [[C]], align 4
 ; CHECK-NEXT:    [[AP2:%.*]] = bitcast i8** [[AP]] to i8*
 ; CHECK-NEXT:    [[TMP:%.*]] = load i32, i32* [[C]], align 4

diff  --git a/llvm/test/Transforms/IROutliner/outlining-different-output-blocks.ll b/llvm/test/Transforms/IROutliner/outlining-different-output-blocks.ll
new file mode 100644
index 000000000000..ddff51e8d115
--- /dev/null
+++ b/llvm/test/Transforms/IROutliner/outlining-different-output-blocks.ll
@@ -0,0 +1,110 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -iroutliner < %s | FileCheck %s
+
+; These functions are constructed slightly differently so that they require
+; different output blocks for the values used outside of the region. We are
+; checking that two output blocks are created with different values.
+
+define void @outline_outputs1() #0 {
+; CHECK-LABEL: @outline_outputs1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTLOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[ADD_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[ADD_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[LT_CAST1:%.*]] = bitcast i32* [[DOTLOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[ADD_LOC]], i32* [[DOTLOC]], i32 0)
+; CHECK-NEXT:    [[ADD_RELOAD:%.*]] = load i32, i32* [[ADD_LOC]], align 4
+; CHECK-NEXT:    [[DOTRELOAD:%.*]] = load i32, i32* [[DOTLOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[OUTPUT]], align 4
+; CHECK-NEXT:    call void @outlined_ir_func_1(i32 [[DOTRELOAD]], i32 [[ADD_RELOAD]], i32* [[RESULT]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  %output = alloca i32, align 4
+  %result = alloca i32, align 4
+  store i32 2, i32* %a, align 4
+  store i32 3, i32* %b, align 4
+  %0 = load i32, i32* %a, align 4
+  %1 = load i32, i32* %b, align 4
+  %add = add i32 %0, %1
+  %sub = sub i32 %0, %1
+  store i32 %add, i32* %output, align 4
+  %2 = load i32, i32* %output, align 4
+  %3 = load i32, i32* %output, align 4
+  %mul = mul i32 %2, %add
+  store i32 %mul, i32* %result, align 4
+  ret void
+}
+
+define void @outline_outputs2() #0 {
+; CHECK-LABEL: @outline_outputs2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTLOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[SUB_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[SUB_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[LT_CAST1:%.*]] = bitcast i32* [[DOTLOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[SUB_LOC]], i32* [[DOTLOC]], i32 1)
+; CHECK-NEXT:    [[SUB_RELOAD:%.*]] = load i32, i32* [[SUB_LOC]], align 4
+; CHECK-NEXT:    [[DOTRELOAD:%.*]] = load i32, i32* [[DOTLOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    call void @outlined_ir_func_1(i32 [[DOTRELOAD]], i32 [[SUB_RELOAD]], i32* [[RESULT]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  %output = alloca i32, align 4
+  %result = alloca i32, align 4
+  store i32 2, i32* %a, align 4
+  store i32 3, i32* %b, align 4
+  %0 = load i32, i32* %a, align 4
+  %1 = load i32, i32* %b, align 4
+  %add = add i32 %0, %1
+  %sub = sub i32 %0, %1
+  store i32 %add, i32* %output, align 4
+  %2 = load i32, i32* %output, align 4
+  %mul = mul i32 %2, %sub
+  store i32 %mul, i32* %result, align 4
+  ret void
+}
+
+; CHECK: define internal void @outlined_ir_func_0(i32* [[ARG0:%.*]], i32* [[ARG1:%.*]], i32* [[ARG2:%.*]], i32* [[ARG3:%.*]], i32* [[ARG4:%.*]], i32 [[ARG5:%.*]]) #1 {
+; CHECK: _after_outline.exitStub:
+; CHECK-NEXT:    switch i32 [[ARG5]], label [[BLOCK:%.*]] [
+; CHECK-NEXT:      i32 0, label %[[BLOCK_0:.*]]
+; CHECK-NEXT:      i32 1, label %[[BLOCK_1:.*]]
+
+; CHECK: entry_to_outline:
+; CHECK-NEXT:    store i32 2, i32* [[ARG0]], align 4
+; CHECK-NEXT:    store i32 3, i32* [[ARG1]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARG0]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARG1]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARG2]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARG2]], align 4
+
+; CHECK: [[BLOCK_0]]:
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARG3]], align 4
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[ARG4]], align 4
+
+; CHECK: [[BLOCK_1]]:
+; CHECK-NEXT:    store i32 [[SUB]], i32* [[ARG3]], align 4
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[ARG4]], align 4

diff  --git a/llvm/test/Transforms/IROutliner/outlining-remapped-outputs.ll b/llvm/test/Transforms/IROutliner/outlining-remapped-outputs.ll
new file mode 100644
index 000000000000..b9ab33721a1f
--- /dev/null
+++ b/llvm/test/Transforms/IROutliner/outlining-remapped-outputs.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -iroutliner < %s | FileCheck %s
+
+; This test checks that inputs that are replaced with the output of an outlined
+; function are still recognized as the same value.
+
+define void @outline_outputs1() #0 {
+; CHECK-LABEL: @outline_outputs1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTLOC2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[ADD2_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[DOTLOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[ADD_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[ADD_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[LT_CAST1:%.*]] = bitcast i32* [[DOTLOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32 2, i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[ADD_LOC]], i32* [[DOTLOC]], i32 0)
+; CHECK-NEXT:    [[ADD_RELOAD:%.*]] = load i32, i32* [[ADD_LOC]], align 4
+; CHECK-NEXT:    [[DOTRELOAD:%.*]] = load i32, i32* [[DOTLOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[OUTPUT]], align 4
+; CHECK-NEXT:    call void @outlined_ir_func_1(i32 [[DOTRELOAD]], i32 [[ADD_RELOAD]], i32* [[RESULT]])
+; CHECK-NEXT:    br label [[NEXT:%.*]]
+; CHECK:       next:
+; CHECK-NEXT:    [[LT_CAST4:%.*]] = bitcast i32* [[ADD2_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST4]])
+; CHECK-NEXT:    [[LT_CAST5:%.*]] = bitcast i32* [[DOTLOC2]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST5]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32 [[ADD_RELOAD]], i32* [[OUTPUT]], i32* [[RESULT]], i32* [[OUTPUT2]], i32* [[ADD2_LOC]], i32* [[DOTLOC2]], i32 1)
+; CHECK-NEXT:    [[ADD2_RELOAD:%.*]] = load i32, i32* [[ADD2_LOC]], align 4
+; CHECK-NEXT:    [[DOTRELOAD3:%.*]] = load i32, i32* [[DOTLOC2]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST4]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST5]])
+; CHECK-NEXT:    call void @outlined_ir_func_1(i32 [[DOTRELOAD3]], i32 [[ADD2_RELOAD]], i32* [[RESULT2]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  %output = alloca i32, align 4
+  %result = alloca i32, align 4
+  %output2 = alloca i32, align 4
+  %result2 = alloca i32, align 4
+  store i32 2, i32* %a, align 4
+  store i32 3, i32* %b, align 4
+  %0 = load i32, i32* %a, align 4
+  %1 = load i32, i32* %b, align 4
+  %add = add i32 %0, %1
+  store i32 %add, i32* %output, align 4
+  %2 = load i32, i32* %output, align 4
+  %3 = load i32, i32* %output, align 4
+  %mul = mul i32 %2, %add
+  store i32 %mul, i32* %result, align 4
+  br label %next
+next:
+  store i32 %add, i32* %output, align 4
+  store i32 3, i32* %result, align 4
+  %4 = load i32, i32* %output, align 4
+  %5 = load i32, i32* %result, align 4
+  %add2 = add i32 %4, %5
+  store i32 %add2, i32* %output2, align 4
+  %6 = load i32, i32* %output2, align 4
+  %mul2 = mul i32 %6, %add2
+  store i32 %mul2, i32* %result2, align 4
+  ret void
+}

diff  --git a/llvm/test/Transforms/IROutliner/outlining-same-output-blocks.ll b/llvm/test/Transforms/IROutliner/outlining-same-output-blocks.ll
new file mode 100644
index 000000000000..f4ddfcefdf7e
--- /dev/null
+++ b/llvm/test/Transforms/IROutliner/outlining-same-output-blocks.ll
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -iroutliner < %s | FileCheck %s
+
+; These functions are constructed slightly differently so that they require
+; the same output blocks for the values used outside of the region. We are
+; checking that two output blocks are created with the same store instructions.
+
+define void @outline_outputs1() #0 { ; Variant that reloads %output twice (%2 and %3) before the multiply.
+; CHECK-LABEL: @outline_outputs1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTLOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[ADD_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[ADD_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[LT_CAST1:%.*]] = bitcast i32* [[DOTLOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[ADD_LOC]], i32* [[DOTLOC]], i32 0)
+; CHECK-NEXT:    [[ADD_RELOAD:%.*]] = load i32, i32* [[ADD_LOC]], align 4
+; CHECK-NEXT:    [[DOTRELOAD:%.*]] = load i32, i32* [[DOTLOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[OUTPUT]], align 4
+; CHECK-NEXT:    call void @outlined_ir_func_1(i32 [[DOTRELOAD]], i32 [[ADD_RELOAD]], i32* [[RESULT]])
+; CHECK-NEXT:    ret void
+;
+entry: ; The expected output above calls @outlined_ir_func_0 with i32 0 as its final argument.
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  %output = alloca i32, align 4
+  %result = alloca i32, align 4
+  store i32 2, i32* %a, align 4
+  store i32 3, i32* %b, align 4
+  %0 = load i32, i32* %a, align 4
+  %1 = load i32, i32* %b, align 4
+  %add = add i32 %0, %1 ; Stored to %output and also used directly as an operand of %mul.
+  store i32 %add, i32* %output, align 4
+  %2 = load i32, i32* %output, align 4 ; This reload is the other operand of %mul.
+  %3 = load i32, i32* %output, align 4 ; Result is never used; NOTE(review): presumably the load the expected output keeps in the caller - confirm.
+  %mul = mul i32 %2, %add
+  store i32 %mul, i32* %result, align 4
+  ret void
+}
+
+define void @outline_outputs2() #0 { ; Same instruction sequence as @outline_outputs1 minus the unused second reload of %output.
+; CHECK-LABEL: @outline_outputs2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTLOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[ADD_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[ADD_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[LT_CAST1:%.*]] = bitcast i32* [[DOTLOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[ADD_LOC]], i32* [[DOTLOC]], i32 1)
+; CHECK-NEXT:    [[ADD_RELOAD:%.*]] = load i32, i32* [[ADD_LOC]], align 4
+; CHECK-NEXT:    [[DOTRELOAD:%.*]] = load i32, i32* [[DOTLOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    call void @outlined_ir_func_1(i32 [[DOTRELOAD]], i32 [[ADD_RELOAD]], i32* [[RESULT]])
+; CHECK-NEXT:    ret void
+;
+entry: ; The expected output above calls @outlined_ir_func_0 with i32 1 as its final argument.
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  %output = alloca i32, align 4
+  %result = alloca i32, align 4
+  store i32 2, i32* %a, align 4
+  store i32 3, i32* %b, align 4
+  %0 = load i32, i32* %a, align 4
+  %1 = load i32, i32* %b, align 4
+  %add = add i32 %0, %1 ; Stored to %output and also used directly as an operand of %mul.
+  store i32 %add, i32* %output, align 4
+  %2 = load i32, i32* %output, align 4 ; The only reload of %output in this function.
+  %mul = mul i32 %2, %add
+  store i32 %mul, i32* %result, align 4
+  ret void
+}
+
+; CHECK: define internal void @outlined_ir_func_0(i32* [[ARG0:%.*]], i32* [[ARG1:%.*]], i32* [[ARG2:%.*]], i32* [[ARG3:%.*]], i32* [[ARG4:%.*]], i32 [[ARG5:%.*]]) #1 {
+; CHECK: entry_after_outline.exitStub:
+; CHECK-NEXT:    switch i32 [[ARG5]], label [[BLOCK:%.*]] [
+; CHECK-NEXT:      i32 0, label %[[BLOCK_0:.*]]
+; CHECK-NEXT:      i32 1, label %[[BLOCK_1:.*]]
+
+; CHECK: entry_to_outline:
+; CHECK-NEXT:    store i32 2, i32* [[ARG0]], align 4
+; CHECK-NEXT:    store i32 3, i32* [[ARG1]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARG0]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARG1]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARG2]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[ARG2]], align 4
+
+; CHECK: [[BLOCK_0]]:
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARG3]], align 4
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[ARG4]], align 4
+
+; CHECK: [[BLOCK_1]]:
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARG3]], align 4
+; CHECK-NEXT:    store i32 [[TMP2]], i32* [[ARG4]], align 4