[llvm] dcc3e72 - [IROutliner] Allowing Phi Nodes in exit blocks

Andrew Litteken via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 25 09:34:18 PST 2022


Author: Andrew Litteken
Date: 2022-01-25T11:33:53-06:00
New Revision: dcc3e728ca018de785991d4ecb9efe4f6a18ca75

URL: https://github.com/llvm/llvm-project/commit/dcc3e728ca018de785991d4ecb9efe4f6a18ca75
DIFF: https://github.com/llvm/llvm-project/commit/dcc3e728ca018de785991d4ecb9efe4f6a18ca75.diff

LOG: [IROutliner] Allowing Phi Nodes in exit blocks

In addition to having multiple exit locations, there can be multiple blocks leading to the same exit location, which results in a potential phi node. If we find that multiple blocks within the region branch to the same block outside the region, resulting in a phi node, the code extractor pulls this phi node into the function and uses it as an output.

We make sure that this phi node is given an output slot, and that the two values are removed from the outputs if they are not used anywhere else outside of the region. Across the extracted regions, the phi nodes are combined into a single block for each potential output block, similar to the previous patch.

Reviewers: paquette

Differential Revision: https://reviews.llvm.org/D106995

Added: 
    llvm/test/Transforms/IROutliner/gvn-output-set-overload.ll
    llvm/test/Transforms/IROutliner/mismatched-phi-exits-not-in-first-outlined.ll
    llvm/test/Transforms/IROutliner/mismatched-phi-exits.ll
    llvm/test/Transforms/IROutliner/mismatched-phi-outputs-ordering.ll
    llvm/test/Transforms/IROutliner/outlining-branches-phi-nodes.ll
    llvm/test/Transforms/IROutliner/phi-nodes-output-overload.ll
    llvm/test/Transforms/IROutliner/region-inputs-in-phi-nodes.ll

Modified: 
    llvm/include/llvm/Transforms/IPO/IROutliner.h
    llvm/lib/Transforms/IPO/IROutliner.cpp
    llvm/test/Transforms/IROutliner/outlining-exits-to-phi-node.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Transforms/IPO/IROutliner.h b/llvm/include/llvm/Transforms/IPO/IROutliner.h
index 110c0b4dcf16b..dcae4454f8281 100644
--- a/llvm/include/llvm/Transforms/IPO/IROutliner.h
+++ b/llvm/include/llvm/Transforms/IPO/IROutliner.h
@@ -95,6 +95,10 @@ struct OutlinableRegion {
   /// required for the following basic blocks in this case.
   bool EndsInBranch = false;
 
+  /// The PHIBlocks with their corresponding return block based on the return
+  /// value as the key.
+  DenseMap<Value *, BasicBlock *> PHIBlocks;
+
   /// Mapping of the argument number in the deduplicated function
   /// to a given constant, which is used when creating the arguments to the call
   /// to the newly created deduplicated function.  This is handled separately
@@ -182,7 +186,14 @@ class IROutliner {
   IROutliner(function_ref<TargetTransformInfo &(Function &)> GTTI,
              function_ref<IRSimilarityIdentifier &(Module &)> GIRSI,
              function_ref<OptimizationRemarkEmitter &(Function &)> GORE)
-      : getTTI(GTTI), getIRSI(GIRSI), getORE(GORE) {}
+      : getTTI(GTTI), getIRSI(GIRSI), getORE(GORE) {
+    
+    // Check that the DenseMap implementation has not changed.
+    assert(DenseMapInfo<unsigned>::getEmptyKey() == (unsigned)-1 &&
+           "DenseMapInfo<unsigned>'s empty key isn't -1!");
+    assert(DenseMapInfo<unsigned>::getTombstoneKey() == (unsigned)-2 &&
+           "DenseMapInfo<unsigned>'s tombstone key isn't -2!");
+  }
   bool run(Module &M);
 
 private:

diff  --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp
index 928b12013a545..c3104111864a4 100644
--- a/llvm/lib/Transforms/IPO/IROutliner.cpp
+++ b/llvm/lib/Transforms/IPO/IROutliner.cpp
@@ -106,6 +106,16 @@ struct OutlinableGroup {
   /// of the region.
   unsigned BranchesToOutside = 0;
 
+  /// Tracker counting backwards from the highest unsigned value possible to
+  /// avoid conflicting with the GVNs of assigned values.  We start at -3 since
+  /// -2 and -1 are assigned by the DenseMap.
+  unsigned PHINodeGVNTracker = -3;
+
+  DenseMap<unsigned,
+           std::pair<std::pair<unsigned, unsigned>, SmallVector<unsigned, 2>>>
+      PHINodeGVNToGVNs;
+  DenseMap<hash_code, unsigned> GVNsToPHINodeGVN;
+
   /// The number of instructions that will be outlined by extracting \ref
   /// Regions.
   InstructionCost Benefit = 0;
@@ -356,6 +366,24 @@ InstructionCost OutlinableRegion::getBenefit(TargetTransformInfo &TTI) {
   return Benefit;
 }
 
+/// Check the \p OutputMappings structure for value \p Input, if it exists
+/// it has been used as an output for outlining, and has been renamed, and we
+/// return the new value, otherwise, we return the same value.
+///
+/// \param OutputMappings [in] - The mapping of values to their renamed value
+/// after being used as an output for an outlined region.
+/// \param Input [in] - The value to find the remapped value of, if it exists.
+/// \return The remapped value if it has been renamed, and the same value if has
+/// not.
+static Value *findOutputMapping(const DenseMap<Value *, Value *> OutputMappings,
+                                Value *Input) {
+  DenseMap<Value *, Value *>::const_iterator OutputMapping =
+      OutputMappings.find(Input);
+  if (OutputMapping != OutputMappings.end())
+    return OutputMapping->second;
+  return Input;
+}
+
 /// Find whether \p Region matches the global value numbering to Constant
 /// mapping found so far.
 ///
@@ -832,6 +860,209 @@ findExtractedInputToOverallInputMapping(OutlinableRegion &Region,
   Region.NumExtractedInputs = OriginalIndex;
 }
 
+/// Check if the \p V has any uses outside of the region other than \p PN.
+///
+/// \param V [in] - The value to check.
+/// \param PHILoc [in] - The location in the PHINode of \p V.
+/// \param PN [in] - The PHINode using \p V.
+/// \param Exits [in] - The potential blocks we exit to from the outlined
+/// region.
+/// \param BlocksInRegion [in] - The basic blocks contained in the region.
+/// \returns true if \p V has any use soutside its region other than \p PN.
+static bool outputHasNonPHI(Value *V, unsigned PHILoc, PHINode &PN,
+                            SmallPtrSet<BasicBlock *, 1> &Exits,
+                            DenseSet<BasicBlock *> &BlocksInRegion) {
+  // We check to see if the value is used by the PHINode from some other
+  // predecessor not included in the region.  If it is, we make sure
+  // to keep it as an output.
+  SmallVector<unsigned, 2> IncomingNumbers(PN.getNumIncomingValues());
+  std::iota(IncomingNumbers.begin(), IncomingNumbers.end(), 0);
+  if (any_of(IncomingNumbers, [PHILoc, &PN, V, &BlocksInRegion](unsigned Idx) {
+        return (Idx != PHILoc && V == PN.getIncomingValue(Idx) &&
+                !BlocksInRegion.contains(PN.getIncomingBlock(Idx)));
+      }))
+    return true;
+
+  // Check if the value is used by any other instructions outside the region.
+  return any_of(V->users(), [&Exits, &BlocksInRegion](User *U) {
+    Instruction *I = dyn_cast<Instruction>(U);
+    if (!I)
+      return false;
+
+    // If the use of the item is inside the region, we skip it.  Uses
+    // inside the region give us useful information about how the item could be
+    // used as an output.
+    BasicBlock *Parent = I->getParent();
+    if (BlocksInRegion.contains(Parent))
+      return false;
+
+    // If it's not a PHINode then we definitely know the use matters.  This
+    // output value will not completely combined with another item in a PHINode
+    // as it is directly reference by another non-phi instruction
+    if (!isa<PHINode>(I))
+      return true;
+
+    // If we have a PHINode outside one of the exit locations, then it
+    // can be considered an outside use as well.  If there is a PHINode
+    // contained in the Exit where this values use matters, it will be
+    // caught when we analyze that PHINode.
+    if (!Exits.contains(Parent))
+      return true;
+
+    return false;
+  });
+}
+
+/// Test whether \p CurrentExitFromRegion contains any PhiNodes that should be
+/// considered outputs. A PHINodes is an output when more than one incoming
+/// value has been marked by the CodeExtractor as an output.
+///
+/// \param CurrentExitFromRegion [in] - The block to analyze.
+/// \param PotentialExitsFromRegion [in] - The potential exit blocks from the
+/// region.
+/// \param RegionBlocks [in] - The basic blocks in the region.
+/// \param Outputs [in, out] - The existing outputs for the region, we may add
+/// PHINodes to this as we find that they replace output values.
+/// \param OutputsReplacedByPHINode [out] - A set containing outputs that are
+/// totally replaced  by a PHINode.
+/// \param OutputsWithNonPhiUses [out] - A set containing outputs that are used
+/// in PHINodes, but have other uses, and should still be considered outputs.
+static void analyzeExitPHIsForOutputUses(
+    BasicBlock *CurrentExitFromRegion,
+    SmallPtrSet<BasicBlock *, 1> &PotentialExitsFromRegion,
+    DenseSet<BasicBlock *> &RegionBlocks, SetVector<Value *> &Outputs,
+    DenseSet<Value *> &OutputsReplacedByPHINode,
+    DenseSet<Value *> &OutputsWithNonPhiUses) {
+  for (PHINode &PN : CurrentExitFromRegion->phis()) {
+    // Find all incoming values from the outlining region.
+    SmallVector<unsigned, 2> IncomingVals;
+    for (unsigned I = 0, E = PN.getNumIncomingValues(); I < E; ++I)
+      if (RegionBlocks.contains(PN.getIncomingBlock(I)))
+        IncomingVals.push_back(I);
+
+    // Do not process PHI if there are no predecessors from region.
+    unsigned NumIncomingVals = IncomingVals.size();
+    if (NumIncomingVals == 0)
+      continue;
+
+    // If there is one predecessor, we mark it as a value that needs to be kept
+    // as an output.
+    if (NumIncomingVals == 1) {
+      Value *V = PN.getIncomingValue(*IncomingVals.begin());
+      OutputsWithNonPhiUses.insert(V);
+      OutputsReplacedByPHINode.erase(V);
+      continue;
+    }
+
+    // This PHINode will be used as an output value, so we add it to our list.
+    Outputs.insert(&PN);
+
+    // Not all of the incoming values should be ignored as other inputs and
+    // outputs may have uses in outlined region.  If they have other uses
+    // outside of the single PHINode we should not skip over it.
+    for (unsigned Idx : IncomingVals) {
+      Value *V = PN.getIncomingValue(Idx);
+      if (outputHasNonPHI(V, Idx, PN, PotentialExitsFromRegion, RegionBlocks)) {
+        OutputsWithNonPhiUses.insert(V);
+        OutputsReplacedByPHINode.erase(V);
+        continue;
+      }
+      if (!OutputsWithNonPhiUses.contains(V))
+        OutputsReplacedByPHINode.insert(V);
+    }
+  }
+}
+
+// Represents the type for the unsigned number denoting the output number for
+// phi node, along with the canonical number for the exit block.
+using ArgLocWithBBCanon = std::pair<unsigned, unsigned>;
+// The list of canonical numbers for the incoming values to a PHINode.
+using CanonList = SmallVector<unsigned, 2>;
+// The pair type representing the set of canonical values being combined in the
+// PHINode, along with the location data for the PHINode.
+using PHINodeData = std::pair<ArgLocWithBBCanon, CanonList>;
+
+/// Encode \p PND as an integer for easy lookup based on the argument location,
+/// the parent BasicBlock canonical numbering, and the canonical numbering of
+/// the values stored in the PHINode.
+///
+/// \param PND - The data to hash.
+/// \returns The hash code of \p PND.
+static hash_code encodePHINodeData(PHINodeData &PND) {
+  return llvm::hash_combine(
+      llvm::hash_value(PND.first.first), llvm::hash_value(PND.first.second),
+      llvm::hash_combine_range(PND.second.begin(), PND.second.end()));
+}
+
+/// Create a special GVN for PHINodes that will be used outside of
+/// the region.  We create a hash code based on the Canonical number of the
+/// parent BasicBlock, the canonical numbering of the values stored in the
+/// PHINode and the aggregate argument location.  This is used to find whether
+/// this PHINode type has been given a canonical numbering already.  If not, we
+/// assign it a value and store it for later use.  The value is returned to
+/// identify 
diff erent output schemes for the set of regions.
+///
+/// \param Region - The region that \p PN is an output for.
+/// \param PN - The PHINode we are analyzing.
+/// \param AggArgIdx - The argument \p PN will be stored into.
+/// \returns An optional holding the assigned canonical number, or None if
+/// there is some attribute of the PHINode blocking it from being used.
+static Optional<unsigned> getGVNForPHINode(OutlinableRegion &Region,
+                                           PHINode *PN, unsigned AggArgIdx) {
+  OutlinableGroup &Group = *Region.Parent;
+  IRSimilarityCandidate &Cand = *Region.Candidate;
+  BasicBlock *PHIBB = PN->getParent();
+  CanonList PHIGVNs;
+  for (Value *Incoming : PN->incoming_values()) {
+    // If we cannot find a GVN, this means that the input to the PHINode is
+    // not included in the region we are trying to analyze, meaning, that if
+    // it was outlined, we would be adding an extra input.  We ignore this
+    // case for now, and so ignore the region.
+    Optional<unsigned> OGVN = Cand.getGVN(Incoming);
+    if (!OGVN.hasValue()) {
+      Region.IgnoreRegion = true;
+      return None;
+    }
+
+    // Collect the canonical numbers of the values in the PHINode.
+    unsigned GVN = OGVN.getValue();
+    OGVN = Cand.getCanonicalNum(GVN);
+    assert(OGVN.hasValue() && "No GVN found for incoming value?");
+    PHIGVNs.push_back(*OGVN);
+  }
+
+  // Now that we have the GVNs for the incoming values, we are going to combine
+  // them with the GVN of the incoming bock, and the output location of the
+  // PHINode to generate a hash value representing this instance of the PHINode.
+  DenseMap<hash_code, unsigned>::iterator GVNToPHIIt;
+  DenseMap<unsigned, PHINodeData>::iterator PHIToGVNIt;
+  Optional<unsigned> BBGVN = Cand.getGVN(PHIBB);
+  assert(BBGVN.hasValue() && "Could not find GVN for the incoming block!");
+
+  BBGVN = Cand.getCanonicalNum(BBGVN.getValue());
+  assert(BBGVN.hasValue() &&
+         "Could not find canonical number for the incoming block!");
+  // Create a pair of the exit block canonical value, and the aggregate
+  // argument location, connected to the canonical numbers stored in the
+  // PHINode.
+  PHINodeData TemporaryPair =
+      std::make_pair(std::make_pair(BBGVN.getValue(), AggArgIdx), PHIGVNs);
+  hash_code PHINodeDataHash = encodePHINodeData(TemporaryPair);
+
+  // Look for and create a new entry in our connection between canonical
+  // numbers for PHINodes, and the set of objects we just created.
+  GVNToPHIIt = Group.GVNsToPHINodeGVN.find(PHINodeDataHash);
+  if (GVNToPHIIt == Group.GVNsToPHINodeGVN.end()) {
+    bool Inserted = false;
+    std::tie(PHIToGVNIt, Inserted) = Group.PHINodeGVNToGVNs.insert(
+        std::make_pair(Group.PHINodeGVNTracker, TemporaryPair));
+    std::tie(GVNToPHIIt, Inserted) = Group.GVNsToPHINodeGVN.insert(
+        std::make_pair(PHINodeDataHash, Group.PHINodeGVNTracker--));
+  }
+
+  return GVNToPHIIt->second;
+}
+
 /// Create a mapping of the output arguments for the \p Region to the output
 /// arguments of the overall outlined function.
 ///
@@ -844,35 +1075,25 @@ findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region,
   IRSimilarityCandidate &C = *Region.Candidate;
 
   SmallVector<BasicBlock *> BE;
-  DenseSet<BasicBlock *> BBSet;
-  C.getBasicBlocks(BBSet, BE);
+  DenseSet<BasicBlock *> BlocksInRegion;
+  C.getBasicBlocks(BlocksInRegion, BE);
 
   // Find the exits to the region.
   SmallPtrSet<BasicBlock *, 1> Exits;
   for (BasicBlock *Block : BE)
     for (BasicBlock *Succ : successors(Block))
-      if (!BBSet.contains(Succ))
+      if (!BlocksInRegion.contains(Succ))
         Exits.insert(Succ);
 
   // After determining which blocks exit to PHINodes, we add these PHINodes to
   // the set of outputs to be processed.  We also check the incoming values of
   // the PHINodes for whether they should no longer be considered outputs.
-  for (BasicBlock *ExitBB : Exits) {
-    for (PHINode &PN : ExitBB->phis()) {
-      // Find all incoming values from the outlining region.
-      SmallVector<unsigned, 2> IncomingVals;
-      for (unsigned Idx = 0; Idx < PN.getNumIncomingValues(); ++Idx)
-        if (BBSet.contains(PN.getIncomingBlock(Idx)))
-          IncomingVals.push_back(Idx);
-
-      // Do not process PHI if there is one (or fewer) predecessor from region.
-      if (IncomingVals.size() <= 1)
-        continue;
-
-      Region.IgnoreRegion = true;
-      return;
-    }
-  }
+  DenseSet<Value *> OutputsReplacedByPHINode;
+  DenseSet<Value *> OutputsWithNonPhiUses;
+  for (BasicBlock *ExitBB : Exits)
+    analyzeExitPHIsForOutputUses(ExitBB, Exits, BlocksInRegion, Outputs,
+                                 OutputsReplacedByPHINode,
+                                 OutputsWithNonPhiUses);
 
   // This counts the argument number in the extracted function.
   unsigned OriginalIndex = Region.NumExtractedInputs;
@@ -895,9 +1116,13 @@ findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region,
     // do not have to be in same order, but are functionally the same, we will
     // have to use a 
diff erent scheme, as one-to-one correspondence is not
     // guaranteed.
-    unsigned GlobalValue = C.getGVN(Output).getValue();
     unsigned ArgumentSize = Group.ArgumentTypes.size();
 
+    // If the output is combined in a PHINode, we make sure to skip over it.
+    if (OutputsReplacedByPHINode.contains(Output))
+      continue;
+
+    unsigned AggArgIdx = 0;
     for (unsigned Jdx = TypeIndex; Jdx < ArgumentSize; Jdx++) {
       if (Group.ArgumentTypes[Jdx] != PointerType::getUnqual(Output->getType()))
         continue;
@@ -909,7 +1134,7 @@ findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region,
       AggArgsUsed.insert(Jdx);
       Region.ExtractedArgToAgg.insert(std::make_pair(OriginalIndex, Jdx));
       Region.AggArgToExtracted.insert(std::make_pair(Jdx, OriginalIndex));
-      Region.GVNStores.push_back(GlobalValue);
+      AggArgIdx = Jdx;
       break;
     }
 
@@ -918,18 +1143,54 @@ findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region,
     // function to handle this output and create a mapping to it.
     if (!TypeFound) {
       Group.ArgumentTypes.push_back(PointerType::getUnqual(Output->getType()));
-      AggArgsUsed.insert(Group.ArgumentTypes.size() - 1);
+      // Mark the new pointer type as the last value in the aggregate argument
+      // list.
+      unsigned ArgTypeIdx = Group.ArgumentTypes.size() - 1;
+      AggArgsUsed.insert(ArgTypeIdx);
       Region.ExtractedArgToAgg.insert(
-          std::make_pair(OriginalIndex, Group.ArgumentTypes.size() - 1));
+          std::make_pair(OriginalIndex, ArgTypeIdx));
       Region.AggArgToExtracted.insert(
-          std::make_pair(Group.ArgumentTypes.size() - 1, OriginalIndex));
-      Region.GVNStores.push_back(GlobalValue);
+          std::make_pair(ArgTypeIdx, OriginalIndex));
+      AggArgIdx = ArgTypeIdx;
+    }
+
+    // TODO: Adapt to the extra input from the PHINode.
+    PHINode *PN = dyn_cast<PHINode>(Output);
+
+    Optional<unsigned> GVN;
+    if (PN && !BlocksInRegion.contains(PN->getParent())) {
+      // Values outside the region can be combined into PHINode when we
+      // have multiple exits. We collect both of these into a list to identify
+      // which values are being used in the PHINode. Each list identifies a
+      // 
diff erent PHINode, and a 
diff erent output. We store the PHINode as it's
+      // own canonical value.  These canonical values are also dependent on the
+      // output argument it is saved to.
+
+      // If two PHINodes have the same canonical values, but 
diff erent aggregate
+      // argument locations, then they will have distinct Canonical Values.
+      GVN = getGVNForPHINode(Region, PN, AggArgIdx);
+      if (!GVN.hasValue())
+        return; 
+    } else {
+      // If we do not have a PHINode we use the global value numbering for the
+      // output value, to find the canonical number to add to the set of stored
+      // values.
+      GVN = C.getGVN(Output);
+      GVN = C.getCanonicalNum(*GVN);
     }
 
-    stable_sort(Region.GVNStores);
+    // Each region has a potentially unique set of outputs.  We save which
+    // values are output in a list of canonical values so we can 
diff erentiate
+    // among the 
diff erent store schemes.
+    Region.GVNStores.push_back(*GVN);
+
     OriginalIndex++;
     TypeIndex++;
   }
+
+  // We sort the stored values to make sure that we are not affected by analysis
+  // order when determining what combination of items were stored.
+  stable_sort(Region.GVNStores);
 }
 
 void IROutliner::findAddInputsOutputs(Module &M, OutlinableRegion &Region,
@@ -1065,6 +1326,214 @@ CallInst *replaceCalledFunction(Module &M, OutlinableRegion &Region) {
   return Call;
 }
 
+/// Find or create a BasicBlock in the outlined function containing PhiBlocks
+/// for \p RetVal.
+///
+/// \param Group - The OutlinableGroup containing the information about the
+/// overall outlined function.
+/// \param RetVal - The return value or exit option that we are currently
+/// evaluating.
+/// \returns The found or newly created BasicBlock to contain the needed
+/// PHINodes to be used as outputs.
+static BasicBlock *findOrCreatePHIBlock(OutlinableGroup &Group, Value *RetVal) {
+  DenseMap<Value *, BasicBlock *>::iterator PhiBlockForRetVal,
+      ReturnBlockForRetVal;
+  PhiBlockForRetVal = Group.PHIBlocks.find(RetVal);
+  ReturnBlockForRetVal = Group.EndBBs.find(RetVal);
+  assert(ReturnBlockForRetVal != Group.EndBBs.end() &&
+         "Could not find output value!");
+  BasicBlock *ReturnBB = ReturnBlockForRetVal->second;
+
+  // Find if a PHIBlock exists for this return value already.  If it is
+  // the first time we are analyzing this, we will not, so we record it.
+  PhiBlockForRetVal = Group.PHIBlocks.find(RetVal);
+  if (PhiBlockForRetVal != Group.PHIBlocks.end())
+    return PhiBlockForRetVal->second;
+  
+  // If we did not find a block, we create one, and insert it into the
+  // overall function and record it.
+  bool Inserted = false;
+  BasicBlock *PHIBlock = BasicBlock::Create(ReturnBB->getContext(), "phi_block",
+                                            ReturnBB->getParent());
+  std::tie(PhiBlockForRetVal, Inserted) =
+      Group.PHIBlocks.insert(std::make_pair(RetVal, PHIBlock));
+
+  // We find the predecessors of the return block in the newly created outlined
+  // function in order to point them to the new PHIBlock rather than the already
+  // existing return block.
+  SmallVector<BranchInst *, 2> BranchesToChange;
+  for (BasicBlock *Pred : predecessors(ReturnBB))
+    BranchesToChange.push_back(cast<BranchInst>(Pred->getTerminator()));
+
+  // Now we mark the branch instructions found, and change the references of the
+  // return block to the newly created PHIBlock.
+  for (BranchInst *BI : BranchesToChange)
+    for (unsigned Succ = 0, End = BI->getNumSuccessors(); Succ < End; Succ++) {
+      if (BI->getSuccessor(Succ) != ReturnBB)
+        continue;
+      BI->setSuccessor(Succ, PHIBlock);
+    }
+
+  BranchInst::Create(ReturnBB, PHIBlock);
+
+  return PhiBlockForRetVal->second;
+}
+
+/// For the function call now representing the \p Region, find the passed value
+/// to that call that represents Argument \p A at the call location if the
+/// call has already been replaced with a call to the  overall, aggregate
+/// function.
+///
+/// \param A - The Argument to get the passed value for.
+/// \param Region - The extracted Region corresponding to the outlined function.
+/// \returns The Value representing \p A at the call site.
+static Value *
+getPassedArgumentInAlreadyOutlinedFunction(const Argument *A,
+                                           const OutlinableRegion &Region) {
+  // If we don't need to adjust the argument number at all (since the call
+  // has already been replaced by a call to the overall outlined function)
+  // we can just get the specified argument.
+  return Region.Call->getArgOperand(A->getArgNo());
+}
+
+/// For the function call now representing the \p Region, find the passed value
+/// to that call that represents Argument \p A at the call location if the
+/// call has only been replaced by the call to the aggregate function.
+///
+/// \param A - The Argument to get the passed value for.
+/// \param Region - The extracted Region corresponding to the outlined function.
+/// \returns The Value representing \p A at the call site.
+static Value *
+getPassedArgumentAndAdjustArgumentLocation(const Argument *A,
+                                           const OutlinableRegion &Region) {
+  unsigned ArgNum = A->getArgNo();
+  
+  // If it is a constant, we can look at our mapping from when we created
+  // the outputs to figure out what the constant value is.
+  if (Region.AggArgToConstant.count(ArgNum))
+    return Region.AggArgToConstant.find(ArgNum)->second;
+  
+  // If it is not a constant, and we are not looking at the overall function, we
+  // need to adjust which argument we are looking at.
+  ArgNum = Region.AggArgToExtracted.find(ArgNum)->second;
+  return Region.Call->getArgOperand(ArgNum);
+}
+
+/// Find the canonical numbering for the incoming Values into the PHINode \p PN.
+///
+/// \param PN [in] - The PHINode that we are finding the canonical numbers for.
+/// \param Region [in] - The OutlinableRegion containing \p PN.
+/// \param OutputMappings [in] - The mapping of output values from outlined
+/// region to their original values.
+/// \param CanonNums [out] - The canonical numbering for the incoming values to
+/// \p PN.
+/// \param ReplacedWithOutlinedCall - A flag to use the extracted function call
+/// of \p Region rather than the overall function's call.
+static void
+findCanonNumsForPHI(PHINode *PN, OutlinableRegion &Region,
+                    const DenseMap<Value *, Value *> &OutputMappings,
+                    DenseSet<unsigned> &CanonNums,
+                    bool ReplacedWithOutlinedCall = true) {
+  // Iterate over the incoming values.
+  for (unsigned Idx = 0, EIdx = PN->getNumIncomingValues(); Idx < EIdx; Idx++) {
+    Value *IVal = PN->getIncomingValue(Idx);
+    // If we have an argument as incoming value, we need to grab the passed
+    // value from the call itself.
+    if (Argument *A = dyn_cast<Argument>(IVal)) {
+      if (ReplacedWithOutlinedCall)
+        IVal = getPassedArgumentInAlreadyOutlinedFunction(A, Region);
+      else
+        IVal = getPassedArgumentAndAdjustArgumentLocation(A, Region);
+    }
+
+    // Get the original value if it has been replaced by an output value.
+    IVal = findOutputMapping(OutputMappings, IVal);
+
+    // Find and add the canonical number for the incoming value.
+    Optional<unsigned> GVN = Region.Candidate->getGVN(IVal);
+    assert(GVN.hasValue() && "No GVN for incoming value");
+    Optional<unsigned> CanonNum = Region.Candidate->getCanonicalNum(*GVN);
+    assert(CanonNum.hasValue() && "No Canonical Number for GVN");
+    CanonNums.insert(*CanonNum);
+  }
+}
+
+/// Find, or add PHINode \p PN to the combined PHINode Block \p OverallPHIBlock
+/// in order to condense the number of instructions added to the outlined
+/// function.
+///
+/// \param PN [in] - The PHINode that we are finding the canonical numbers for.
+/// \param Region [in] - The OutlinableRegion containing \p PN. 
+/// \param OverallPhiBlock [in] - The overall PHIBlock we are trying to find
+/// \p PN in.
+/// \param OutputMappings [in] - The mapping of output values from outlined
+/// region to their original values.
+/// \return the newly found or created PHINode in \p OverallPhiBlock.
+static PHINode*
+findOrCreatePHIInBlock(PHINode &PN, OutlinableRegion &Region,
+                       BasicBlock *OverallPhiBlock,
+                       const DenseMap<Value *, Value *> &OutputMappings) {
+  OutlinableGroup &Group = *Region.Parent;
+  
+  DenseSet<unsigned> PNCanonNums;
+  // We have to use the extracted function since we have merged this region into
+  // the overall function yet.  We make sure to reassign the argument numbering
+  // since it is possible that the argument ordering is 
diff erent between the
+  // functions.
+  findCanonNumsForPHI(&PN, Region, OutputMappings, PNCanonNums,
+                      /* ReplacedWithOutlinedCall = */ false);
+
+  OutlinableRegion *FirstRegion = Group.Regions[0];
+  DenseSet<unsigned> CurrentCanonNums;
+  // Find the Canonical Numbering for each PHINode, if it matches, we replace
+  // the uses of the PHINode we are searching for, with the found PHINode.
+  for (PHINode &CurrPN : OverallPhiBlock->phis()) {
+    CurrentCanonNums.clear();
+    findCanonNumsForPHI(&CurrPN, *FirstRegion, OutputMappings, CurrentCanonNums,
+                        /* ReplacedWithOutlinedCall = */ true);
+
+    if (all_of(PNCanonNums, [&CurrentCanonNums](unsigned CanonNum) {
+          return CurrentCanonNums.contains(CanonNum);
+        }))
+      return &CurrPN;
+  }
+
+  // If we've made it here, it means we weren't able to replace the PHINode, so
+  // we must insert it ourselves.
+  PHINode *NewPN = cast<PHINode>(PN.clone());
+  NewPN->insertBefore(&*OverallPhiBlock->begin());
+  for (unsigned Idx = 0, Edx = NewPN->getNumIncomingValues(); Idx < Edx;
+       Idx++) {
+    Value *IncomingVal = NewPN->getIncomingValue(Idx);
+    BasicBlock *IncomingBlock = NewPN->getIncomingBlock(Idx);
+
+    // Find corresponding basic block in the overall function for the incoming
+    // block.
+    Instruction *FirstNonPHI = IncomingBlock->getFirstNonPHI();
+    assert(FirstNonPHI && "Incoming block is empty?");
+    Value *CorrespondingVal =
+        Region.findCorrespondingValueIn(*FirstRegion, FirstNonPHI);
+    assert(CorrespondingVal && "Value is nullptr?");
+    BasicBlock *BlockToUse = cast<Instruction>(CorrespondingVal)->getParent();
+    NewPN->setIncomingBlock(Idx, BlockToUse);
+
+    // If we have an argument we make sure we replace using the argument from
+    // the correct function.
+    if (Argument *A = dyn_cast<Argument>(IncomingVal)) {
+      Value *Val = Group.OutlinedFunction->getArg(A->getArgNo());
+      NewPN->setIncomingValue(Idx, Val);
+      continue;
+    }
+    
+    // Find the corresponding value in the overall function.
+    IncomingVal = findOutputMapping(OutputMappings, IncomingVal);
+    Value *Val = Region.findCorrespondingValueIn(*FirstRegion, IncomingVal);
+    assert(Val && "Value is nullptr?");
+    NewPN->setIncomingValue(Idx, Val);
+  }
+  return NewPN;
+}
+
 // Within an extracted function, replace the argument uses of the extracted
 // region with the arguments of the function for an OutlinableGroup.
 //
@@ -1077,6 +1546,7 @@ CallInst *replaceCalledFunction(Module &M, OutlinableRegion &Region) {
 static void
 replaceArgumentUses(OutlinableRegion &Region,
                     DenseMap<Value *, BasicBlock *> &OutputBBs,
+                    const DenseMap<Value *, Value *> &OutputMappings,
                     bool FirstFunction = false) {
   OutlinableGroup &Group = *Region.Parent;
   assert(Region.ExtractedFunction && "Region has no extracted function?");
@@ -1146,12 +1616,46 @@ replaceArgumentUses(OutlinableRegion &Region,
       LLVM_DEBUG(dbgs() << "Move store for instruction " << *I << " to "
                         << *OutputBB << "\n");
 
-      if (FirstFunction)
+      // If this is storing a PHINode, we must make sure it is included in the
+      // overall function.
+      if (!isa<PHINode>(ValueOperand)) {
+        if (FirstFunction)
+          continue;
+        Value *CorrVal =
+            Region.findCorrespondingValueIn(*Group.Regions[0], ValueOperand);
+        assert(CorrVal && "Value is nullptr?");
+        NewI->setOperand(0, CorrVal);
+        continue;
+      }
+      PHINode *PN = cast<PHINode>(SI->getValueOperand());
+      // If it has a value, it was not split by the code extractor, which
+      // is what we are looking for.
+      if (Region.Candidate->getGVN(PN).hasValue())
         continue;
-      Value *CorrVal =
-          Region.findCorrespondingValueIn(*Group.Regions[0], ValueOperand);
-      assert(CorrVal && "Value is nullptr?");
-      NewI->setOperand(0, CorrVal);
+
+      // We record the parent block for the PHINode in the Region so that
+      // we can exclude it from checks later on.
+      Region.PHIBlocks.insert(std::make_pair(RetVal, PN->getParent()));
+
+      // If this is the first function, we do not need to worry about mergiing
+      // this with any other block in the overall outlined function, so we can
+      // just continue.
+      if (FirstFunction) {
+        BasicBlock *PHIBlock = PN->getParent();
+        Group.PHIBlocks.insert(std::make_pair(RetVal, PHIBlock));
+        continue;
+      }
+
+      // We look for the aggregate block that contains the PHINodes leading into
+      // this exit path. If we can't find one, we create one.
+      BasicBlock *OverallPhiBlock = findOrCreatePHIBlock(Group, RetVal);
+
+      // For our PHINode, we find the combined canonical numbering, and
+      // attempt to find a matching PHINode in the overall PHIBlock.  If we
+      // cannot, we copy the PHINode and move it into this new block.
+      PHINode *NewPN =
+          findOrCreatePHIInBlock(*PN, Region, OverallPhiBlock, OutputMappings);
+      NewI->setOperand(0, NewPN);
     }
 
     // If we added an edge for basic blocks without a predecessor, we remove it
@@ -1392,7 +1896,12 @@ void createSwitchStatement(
     Module &M, OutlinableGroup &OG, DenseMap<Value *, BasicBlock *> &EndBBs,
     std::vector<DenseMap<Value *, BasicBlock *>> &OutputStoreBBs) {
   // We only need the switch statement if there is more than one store
-  // combination.
+  // combination, or there is more than one set of output blocks.  The first
+  // will occur when we store 
diff erent sets of values for two 
diff erent
+  // regions.  The second will occur when we have two outputs that are combined
+  // in a PHINode outside of the region in one outlined instance, and are used
+  // seaparately in another. This will create the same set of OutputGVNs, but
+  // will generate two 
diff erent output schemes.
   if (OG.OutputGVNCombinations.size() > 1) {
     Function *AggFunc = OG.OutlinedFunction;
     // Create a final block for each 
diff erent return block.
@@ -1435,8 +1944,14 @@ void createSwitchStatement(
     return;
   }
 
+  assert(OutputStoreBBs.size() < 2 && "Different store sets not handled!");
+
   // If there needs to be stores, move them from the output blocks to their
-  // corresponding ending block.
+  // corresponding ending block.  We do not check that the OutputGVNCombinations
+  // is equal to 1 here since that could just been the case where there are 0
+  // outputs. Instead, we check whether there is more than one set of output
+  // blocks since this is the only case where we would have to move the
+  // stores, and erase the extraneous blocks.
   if (OutputStoreBBs.size() == 1) {
     LLVM_DEBUG(dbgs() << "Move store instructions to the end block in "
                       << *OG.OutlinedFunction << "\n");
@@ -1468,10 +1983,13 @@ void createSwitchStatement(
 /// set of stores needed for the 
diff erent functions.
 /// \param [in,out] FuncsToRemove - Extracted functions to erase from module
 /// once outlining is complete.
+/// \param [in] OutputMappings - Extracted functions to erase from module
+/// once outlining is complete.
 static void fillOverallFunction(
     Module &M, OutlinableGroup &CurrentGroup,
     std::vector<DenseMap<Value *, BasicBlock *>> &OutputStoreBBs,
-    std::vector<Function *> &FuncsToRemove) {
+    std::vector<Function *> &FuncsToRemove,
+    const DenseMap<Value *, Value *> &OutputMappings) {
   OutlinableRegion *CurrentOS = CurrentGroup.Regions[0];
 
   // Move first extracted function's instructions into new function.
@@ -1491,7 +2009,7 @@ static void fillOverallFunction(
                              CurrentGroup.OutlinedFunction, "output_block_0");
   CurrentOS->OutputBlockNum = 0;
 
-  replaceArgumentUses(*CurrentOS, NewBBs, true);
+  replaceArgumentUses(*CurrentOS, NewBBs, OutputMappings, true);
   replaceConstants(*CurrentOS);
 
   // We first identify if any output blocks are empty, if they are we remove
@@ -1525,7 +2043,8 @@ void IROutliner::deduplicateExtractedSections(
 
   OutlinableRegion *CurrentOS;
 
-  fillOverallFunction(M, CurrentGroup, OutputStoreBBs, FuncsToRemove);
+  fillOverallFunction(M, CurrentGroup, OutputStoreBBs, FuncsToRemove,
+                      OutputMappings);
 
   std::vector<Value *> SortedKeys;
   for (unsigned Idx = 1; Idx < CurrentGroup.Regions.size(); Idx++) {
@@ -1539,8 +2058,7 @@ void IROutliner::deduplicateExtractedSections(
     createAndInsertBasicBlocks(
         CurrentGroup.EndBBs, NewBBs, CurrentGroup.OutlinedFunction,
         "output_block_" + Twine(static_cast<unsigned>(Idx)));
-
-    replaceArgumentUses(*CurrentOS, NewBBs);
+    replaceArgumentUses(*CurrentOS, NewBBs, OutputMappings);
     alignOutputBlockWithAggFunc(CurrentGroup, *CurrentOS, NewBBs,
                                 CurrentGroup.EndBBs, OutputMappings,
                                 OutputStoreBBs);
@@ -1708,6 +2226,34 @@ IROutliner::findBenefitFromAllRegions(OutlinableGroup &CurrentGroup) {
   return RegionBenefit;
 }
 
+/// For the \p OutputCanon number passed in find the value represented by this
+/// canonical number. If it is from a PHINode, we pick the first incoming
+/// value and return that Value instead.
+///
+/// \param Region - The OutlinableRegion to get the Value from.
+/// \param OutputCanon - The canonical number to find the Value from.
+/// \returns The Value represented by a canonical number \p OutputCanon in \p
+/// Region.
+static Value *findOutputValueInRegion(OutlinableRegion &Region,
+                                      unsigned OutputCanon) {
+  OutlinableGroup &CurrentGroup = *Region.Parent;
+  // If the value is greater than the value in the tracker, we have a
+  // PHINode and will instead use one of the incoming values to find the
+  // type.
+  if (OutputCanon > CurrentGroup.PHINodeGVNTracker) {
+    auto It = CurrentGroup.PHINodeGVNToGVNs.find(OutputCanon);
+    assert(It != CurrentGroup.PHINodeGVNToGVNs.end() &&
+           "Could not find GVN set for PHINode number!");
+    assert(It->second.second.size() > 0 && "PHINode does not have any values!");
+    OutputCanon = *It->second.second.begin();
+  }
+  Optional<unsigned> OGVN = Region.Candidate->fromCanonicalNum(OutputCanon);
+  assert(OGVN.hasValue() && "Could not find GVN for Canonical Number?");
+  Optional<Value *> OV = Region.Candidate->fromGVN(*OGVN);
+  assert(OV.hasValue() && "Could not find value for GVN?");
+  return *OV;
+}
+
 InstructionCost
 IROutliner::findCostOutputReloads(OutlinableGroup &CurrentGroup) {
   InstructionCost OverallCost = 0;
@@ -1715,10 +2261,8 @@ IROutliner::findCostOutputReloads(OutlinableGroup &CurrentGroup) {
     TargetTransformInfo &TTI = getTTI(*Region->StartBB->getParent());
 
     // Each output incurs a load after the call, so we add that to the cost.
-    for (unsigned OutputGVN : Region->GVNStores) {
-      Optional<Value *> OV = Region->Candidate->fromGVN(OutputGVN);
-      assert(OV.hasValue() && "Could not find value for GVN?");
-      Value *V = OV.getValue();
+    for (unsigned OutputCanon : Region->GVNStores) {
+      Value *V = findOutputValueInRegion(*Region, OutputCanon);
       InstructionCost LoadCost =
           TTI.getMemoryOpCost(Instruction::Load, V->getType(), Align(1), 0,
                               TargetTransformInfo::TCK_CodeSize);
@@ -1747,6 +2291,7 @@ static InstructionCost findCostForOutputBlocks(Module &M,
   InstructionCost OutputCost = 0;
   unsigned NumOutputBranches = 0;
 
+  OutlinableRegion &FirstRegion = *CurrentGroup.Regions[0];
   IRSimilarityCandidate &Candidate = *CurrentGroup.Regions[0]->Candidate;
   DenseSet<BasicBlock *> CandidateBlocks;
   Candidate.getBasicBlocks(CandidateBlocks);
@@ -1772,10 +2317,8 @@ static InstructionCost findCostForOutputBlocks(Module &M,
 
   for (const ArrayRef<unsigned> &OutputUse :
        CurrentGroup.OutputGVNCombinations) {
-    for (unsigned GVN : OutputUse) {
-      Optional<Value *> OV = Candidate.fromGVN(GVN);
-      assert(OV.hasValue() && "Could not find value for GVN?");
-      Value *V = OV.getValue();
+    for (unsigned OutputCanon : OutputUse) {
+      Value *V = findOutputValueInRegion(FirstRegion, OutputCanon);
       InstructionCost StoreCost =
           TTI.getMemoryOpCost(Instruction::Load, V->getType(), Align(1), 0,
                               TargetTransformInfo::TCK_CodeSize);
@@ -2035,8 +2578,8 @@ unsigned IROutliner::doOutline(Module &M) {
         continue;
 
       SmallVector<BasicBlock *> BE;
-      DenseSet<BasicBlock *> BBSet;
-      OS->Candidate->getBasicBlocks(BBSet, BE);
+      DenseSet<BasicBlock *> BlocksInRegion;
+      OS->Candidate->getBasicBlocks(BlocksInRegion, BE);
       OS->CE = new (ExtractorAllocator.Allocate())
           CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false,
                         false, "outlined");
@@ -2146,8 +2689,8 @@ unsigned IROutliner::doOutline(Module &M) {
     OutlinedRegions.clear();
     for (OutlinableRegion *OS : CurrentGroup.Regions) {
       SmallVector<BasicBlock *> BE;
-      DenseSet<BasicBlock *> BBSet;
-      OS->Candidate->getBasicBlocks(BBSet, BE);
+      DenseSet<BasicBlock *> BlocksInRegion;
+      OS->Candidate->getBasicBlocks(BlocksInRegion, BE);
       OS->CE = new (ExtractorAllocator.Allocate())
           CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false,
                         false, "outlined");

diff  --git a/llvm/test/Transforms/IROutliner/gvn-output-set-overload.ll b/llvm/test/Transforms/IROutliner/gvn-output-set-overload.ll
new file mode 100644
index 0000000000000..494bb89008507
--- /dev/null
+++ b/llvm/test/Transforms/IROutliner/gvn-output-set-overload.ll
@@ -0,0 +1,122 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s
+
+; Show that we do 
diff erentiate between outputs of the region stored in PHINodes
+; versus those stored outside of PHINodes.
+
+define void @function1(i32* %a, i32* %b) {
+entry:
+  %0 = alloca i32, align 4
+  %c = load i32, i32* %0, align 4
+  br label %test1
+test1:
+  %e = load i32, i32* %0, align 4
+  br i1 true, label %first, label %test
+test:
+  %d = load i32, i32* %0, align 4
+  br i1 true, label %first, label %next
+first:
+  %1 = phi i32 [ %c, %test ], [ %e, %test1 ]
+  ret void
+next:
+  ret void
+}
+
+define void @function2(i32* %a, i32* %b) {
+entry:
+  %0 = alloca i32, align 4
+  %c = load i32, i32* %0, align 4
+  br label %test1
+test1:
+  %e = load i32, i32* %0, align 4
+  br i1 true, label %first, label %test
+test:
+  %d = load i32, i32* %0, align 4
+  br i1 true, label %first, label %next
+first:
+  ret void
+next:
+  %1 = add i32 %c, 1
+  %2 = add i32 %e, 1
+  ret void
+}
+; CHECK-LABEL: @function1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTCE_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32* [[DOTCE_LOC]], i32* null, i32 0)
+; CHECK-NEXT:    [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    br i1 [[TMP1]], label [[FIRST:%.*]], label [[NEXT:%.*]]
+; CHECK:       first:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret void
+; CHECK:       next:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: @function2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[E_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[C_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[C_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[LT_CAST1:%.*]] = bitcast i32* [[E_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32* [[C_LOC]], i32* [[E_LOC]], i32 1)
+; CHECK-NEXT:    [[C_RELOAD:%.*]] = load i32, i32* [[C_LOC]], align 4
+; CHECK-NEXT:    [[E_RELOAD:%.*]] = load i32, i32* [[E_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    br i1 [[TMP1]], label [[FIRST:%.*]], label [[NEXT:%.*]]
+; CHECK:       first:
+; CHECK-NEXT:    ret void
+; CHECK:       next:
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[C_RELOAD]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[E_RELOAD]], 1
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define internal i1 @outlined_ir_func_0(
+; CHECK-NEXT:  newFuncRoot:
+; CHECK-NEXT:    br label [[ENTRY_TO_OUTLINE:%.*]]
+; CHECK:       entry_to_outline:
+; CHECK-NEXT:    [[C:%.*]] = load i32, i32* [[TMP0:%.*]], align 4
+; CHECK-NEXT:    br label [[TEST1:%.*]]
+; CHECK:       test1:
+; CHECK-NEXT:    [[E:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br i1 true, label [[FIRST_SPLIT:%.*]], label [[TEST:%.*]]
+; CHECK:       test:
+; CHECK-NEXT:    [[D:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br i1 true, label [[FIRST_SPLIT]], label [[NEXT_EXITSTUB:%.*]]
+; CHECK:       first.split:
+; CHECK-NEXT:    [[DOTCE:%.*]] = phi i32 [ [[C]], [[TEST]] ], [ [[E]], [[TEST1]] ]
+; CHECK-NEXT:    br label [[FIRST_EXITSTUB:%.*]]
+; CHECK:       first.exitStub:
+; CHECK-NEXT:    switch i32 [[TMP3:%.*]], label [[FINAL_BLOCK_1:%.*]] [
+; CHECK-NEXT:    i32 0, label [[OUTPUT_BLOCK_0_1:%.*]]
+; CHECK-NEXT:    i32 1, label [[OUTPUT_BLOCK_1_1:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       next.exitStub:
+; CHECK-NEXT:    switch i32 [[TMP3]], label [[FINAL_BLOCK_0:%.*]] [
+; CHECK-NEXT:    i32 0, label [[OUTPUT_BLOCK_1_0:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       output_block_0_1:
+; CHECK-NEXT:    store i32 [[DOTCE]], i32* [[TMP1:%.*]], align 4
+; CHECK-NEXT:    br label [[FINAL_BLOCK_1]]
+; CHECK:       output_block_1_0:
+; CHECK-NEXT:    store i32 [[C]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    store i32 [[E]], i32* [[TMP2:%.*]], align 4
+; CHECK-NEXT:    br label [[FINAL_BLOCK_0]]
+; CHECK:       output_block_1_1:
+; CHECK-NEXT:    store i32 [[C]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    store i32 [[E]], i32* [[TMP2]], align 4
+; CHECK-NEXT:    br label [[FINAL_BLOCK_1]]
+; CHECK:       final_block_0:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       final_block_1:
+; CHECK-NEXT:    ret i1 true
+;

diff  --git a/llvm/test/Transforms/IROutliner/mismatched-phi-exits-not-in-first-outlined.ll b/llvm/test/Transforms/IROutliner/mismatched-phi-exits-not-in-first-outlined.ll
new file mode 100644
index 0000000000000..ef003bad6c0c6
--- /dev/null
+++ b/llvm/test/Transforms/IROutliner/mismatched-phi-exits-not-in-first-outlined.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s
+
+; Show that we are able to extract blocks that contain PHINodes, and selectively
+; store into it's respective block, creating a new block if needed.
+
+define void @function1(i32* %a, i32* %b) {
+entry:
+  %0 = alloca i32, align 4
+  %c = load i32, i32* %0, align 4
+  br label %test1
+test1:
+  %e = load i32, i32* %0, align 4
+  br label %first
+test:
+  %d = load i32, i32* %0, align 4
+  br label %first
+first:
+  ret void
+}
+
+define void @function2(i32* %a, i32* %b) {
+entry:
+  %0 = alloca i32, align 4
+  %c = load i32, i32* %0, align 4
+  br label %test1
+test1:
+  %e = load i32, i32* %0, align 4
+  br label %first
+test:
+  %d = load i32, i32* %0, align 4
+  br label %first
+first:
+  %1 = phi i32 [ %c, %test ], [ %e, %test1 ]
+  ret void
+}
+; CHECK-LABEL: @function1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[TMP0]], i32* null, i32 -1)
+; CHECK-NEXT:    br label [[FIRST:%.*]]
+; CHECK:       first:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: @function2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTCE_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[TMP0]], i32* [[DOTCE_LOC]], i32 0)
+; CHECK-NEXT:    [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    br label [[FIRST:%.*]]
+; CHECK:       first:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define internal void @outlined_ir_func_0(
+; CHECK-NEXT:  newFuncRoot:
+; CHECK-NEXT:    br label [[ENTRY_TO_OUTLINE:%.*]]
+; CHECK:       entry_to_outline:
+; CHECK-NEXT:    [[C:%.*]] = load i32, i32* [[TMP0:%.*]], align 4
+; CHECK-NEXT:    br label [[TEST1:%.*]]
+; CHECK:       test1:
+; CHECK-NEXT:    [[E:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br label [[PHI_BLOCK:%.*]]
+; CHECK:       test:
+; CHECK-NEXT:    [[D:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br label [[PHI_BLOCK]]
+; CHECK:       first.exitStub:
+; CHECK-NEXT:    switch i32 [[TMP2:%.*]], label [[FINAL_BLOCK_0:%.*]] [
+; CHECK-NEXT:    i32 0, label [[OUTPUT_BLOCK_1_0:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       output_block_1_0:
+; CHECK-NEXT:    store i32 [[TMP3:%.*]], i32* [[TMP1:%.*]], align 4
+; CHECK-NEXT:    br label [[FINAL_BLOCK_0]]
+; CHECK:       phi_block:
+; CHECK-NEXT:    [[TMP3]] = phi i32 [ [[C]], [[TEST:%.*]] ], [ [[E]], [[TEST1]] ]
+; CHECK-NEXT:    br label [[FIRST_EXITSTUB:%.*]]
+; CHECK:       final_block_0:
+; CHECK-NEXT:    ret void
+;

diff  --git a/llvm/test/Transforms/IROutliner/mismatched-phi-exits.ll b/llvm/test/Transforms/IROutliner/mismatched-phi-exits.ll
new file mode 100644
index 0000000000000..e51c281bab0c9
--- /dev/null
+++ b/llvm/test/Transforms/IROutliner/mismatched-phi-exits.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s
+
+; Show that we are able to extract blocks that contain PHINodes, and selectively
+; store into it's respective block, only using if needed.
+
+define void @function1(i32* %a, i32* %b) {
+entry:
+  %0 = alloca i32, align 4
+  %c = load i32, i32* %0, align 4
+  br label %test1
+test1:
+  %e = load i32, i32* %0, align 4
+  br label %first
+test:
+  %d = load i32, i32* %0, align 4
+  br label %first
+first:
+  %1 = phi i32 [ %c, %test ], [ %e, %test1 ]
+  ret void
+}
+
+define void @function2(i32* %a, i32* %b) {
+entry:
+  %0 = alloca i32, align 4
+  %c = load i32, i32* %0, align 4
+  br label %test1
+test1:
+  %e = load i32, i32* %0, align 4
+  br label %first
+test:
+  %d = load i32, i32* %0, align 4
+  br label %first
+first:
+  ret void
+}
+; CHECK-LABEL: @function1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTCE_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[TMP0]], i32* [[DOTCE_LOC]], i32 0)
+; CHECK-NEXT:    [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    br label [[FIRST:%.*]]
+; CHECK:       first:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: @function2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[TMP0]], i32* null, i32 -1)
+; CHECK-NEXT:    br label [[FIRST:%.*]]
+; CHECK:       first:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define internal void  @outlined_ir_func_0(
+; CHECK-NEXT:  newFuncRoot:
+; CHECK-NEXT:    br label [[ENTRY_TO_OUTLINE:%.*]]
+; CHECK:       entry_to_outline:
+; CHECK-NEXT:    [[C:%.*]] = load i32, i32* [[TMP0:%.*]], align 4
+; CHECK-NEXT:    br label [[TEST1:%.*]]
+; CHECK:       test1:
+; CHECK-NEXT:    [[E:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br label [[FIRST_SPLIT:%.*]]
+; CHECK:       test:
+; CHECK-NEXT:    [[D:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br label [[FIRST_SPLIT]]
+; CHECK:       first.split:
+; CHECK-NEXT:    [[DOTCE:%.*]] = phi i32 [ [[C]], [[TEST:%.*]] ], [ [[E]], [[TEST1]] ]
+; CHECK-NEXT:    br label [[FIRST_EXITSTUB:%.*]]
+; CHECK:       first.exitStub:
+; CHECK-NEXT:    switch i32 [[TMP2:%.*]], label [[FINAL_BLOCK_0:%.*]] [
+; CHECK-NEXT:    i32 0, label [[OUTPUT_BLOCK_0_0:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       output_block_0_0:
+; CHECK-NEXT:    store i32 [[DOTCE]], i32* [[TMP1:%.*]], align 4
+; CHECK-NEXT:    br label [[FINAL_BLOCK_0]]
+; CHECK:       final_block_0:
+; CHECK-NEXT:    ret void
+;

diff  --git a/llvm/test/Transforms/IROutliner/mismatched-phi-outputs-ordering.ll b/llvm/test/Transforms/IROutliner/mismatched-phi-outputs-ordering.ll
new file mode 100644
index 0000000000000..4f81747d04bb6
--- /dev/null
+++ b/llvm/test/Transforms/IROutliner/mismatched-phi-outputs-ordering.ll
@@ -0,0 +1,150 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s
+
+; Show that we do not extract similar regions that would involve the splitting
+; of phi nodes on exit.
+
+define void @function1(i32* %a, i32* %b) {
+entry:
+  %0 = alloca i32, align 4
+  %c = load i32, i32* %0, align 4
+  br label %test1
+test1:
+  %e = load i32, i32* %0, align 4
+  br i1 true, label %first, label %test
+test:
+  %d = load i32, i32* %0, align 4
+  br i1 true, label %first, label %next
+first:
+  %1 = phi i32 [ %c, %test ], [ %e, %test1 ]
+  ret void
+next:
+  %2 = add i32 %d, 1
+  %3 = add i32 %e, 1
+  ret void
+}
+
+define void @function2(i32* %a, i32* %b) {
+entry:
+  %0 = alloca i32, align 4
+  %c = load i32, i32* %0, align 4
+  br label %test1
+test1:
+  %e = load i32, i32* %0, align 4
+  br i1 true, label %first, label %test
+test:
+  %d = load i32, i32* %0, align 4
+  br i1 true, label %first, label %next
+first:
+  ret void
+next:
+  %1 = add i32 %d, 1
+  %2 = add i32 %e, 1
+  ret void
+}
+; CHECK-LABEL: @function1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTCE_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[D_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[E_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[E_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[LT_CAST1:%.*]] = bitcast i32* [[D_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    [[LT_CAST2:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST2]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32* [[E_LOC]], i32* [[D_LOC]], i32* [[DOTCE_LOC]], i32 0)
+; CHECK-NEXT:    [[E_RELOAD:%.*]] = load i32, i32* [[E_LOC]], align 4
+; CHECK-NEXT:    [[D_RELOAD:%.*]] = load i32, i32* [[D_LOC]], align 4
+; CHECK-NEXT:    [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST2]])
+; CHECK-NEXT:    br i1 [[TMP1]], label [[FIRST:%.*]], label [[NEXT:%.*]]
+; CHECK:       first:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret void
+; CHECK:       next:
+; CHECK-NEXT:    call void @outlined_ir_func_1(i32 [[D_RELOAD]], i32 [[E_RELOAD]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: @function2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[D_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[E_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[E_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[LT_CAST1:%.*]] = bitcast i32* [[D_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32* [[E_LOC]], i32* [[D_LOC]], i32* null, i32 1)
+; CHECK-NEXT:    [[E_RELOAD:%.*]] = load i32, i32* [[E_LOC]], align 4
+; CHECK-NEXT:    [[D_RELOAD:%.*]] = load i32, i32* [[D_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]])
+; CHECK-NEXT:    br i1 [[TMP1]], label [[FIRST:%.*]], label [[NEXT:%.*]]
+; CHECK:       first:
+; CHECK-NEXT:    ret void
+; CHECK:       next:
+; CHECK-NEXT:    call void @outlined_ir_func_1(i32 [[D_RELOAD]], i32 [[E_RELOAD]])
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define internal i1 @outlined_ir_func_0(
+; CHECK-NEXT:  newFuncRoot:
+; CHECK-NEXT:    br label [[ENTRY_TO_OUTLINE:%.*]]
+; CHECK:       entry_to_outline:
+; CHECK-NEXT:    [[C:%.*]] = load i32, i32* [[TMP0:%.*]], align 4
+; CHECK-NEXT:    br label [[TEST1:%.*]]
+; CHECK:       test1:
+; CHECK-NEXT:    [[E:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br i1 true, label [[FIRST_SPLIT:%.*]], label [[TEST:%.*]]
+; CHECK:       test:
+; CHECK-NEXT:    [[D:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br i1 true, label [[FIRST_SPLIT]], label [[NEXT_EXITSTUB:%.*]]
+; CHECK:       first.split:
+; CHECK-NEXT:    [[DOTCE:%.*]] = phi i32 [ [[C]], [[TEST]] ], [ [[E]], [[TEST1]] ]
+; CHECK-NEXT:    br label [[FIRST_EXITSTUB:%.*]]
+; CHECK:       first.exitStub:
+; CHECK-NEXT:    switch i32 [[TMP4:%.*]], label [[FINAL_BLOCK_1:%.*]] [
+; CHECK-NEXT:    i32 0, label [[OUTPUT_BLOCK_0_1:%.*]]
+; CHECK-NEXT:    i32 1, label [[OUTPUT_BLOCK_1_1:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       next.exitStub:
+; CHECK-NEXT:    switch i32 [[TMP4]], label [[FINAL_BLOCK_0:%.*]] [
+; CHECK-NEXT:    i32 0, label [[OUTPUT_BLOCK_0_0:%.*]]
+; CHECK-NEXT:    i32 1, label [[OUTPUT_BLOCK_1_0:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       output_block_0_0:
+; CHECK-NEXT:    store i32 [[E]], i32* [[TMP1:%.*]], align 4
+; CHECK-NEXT:    store i32 [[D]], i32* [[TMP2:%.*]], align 4
+; CHECK-NEXT:    br label [[FINAL_BLOCK_0]]
+; CHECK:       output_block_0_1:
+; CHECK-NEXT:    store i32 [[E]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    store i32 [[DOTCE]], i32* [[TMP3:%.*]], align 4
+; CHECK-NEXT:    br label [[FINAL_BLOCK_1]]
+; CHECK:       output_block_1_0:
+; CHECK-NEXT:    store i32 [[E]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    store i32 [[D]], i32* [[TMP2]], align 4
+; CHECK-NEXT:    br label [[FINAL_BLOCK_0]]
+; CHECK:       output_block_1_1:
+; CHECK-NEXT:    store i32 [[E]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    br label [[FINAL_BLOCK_1]]
+; CHECK:       final_block_0:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       final_block_1:
+; CHECK-NEXT:    ret i1 true
+;
+;
+; CHECK-LABEL: @outlined_ir_func_1(
+; CHECK-NEXT:  newFuncRoot:
+; CHECK-NEXT:    br label [[NEXT_TO_OUTLINE:%.*]]
+; CHECK:       next_to_outline:
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP0:%.*]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP1:%.*]], 1
+; CHECK-NEXT:    br label [[NEXT_AFTER_OUTLINE_EXITSTUB:%.*]]
+; CHECK:       next_after_outline.exitStub:
+; CHECK-NEXT:    ret void
+;

diff  --git a/llvm/test/Transforms/IROutliner/outlining-branches-phi-nodes.ll b/llvm/test/Transforms/IROutliner/outlining-branches-phi-nodes.ll
new file mode 100644
index 0000000000000..4e777e862543e
--- /dev/null
+++ b/llvm/test/Transforms/IROutliner/outlining-branches-phi-nodes.ll
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s
+
+; Here we have multiple exits, but the 
diff erent sources, same outputs are
+; needed, this checks that they are compressed, and moved into the appropriate
+; output blocks.
+
+define void @outline_outputs1() #0 {
+entry:
+  %output = alloca i32, align 4
+  %result = alloca i32, align 4
+  %output2 = alloca i32, align 4
+  %result2 = alloca i32, align 4
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  br label %block_2
+block_1:
+  %a2 = alloca i32, align 4
+  %b2 = alloca i32, align 4
+  br label %block_2
+block_2:
+  %a2val = load i32, i32* %a
+  %b2val = load i32, i32* %b
+  %add2 = add i32 2, %a2val
+  %mul2 = mul i32 2, %b2val
+  br label %block_5
+block_3:
+  %aval = load i32, i32* %a
+  %bval = load i32, i32* %b
+  %add = add i32 2, %aval
+  %mul = mul i32 2, %bval
+  br label %block_4
+block_4:
+  store i32 %add, i32* %output, align 4
+  store i32 %mul, i32* %result, align 4
+  br label %block_6
+block_5:
+  store i32 %add2, i32* %output, align 4
+  store i32 %mul2, i32* %result, align 4
+  br label %block_6
+block_6:
+  %
diff  = phi i32 [%aval, %block_4], [%a2val, %block_5]
+  ret void
+}
+
+define void @outline_outputs2() #0 {
+entry:
+  %output = alloca i32, align 4
+  %result = alloca i32, align 4
+  %output2 = alloca i32, align 4
+  %result2 = alloca i32, align 4
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  br label %block_2
+block_1:
+  %a2 = alloca i32, align 4
+  %b2 = alloca i32, align 4
+  br label %block_2
+block_2:
+  %a2val = load i32, i32* %a
+  %b2val = load i32, i32* %b
+  %add2 = add i32 2, %a2val
+  %mul2 = mul i32 2, %b2val
+  br label %block_5
+block_3:
+  %aval = load i32, i32* %a
+  %bval = load i32, i32* %b
+  %add = add i32 2, %aval
+  %mul = mul i32 2, %bval
+  br label %block_4
+block_4:
+  store i32 %add, i32* %output, align 4
+  store i32 %mul, i32* %result, align 4
+  br label %block_6
+block_5:
+  store i32 %add2, i32* %output, align 4
+  store i32 %mul2, i32* %result, align 4
+  br label %block_6
+block_6:
+  %
diff  = phi i32 [%aval, %block_4], [%a2val, %block_5]
+  ret void
+}
+
+; CHECK-LABEL: @outline_outputs1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DIFF_CE_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    br label [[BLOCK_2:%.*]]
+; CHECK:       block_1:
+; CHECK-NEXT:    [[A2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    br label [[BLOCK_2]]
+; CHECK:       block_2:
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[DIFF_CE_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[RESULT]], i32* [[DIFF_CE_LOC]])
+; CHECK-NEXT:    [[DIFF_CE_RELOAD:%.*]] = load i32, i32* [[DIFF_CE_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    br label [[BLOCK_6:%.*]]
+; CHECK:       block_6:
+; CHECK-NEXT:    [[DIFF:%.*]] = phi i32 [ [[DIFF_CE_RELOAD]], [[BLOCK_2]] ]
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: @outline_outputs2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DIFF_CE_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    br label [[BLOCK_2:%.*]]
+; CHECK:       block_1:
+; CHECK-NEXT:    [[A2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B2:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    br label [[BLOCK_2]]
+; CHECK:       block_2:
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[DIFF_CE_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[RESULT]], i32* [[DIFF_CE_LOC]])
+; CHECK-NEXT:    [[DIFF_CE_RELOAD:%.*]] = load i32, i32* [[DIFF_CE_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    br label [[BLOCK_6:%.*]]
+; CHECK:       block_6:
+; CHECK-NEXT:    [[DIFF:%.*]] = phi i32 [ [[DIFF_CE_RELOAD]], [[BLOCK_2]] ]
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK:  define internal void @outlined_ir_func_0(
+; CHECK-NEXT:  newFuncRoot:
+; CHECK-NEXT:    br label [[BLOCK_2_TO_OUTLINE:%.*]]
+; CHECK:       block_2_to_outline:
+; CHECK-NEXT:    [[A2VAL:%.*]] = load i32, i32* [[TMP0:%.*]], align 4
+; CHECK-NEXT:    [[B2VAL:%.*]] = load i32, i32* [[TMP1:%.*]], align 4
+; CHECK-NEXT:    [[ADD2:%.*]] = add i32 2, [[A2VAL]]
+; CHECK-NEXT:    [[MUL2:%.*]] = mul i32 2, [[B2VAL]]
+; CHECK-NEXT:    br label [[BLOCK_5:%.*]]
+; CHECK:       block_3:
+; CHECK-NEXT:    [[AVAL:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    [[BVAL:%.*]] = load i32, i32* [[TMP1]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 2, [[AVAL]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 2, [[BVAL]]
+; CHECK-NEXT:    br label [[BLOCK_4:%.*]]
+; CHECK:       block_4:
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[TMP2:%.*]], align 4
+; CHECK-NEXT:    store i32 [[MUL]], i32* [[TMP3:%.*]], align 4
+; CHECK-NEXT:    br label [[BLOCK_6_SPLIT:%.*]]
+; CHECK:       block_5:
+; CHECK-NEXT:    store i32 [[ADD2]], i32* [[TMP2]], align 4
+; CHECK-NEXT:    store i32 [[MUL2]], i32* [[TMP3]], align 4
+; CHECK-NEXT:    br label [[BLOCK_6_SPLIT]]
+; CHECK:       block_6.split:
+; CHECK-NEXT:    [[DIFF_CE:%.*]] = phi i32 [ [[AVAL]], [[BLOCK_4]] ], [ [[A2VAL]], [[BLOCK_5]] ]
+; CHECK-NEXT:    br label [[BLOCK_6_EXITSTUB:%.*]]
+; CHECK:       block_6.exitStub:
+; CHECK-NEXT:    store i32 [[DIFF_CE]], i32* [[TMP4:%.*]], align 4
+; CHECK-NEXT:    ret void
+;

diff  --git a/llvm/test/Transforms/IROutliner/outlining-exits-to-phi-node.ll b/llvm/test/Transforms/IROutliner/outlining-exits-to-phi-node.ll
index 00c99488e231d..77cb4a5bc476f 100644
--- a/llvm/test/Transforms/IROutliner/outlining-exits-to-phi-node.ll
+++ b/llvm/test/Transforms/IROutliner/outlining-exits-to-phi-node.ll
@@ -37,42 +37,50 @@ first:
 }
 ; CHECK-LABEL: @function1(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTCE_LOC:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[C:%.*]] = load i32, i32* [[TMP0]], align 4
-; CHECK-NEXT:    br label [[TEST1:%.*]]
-; CHECK:       test1:
-; CHECK-NEXT:    [[E:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[TMP0]], i32* [[DOTCE_LOC]])
+; CHECK-NEXT:    [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
 ; CHECK-NEXT:    br label [[FIRST:%.*]]
-; CHECK:       test:
-; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[TMP0]])
-; CHECK-NEXT:    br label [[FIRST]]
 ; CHECK:       first:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[C]], [[TEST:%.*]] ], [ [[E]], [[TEST1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    ret void
 ;
 ;
 ; CHECK-LABEL: @function2(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTCE_LOC:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    [[C:%.*]] = load i32, i32* [[TMP0]], align 4
-; CHECK-NEXT:    br label [[TEST1:%.*]]
-; CHECK:       test1:
-; CHECK-NEXT:    [[E:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[TMP0]], i32* [[DOTCE_LOC]])
+; CHECK-NEXT:    [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
 ; CHECK-NEXT:    br label [[FIRST:%.*]]
-; CHECK:       test:
-; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[TMP0]])
-; CHECK-NEXT:    br label [[FIRST]]
 ; CHECK:       first:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[C]], [[TEST:%.*]] ], [ [[E]], [[TEST1]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: define internal void @outlined_ir_func_0(
+; CHECK-LABEL: define internal void @outlined_ir_func_0(
 ; CHECK-NEXT:  newFuncRoot:
-; CHECK-NEXT:    br label [[TEST_TO_OUTLINE:%.*]]
-; CHECK:       test_to_outline:
-; CHECK-NEXT:    [[D:%.*]] = load i32, i32* [[TMP0:%.*]], align 4
+; CHECK-NEXT:    br label [[ENTRY_TO_OUTLINE:%.*]]
+; CHECK:       entry_to_outline:
+; CHECK-NEXT:    [[C:%.*]] = load i32, i32* [[TMP0:%.*]], align 4
+; CHECK-NEXT:    br label [[TEST1:%.*]]
+; CHECK:       test1:
+; CHECK-NEXT:    [[E:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br label [[FIRST_SPLIT:%.*]]
+; CHECK:       test:
+; CHECK-NEXT:    [[D:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br label [[FIRST_SPLIT]]
+; CHECK:       first.split:
+; CHECK-NEXT:    [[DOTCE:%.*]] = phi i32 [ [[C]], [[TEST:%.*]] ], [ [[E]], [[TEST1]] ]
 ; CHECK-NEXT:    br label [[FIRST_EXITSTUB:%.*]]
 ; CHECK:       first.exitStub:
+; CHECK-NEXT:    store i32 [[DOTCE]], i32* [[TMP1:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/IROutliner/phi-nodes-output-overload.ll b/llvm/test/Transforms/IROutliner/phi-nodes-output-overload.ll
new file mode 100644
index 0000000000000..ecc1c36d6b9e3
--- /dev/null
+++ b/llvm/test/Transforms/IROutliner/phi-nodes-output-overload.ll
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s
+
+; Show that we do not extract similar regions that would involve the splitting
+; of phi nodes on exit.
+
+define void @function1(i32* %a, i32* %b) {
+entry:
+  %0 = alloca i32, align 4
+  %c = load i32, i32* %0, align 4
+  br label %test1
+test1:
+  %e = load i32, i32* %0, align 4
+  br i1 true, label %first, label %next
+test:
+  %d = load i32, i32* %0, align 4
+  br i1 true, label %first, label %next
+first:
+  %1 = phi i32 [ %c, %test ], [ %e, %test1 ]
+  ret void
+next:
+  ret void
+}
+
+define void @function2(i32* %a, i32* %b) {
+entry:
+  %0 = alloca i32, align 4
+  %c = load i32, i32* %0, align 4
+  br label %test1
+test1:
+  %e = load i32, i32* %0, align 4
+  br i1 true, label %first, label %next
+test:
+  %d = load i32, i32* %0, align 4
+  br i1 true, label %first, label %next
+first:
+  ret void
+next:
+  %1 = phi i32 [ %c, %test ], [ %e, %test1 ]
+  ret void
+}
+; CHECK-LABEL: @function1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTCE_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32* [[DOTCE_LOC]], i32 0)
+; CHECK-NEXT:    [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    br i1 [[TMP1]], label [[FIRST:%.*]], label [[NEXT:%.*]]
+; CHECK:       first:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret void
+; CHECK:       next:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: @function2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTCE_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32* [[DOTCE_LOC]], i32 1)
+; CHECK-NEXT:    [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    br i1 [[TMP1]], label [[FIRST:%.*]], label [[NEXT:%.*]]
+; CHECK:       first:
+; CHECK-NEXT:    ret void
+; CHECK:       next:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define internal i1 @outlined_ir_func_0(
+; CHECK-NEXT:  newFuncRoot:
+; CHECK-NEXT:    br label [[ENTRY_TO_OUTLINE:%.*]]
+; CHECK:       entry_to_outline:
+; CHECK-NEXT:    [[C:%.*]] = load i32, i32* [[TMP0:%.*]], align 4
+; CHECK-NEXT:    br label [[TEST1:%.*]]
+; CHECK:       test1:
+; CHECK-NEXT:    [[E:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br i1 true, label [[FIRST_SPLIT:%.*]], label [[PHI_BLOCK:%.*]]
+; CHECK:       test:
+; CHECK-NEXT:    [[D:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br i1 true, label [[FIRST_SPLIT]], label [[PHI_BLOCK]]
+; CHECK:       first.split:
+; CHECK-NEXT:    [[DOTCE:%.*]] = phi i32 [ [[C]], [[TEST:%.*]] ], [ [[E]], [[TEST1]] ]
+; CHECK-NEXT:    br label [[FIRST_EXITSTUB:%.*]]
+; CHECK:       first.exitStub:
+; CHECK-NEXT:    switch i32 [[TMP2:%.*]], label [[FINAL_BLOCK_1:%.*]] [
+; CHECK-NEXT:    i32 0, label [[OUTPUT_BLOCK_0_1:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       next.exitStub:
+; CHECK-NEXT:    switch i32 [[TMP2]], label [[FINAL_BLOCK_0:%.*]] [
+; CHECK-NEXT:    i32 0, label [[OUTPUT_BLOCK_1_0:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       output_block_0_1:
+; CHECK-NEXT:    store i32 [[DOTCE]], i32* [[TMP1:%.*]], align 4
+; CHECK-NEXT:    br label [[FINAL_BLOCK_1]]
+; CHECK:       output_block_1_0:
+; CHECK-NEXT:    store i32 [[TMP3:%.*]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    br label [[FINAL_BLOCK_0]]
+; CHECK:       phi_block:
+; CHECK-NEXT:    [[TMP3]] = phi i32 [ [[C]], [[TEST]] ], [ [[E]], [[TEST1]] ]
+; CHECK-NEXT:    br label [[NEXT_EXITSTUB:%.*]]
+; CHECK:       final_block_0:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       final_block_1:
+; CHECK-NEXT:    ret i1 true
+;

diff  --git a/llvm/test/Transforms/IROutliner/region-inputs-in-phi-nodes.ll b/llvm/test/Transforms/IROutliner/region-inputs-in-phi-nodes.ll
new file mode 100644
index 0000000000000..258bbfe131e3e
--- /dev/null
+++ b/llvm/test/Transforms/IROutliner/region-inputs-in-phi-nodes.ll
@@ -0,0 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s
+
+; Show that we are able to propogate inputs to the region into the split PHINode
+; outside of the region if necessary.
+
+define void @function1(i32* %a, i32* %b) {
+entry:
+  %0 = alloca i32, align 4
+  %c = load i32, i32* %0, align 4
+  %z = add i32 %c, %c
+  br i1 true, label %test1, label %first
+test1:
+  %e = load i32, i32* %0, align 4
+  %1 = add i32 %c, %c
+  br i1 true, label %first, label %test
+test:
+  %d = load i32, i32* %0, align 4
+  br i1 true, label %first, label %next
+first:
+  %2 = phi i32 [ %d, %test ], [ %e, %test1 ], [ %c, %entry ]
+  ret void
+next:
+  ret void
+}
+
+define void @function2(i32* %a, i32* %b) {
+entry:
+  %0 = alloca i32, align 4
+  %c = load i32, i32* %0, align 4
+  %z = mul i32 %c, %c
+  br i1 true, label %test1, label %first
+test1:
+  %e = load i32, i32* %0, align 4
+  %1 = add i32 %c, %c
+  br i1 true, label %first, label %test
+test:
+  %d = load i32, i32* %0, align 4
+  br i1 true, label %first, label %next
+first:
+  %2 = phi i32 [ %d, %test ], [ %e, %test1 ], [ %c, %entry ]
+  ret void
+next:
+  ret void
+}
+; CHECK-LABEL: @function1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTCE_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[C:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    [[Z:%.*]] = add i32 [[C]], [[C]]
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[TARGETBLOCK:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32 [[C]], i32* [[DOTCE_LOC]])
+; CHECK-NEXT:    [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    br i1 [[TARGETBLOCK]], label [[FIRST:%.*]], label [[NEXT:%.*]]
+; CHECK:       first:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret void
+; CHECK:       next:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: @function2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTCE_LOC:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[C:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    [[Z:%.*]] = mul i32 [[C]], [[C]]
+; CHECK-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[DOTCE_LOC]] to i8*
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    [[TARGETBLOCK:%.*]] = call i1 @outlined_ir_func_0(i32* [[TMP0]], i32 [[C]], i32* [[DOTCE_LOC]])
+; CHECK-NEXT:    [[DOTCE_RELOAD:%.*]] = load i32, i32* [[DOTCE_LOC]], align 4
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; CHECK-NEXT:    br i1 [[TARGETBLOCK]], label [[FIRST:%.*]], label [[NEXT:%.*]]
+; CHECK:       first:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ [[DOTCE_RELOAD]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret void
+; CHECK:       next:
+; CHECK-NEXT:    ret void
+;
+;
+; CHECK-LABEL: define internal i1 @outlined_ir_func_0(
+; CHECK-NEXT:  newFuncRoot:
+; CHECK-NEXT:    br label [[ENTRY_TO_OUTLINE:%.*]]
+; CHECK:       entry_to_outline:
+; CHECK-NEXT:    br i1 true, label [[TEST1:%.*]], label [[FIRST_SPLIT:%.*]]
+; CHECK:       test1:
+; CHECK-NEXT:    [[E:%.*]] = load i32, i32* [[TMP0:%.*]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[TMP1:%.*]], [[TMP1]]
+; CHECK-NEXT:    br i1 true, label [[FIRST_SPLIT]], label [[TEST:%.*]]
+; CHECK:       test:
+; CHECK-NEXT:    [[D:%.*]] = load i32, i32* [[TMP0]], align 4
+; CHECK-NEXT:    br i1 true, label [[FIRST_SPLIT]], label [[NEXT_EXITSTUB:%.*]]
+; CHECK:       first.split:
+; CHECK-NEXT:    [[DOTCE:%.*]] = phi i32 [ [[D]], [[TEST]] ], [ [[E]], [[TEST1]] ], [ [[TMP1]], [[ENTRY_TO_OUTLINE]] ]
+; CHECK-NEXT:    br label [[FIRST_EXITSTUB:%.*]]
+; CHECK:       first.exitStub:
+; CHECK-NEXT:    store i32 [[DOTCE]], i32* [[TMP2:%.*]], align 4
+; CHECK-NEXT:    ret i1 true
+; CHECK:       next.exitStub:
+; CHECK-NEXT:    ret i1 false
+;


        


More information about the llvm-commits mailing list