[llvm] r373018 - [LoopFusion] Add ability to fuse guarded loops

Thu Sep 26 14:42:45 PDT 2019

Author: kbarton
Date: Thu Sep 26 14:42:45 2019
New Revision: 373018

URL: http://llvm.org/viewvc/llvm-project?rev=373018&view=rev
Log:
[LoopFusion] Add ability to fuse guarded loops

Summary:
This patch extends the current capabilities in loop fusion to fuse guarded loops
(as defined in https://reviews.llvm.org/D63885). The patch adds the necessary
safety checks to ensure that it safe to fuse the guarded loops (control flow
equivalent, no intervening code, and same guard conditions). It also provides an
alternative method to perform the actual fusion of guarded loops. The mechanics
to fuse guarded loops are slightly different then fusing non-guarded loops, so I
opted to keep them separate methods. I will be cleaning this up in later
patches, and hope to converge on a single method to fuse both guarded and
non-guarded loops, but for now I think the review will be easier to keep them
separate.

Reviewers: jdoerfert, Meinersbur, dmgreen, etiotto, Whitney

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D65464

Added:
    llvm/trunk/test/Transforms/LoopFusion/guarded.ll
Modified:
    llvm/trunk/lib/Transforms/Scalar/LoopFuse.cpp

Modified: llvm/trunk/lib/Transforms/Scalar/LoopFuse.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/LoopFuse.cpp?rev=373018&r1=373017&r2=373018&view=diff
==============================================================================

--- llvm/trunk/lib/Transforms/Scalar/LoopFuse.cpp (original)
+++ llvm/trunk/lib/Transforms/Scalar/LoopFuse.cpp Thu Sep 26 14:42:45 2019
@@ -85,6 +85,9 @@ STATISTIC(NonEqualTripCount, "Loop trip
 STATISTIC(NonAdjacent, "Loops are not adjacent");
 STATISTIC(NonEmptyPreheader, "Loop has a non-empty preheader");
 STATISTIC(FusionNotBeneficial, "Fusion is not beneficial");
+STATISTIC(NonIdenticalGuards, "Candidates have different guards");
+STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block");
+STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block");
 
 enum FusionDependenceAnalysisChoice {
   FUSION_DEPENDENCE_ANALYSIS_SCEV,
@@ -144,6 +147,8 @@ struct FusionCandidate {
   SmallVector<Instruction *, 16> MemWrites;
   /// Are all of the members of this fusion candidate still valid
   bool Valid;
+  /// Guard branch of the loop, if it exists
+  BranchInst *GuardBranch;
 
   /// Dominator and PostDominator trees are needed for the
   /// FusionCandidateCompare function, required by FusionCandidateSet to
@@ -158,8 +163,14 @@ struct FusionCandidate {
                   const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE)
       : Preheader(L->getLoopPreheader()), Header(L->getHeader()),
         ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()),
-        Latch(L->getLoopLatch()), L(L), Valid(true), DT(DT), PDT(PDT),
-        ORE(ORE) {
+        Latch(L->getLoopLatch()), L(L), Valid(true), GuardBranch(nullptr),
+        DT(DT), PDT(PDT), ORE(ORE) {
+
+    // TODO: This is temporary while we fuse both rotated and non-rotated
+    // loops. Once we switch to only fusing rotated loops, the initialization of
+    // GuardBranch can be moved into the initialization list above.
+    if (isRotated())
+      GuardBranch = L->getLoopGuardBranch();
 
     // Walk over all blocks in the loop and check for conditions that may
     // prevent fusion. For each block, walk over all instructions and collect
@@ -218,16 +229,55 @@ struct FusionCandidate {
     assert(Latch == L->getLoopLatch() && "Latch is out of sync");
   }
 
+  /// Get the entry block for this fusion candidate.
+  ///
+  /// If this fusion candidate represents a guarded loop, the entry block is the
+  /// loop guard block. If it represents an unguarded loop, the entry block is
+  /// the preheader of the loop.
+  BasicBlock *getEntryBlock() const {
+    if (GuardBranch)
+      return GuardBranch->getParent();
+    else
+      return Preheader;
+  }
+
+  /// Given a guarded loop, get the successor of the guard that is not in the
+  /// loop.
+  ///
+  /// This method returns the successor of the loop guard that is not located
+  /// within the loop (i.e., the successor of the guard that is not the
+  /// preheader).
+  /// This method is only valid for guarded loops.
+  BasicBlock *getNonLoopBlock() const {
+    assert(GuardBranch && "Only valid on guarded loops.");
+    assert(GuardBranch->isConditional() &&
+           "Expecting guard to be a conditional branch.");
+    return (GuardBranch->getSuccessor(0) == Preheader)
+               ? GuardBranch->getSuccessor(1)
+               : GuardBranch->getSuccessor(0);
+  }
+
+  bool isRotated() const {
+    assert(L && "Expecting loop to be valid.");
+    assert(Latch && "Expecting latch to be valid.");
+    return L->isLoopExiting(Latch);
+  }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   LLVM_DUMP_METHOD void dump() const {
-    dbgs() << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr")
+    dbgs() << "\tGuardBranch: "
+           << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n"
+           << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr")
            << "\n"
            << "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n"
            << "\tExitingBB: "
            << (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n"
            << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr")
            << "\n"
-           << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n";
+           << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n"
+           << "\tEntryBlock: "
+           << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr")
+           << "\n";
   }
 #endif
 
@@ -303,21 +353,24 @@ struct FusionCandidateCompare {
                   const FusionCandidate &RHS) const {
     const DominatorTree *DT = LHS.DT;
 
+    BasicBlock *LHSEntryBlock = LHS.getEntryBlock();
+    BasicBlock *RHSEntryBlock = RHS.getEntryBlock();
+
     // Do not save PDT to local variable as it is only used in asserts and thus
     // will trigger an unused variable warning if building without asserts.
     assert(DT && LHS.PDT && "Expecting valid dominator tree");
 
     // Do this compare first so if LHS == RHS, function returns false.
-    if (DT->dominates(RHS.Preheader, LHS.Preheader)) {
+    if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) {
       // RHS dominates LHS
       // Verify LHS post-dominates RHS
-      assert(LHS.PDT->dominates(LHS.Preheader, RHS.Preheader));
+      assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock));
       return false;
     }
 
-    if (DT->dominates(LHS.Preheader, RHS.Preheader)) {
+    if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) {
       // Verify RHS Postdominates LHS
-      assert(LHS.PDT->dominates(RHS.Preheader, LHS.Preheader));
+      assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock));
       return true;
     }
 
@@ -538,11 +591,14 @@ private:
                                const FusionCandidate &FC1) const {
     assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders");
 
-    if (DT.dominates(FC0.Preheader, FC1.Preheader))
-      return PDT.dominates(FC1.Preheader, FC0.Preheader);
+    BasicBlock *FC0EntryBlock = FC0.getEntryBlock();
+    BasicBlock *FC1EntryBlock = FC1.getEntryBlock();
+
+    if (DT.dominates(FC0EntryBlock, FC1EntryBlock))
+      return PDT.dominates(FC1EntryBlock, FC0EntryBlock);
 
-    if (DT.dominates(FC1.Preheader, FC0.Preheader))
-      return PDT.dominates(FC0.Preheader, FC1.Preheader);
+    if (DT.dominates(FC1EntryBlock, FC0EntryBlock))
+      return PDT.dominates(FC0EntryBlock, FC1EntryBlock);
 
     return false;
   }
@@ -677,11 +733,22 @@ private:
             continue;
           }
 
-          // For now we skip fusing if the second candidate has any instructions
-          // in the preheader. This is done because we currently do not have the
-          // safety checks to determine if it is save to move the preheader of
-          // the second candidate past the body of the first candidate. Once
-          // these checks are added, this condition can be removed.
+          // Ensure that FC0 and FC1 have identical guards.
+          // If one (or both) are not guarded, this check is not necessary.
+          if (FC0->GuardBranch && FC1->GuardBranch &&
+              !haveIdenticalGuards(*FC0, *FC1)) {
+            LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical "
+                                 "guards. Not Fusing.\n");
+            reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+                                                       NonIdenticalGuards);
+            continue;
+          }
+
+          // The following three checks look for empty blocks in FC0 and FC1. If
+          // any of these blocks are non-empty, we do not fuse. This is done
+          // because we currently do not have the safety checks to determine if
+          // it is safe to move the blocks past other blocks in the loop. Once
+          // these checks are added, these conditions can be relaxed.
           if (!isEmptyPreheader(*FC1)) {
             LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty "
                                  "preheader. Not fusing.\n");
@@ -690,6 +757,24 @@ private:
             continue;
           }
 
+          if (FC0->GuardBranch && !isEmptyExitBlock(*FC0)) {
+            LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty exit "
+                                 "block. Not fusing.\n");
+            reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+                                                       NonEmptyExitBlock);
+            continue;
+          }
+
+          if (FC1->GuardBranch && !isEmptyGuardBlock(*FC1)) {
+            LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty guard "
+                                 "block. Not fusing.\n");
+            reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+                                                       NonEmptyGuardBlock);
+            continue;
+          }
+
+          // Check the dependencies across the loops and do not fuse if it would
+          // violate them.
           if (!dependencesAllowFusion(*FC0, *FC1)) {
             LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n");
             reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
@@ -895,7 +980,7 @@ private:
     LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1
                       << "\n");
     assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth());
-    assert(DT.dominates(FC0.Preheader, FC1.Preheader));
+    assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock()));
 
     for (Instruction *WriteL0 : FC0.MemWrites) {
       for (Instruction *WriteL1 : FC1.MemWrites)
@@ -945,18 +1030,89 @@ private:
     return true;
   }
 
-  /// Determine if the exit block of \p FC0 is the preheader of \p FC1. In this
-  /// case, there is no code in between the two fusion candidates, thus making
-  /// them adjacent.
+  /// Determine if two fusion candidates are adjacent in the CFG.
+  ///
+  /// This method will determine if there are additional basic blocks in the CFG
+  /// between the exit of \p FC0 and the entry of \p FC1.
+  /// If the two candidates are guarded loops, then it checks whether the
+  /// non-loop successor of the \p FC0 guard branch is the entry block of \p
+  /// FC1. If not, then the loops are not adjacent. If the two candidates are
+  /// not guarded loops, then it checks whether the exit block of \p FC0 is the
+  /// preheader of \p FC1.
   bool isAdjacent(const FusionCandidate &FC0,
                   const FusionCandidate &FC1) const {
-    return FC0.ExitBlock == FC1.Preheader;
+    // If the successor of the guard branch is FC1, then the loops are adjacent
+    if (FC0.GuardBranch)
+      return FC0.getNonLoopBlock() == FC1.getEntryBlock();
+    else
+      return FC0.ExitBlock == FC1.getEntryBlock();
+  }
+
+  /// Determine if two fusion candidates have identical guards
+  ///
+  /// This method will determine if two fusion candidates have the same guards.
+  /// The guards are considered the same if:
+  ///   1. The instructions to compute the condition used in the compare are
+  ///      identical.
+  ///   2. The successors of the guard have the same flow into/around the loop.
+  /// If the compare instructions are identical, then the first successor of the
+  /// guard must go to the same place (either the preheader of the loop or the
+  /// NonLoopBlock). In other words, the the first successor of both loops must
+  /// both go into the loop (i.e., the preheader) or go around the loop (i.e.,
+  /// the NonLoopBlock). The same must be true for the second successor.
+  bool haveIdenticalGuards(const FusionCandidate &FC0,
+                           const FusionCandidate &FC1) const {
+    assert(FC0.GuardBranch && FC1.GuardBranch &&
+           "Expecting FC0 and FC1 to be guarded loops.");
+
+    if (auto FC0CmpInst =
+            dyn_cast<Instruction>(FC0.GuardBranch->getCondition()))
+      if (auto FC1CmpInst =
+              dyn_cast<Instruction>(FC1.GuardBranch->getCondition()))
+        if (!FC0CmpInst->isIdenticalTo(FC1CmpInst))
+          return false;
+
+    // The compare instructions are identical.
+    // Now make sure the successor of the guards have the same flow into/around
+    // the loop
+    if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader)
+      return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader);
+    else
+      return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader);
+  }
+
+  /// Check that the guard for \p FC *only* contains the cmp/branch for the
+  /// guard.
+  /// Once we are able to handle intervening code, any code in the guard block
+  /// for FC1 will need to be treated as intervening code and checked whether
+  /// it can safely move around the loops.
+  bool isEmptyGuardBlock(const FusionCandidate &FC) const {
+    assert(FC.GuardBranch && "Expecting a fusion candidate with guard branch.");
+    if (auto *CmpInst = dyn_cast<Instruction>(FC.GuardBranch->getCondition())) {
+      auto *GuardBlock = FC.GuardBranch->getParent();
+      // If the generation of the cmp value is in GuardBlock, then the size of
+      // the guard block should be 2 (cmp + branch). If the generation of the
+      // cmp value is in a different block, then the size of the guard block
+      // should only be 1.
+      if (CmpInst->getParent() == GuardBlock)
+        return GuardBlock->size() == 2;
+      else
+        return GuardBlock->size() == 1;
+    }
+
+    return false;
   }
 
   bool isEmptyPreheader(const FusionCandidate &FC) const {
+    assert(FC.Preheader && "Expecting a valid preheader");
     return FC.Preheader->size() == 1;
   }
 
+  bool isEmptyExitBlock(const FusionCandidate &FC) const {
+    assert(FC.ExitBlock && "Expecting a valid exit block");
+    return FC.ExitBlock->size() == 1;
+  }
+
   /// Fuse two fusion candidates, creating a new fused loop.
   ///
   /// This method contains the mechanics of fusing two loops, represented by \p
@@ -993,6 +1149,12 @@ private:
     LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump();
                dbgs() << "Fusion Candidate 1: \n"; FC1.dump(););
 
+    // Fusing guarded loops is handled slightly differently than non-guarded
+    // loops and has been broken out into a separate method instead of trying to
+    // intersperse the logic within a single method.
+    if (FC0.GuardBranch)
+      return fuseGuardedLoops(FC0, FC1);
+
     assert(FC1.Preheader == FC0.ExitBlock);
     assert(FC1.Preheader->size() == 1 &&
            FC1.Preheader->getSingleSuccessor() == FC1.Header);
@@ -1137,8 +1299,6 @@ private:
     SE.verify();
 #endif
 
-    FuseCounter++;
-
     LLVM_DEBUG(dbgs() << "Fusion done:\n");
 
     return FC0.L;
@@ -1170,6 +1330,232 @@ private:
              << " and " << NV("Cand2", StringRef(FC1.Preheader->getName()))
              << ": " << Stat.getDesc());
   }
+
+  /// Fuse two guarded fusion candidates, creating a new fused loop.
+  ///
+  /// Fusing guarded loops is handled much the same way as fusing non-guarded
+  /// loops. The rewiring of the CFG is slightly different though, because of
+  /// the presence of the guards around the loops and the exit blocks after the
+  /// loop body. As such, the new loop is rewired as follows:
+  ///    1. Keep the guard branch from FC0 and use the non-loop block target
+  /// from the FC1 guard branch.
+  ///    2. Remove the exit block from FC0 (this exit block should be empty
+  /// right now).
+  ///    3. Remove the guard branch for FC1
+  ///    4. Remove the preheader for FC1.
+  /// The exit block successor for the latch of FC0 is updated to be the header
+  /// of FC1 and the non-exit block successor of the latch of FC1 is updated to
+  /// be the header of FC0, thus creating the fused loop.
+  Loop *fuseGuardedLoops(const FusionCandidate &FC0,
+                         const FusionCandidate &FC1) {
+    assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops");
+
+    BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent();
+    BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent();
+    BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock();
+    BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock();
+
+    assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent");
+
+    SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
+
+    ////////////////////////////////////////////////////////////////////////////
+    // Update the Loop Guard
+    ////////////////////////////////////////////////////////////////////////////
+    // The guard for FC0 is updated to guard both FC0 and FC1. This is done by
+    // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1.
+    // Thus, one path from the guard goes to the preheader for FC0 (and thus
+    // executes the new fused loop) and the other path goes to the NonLoopBlock
+    // for FC1 (where FC1 guard would have gone if FC1 was not executed).
+    FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock);
+    FC0.ExitBlock->getTerminator()->replaceUsesOfWith(FC1GuardBlock,
+                                                      FC1.Header);
+
+    // The guard of FC1 is not necessary anymore.
+    FC1.GuardBranch->eraseFromParent();
+    new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock);
+
+    TreeUpdates.emplace_back(DominatorTree::UpdateType(
+        DominatorTree::Delete, FC1GuardBlock, FC1.Preheader));
+    TreeUpdates.emplace_back(DominatorTree::UpdateType(
+        DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock));
+    TreeUpdates.emplace_back(DominatorTree::UpdateType(
+        DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock));
+    TreeUpdates.emplace_back(DominatorTree::UpdateType(
+        DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock));
+
+    assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) &&
+           "Expecting guard block to have no predecessors");
+    assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) &&
+           "Expecting guard block to have no successors");
+
+    // Remember the phi nodes originally in the header of FC0 in order to rewire
+    // them later. However, this is only necessary if the new loop carried
+    // values might not dominate the exiting branch. While we do not generally
+    // test if this is the case but simply insert intermediate phi nodes, we
+    // need to make sure these intermediate phi nodes have different
+    // predecessors. To this end, we filter the special case where the exiting
+    // block is the latch block of the first loop. Nothing needs to be done
+    // anyway as all loop carried values dominate the latch and thereby also the
+    // exiting branch.
+    // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch
+    // (because the loops are rotated. Thus, nothing will ever be added to
+    // OriginalFC0PHIs.
+    SmallVector<PHINode *, 8> OriginalFC0PHIs;
+    if (FC0.ExitingBlock != FC0.Latch)
+      for (PHINode &PHI : FC0.Header->phis())
+        OriginalFC0PHIs.push_back(&PHI);
+
+    assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!");
+
+    // Replace incoming blocks for header PHIs first.
+    FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);
+    FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);
+
+    // The old exiting block of the first loop (FC0) has to jump to the header
+    // of the second as we need to execute the code in the second header block
+    // regardless of the trip count. That is, if the trip count is 0, so the
+    // back edge is never taken, we still have to execute both loop headers,
+    // especially (but not only!) if the second is a do-while style loop.
+    // However, doing so might invalidate the phi nodes of the first loop as
+    // the new values do only need to dominate their latch and not the exiting
+    // predicate. To remedy this potential problem we always introduce phi
+    // nodes in the header of the second loop later that select the loop carried
+    // value, if the second header was reached through an old latch of the
+    // first, or undef otherwise. This is sound as exiting the first implies the
+    // second will exit too, __without__ taking the back-edge (their
+    // trip-counts are equal after all).
+    FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock,
+                                                         FC1.Header);
+
+    TreeUpdates.emplace_back(DominatorTree::UpdateType(
+        DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));
+    TreeUpdates.emplace_back(DominatorTree::UpdateType(
+        DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
+
+    // Remove FC0 Exit Block
+    // The exit block for FC0 is no longer needed since control will flow
+    // directly to the header of FC1. Since it is an empty block, it can be
+    // removed at this point.
+    // TODO: In the future, we can handle non-empty exit blocks my merging any
+    // instructions from FC0 exit block into FC1 exit block prior to removing
+    // the block.
+    assert(pred_begin(FC0.ExitBlock) == pred_end(FC0.ExitBlock) &&
+           "Expecting exit block to be empty");
+    FC0.ExitBlock->getTerminator()->eraseFromParent();
+    new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);
+
+    // Remove FC1 Preheader
+    // The pre-header of L1 is not necessary anymore.
+    assert(pred_begin(FC1.Preheader) == pred_end(FC1.Preheader));
+    FC1.Preheader->getTerminator()->eraseFromParent();
+    new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
+    TreeUpdates.emplace_back(DominatorTree::UpdateType(
+        DominatorTree::Delete, FC1.Preheader, FC1.Header));
+
+    // Moves the phi nodes from the second to the first loops header block.
+    while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {
+      if (SE.isSCEVable(PHI->getType()))
+        SE.forgetValue(PHI);
+      if (PHI->hasNUsesOrMore(1))
+        PHI->moveBefore(&*FC0.Header->getFirstInsertionPt());
+      else
+        PHI->eraseFromParent();
+    }
+
+    // Introduce new phi nodes in the second loop header to ensure
+    // exiting the first and jumping to the header of the second does not break
+    // the SSA property of the phis originally in the first loop. See also the
+    // comment above.
+    Instruction *L1HeaderIP = &FC1.Header->front();
+    for (PHINode *LCPHI : OriginalFC0PHIs) {
+      int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);
+      assert(L1LatchBBIdx >= 0 &&
+             "Expected loop carried value to be rewired at this point!");
+
+      Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);
+
+      PHINode *L1HeaderPHI = PHINode::Create(
+          LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP);
+      L1HeaderPHI->addIncoming(LCV, FC0.Latch);
+      L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()),
+                               FC0.ExitingBlock);
+
+      LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);
+    }
+
+    // Update the latches
+
+    // Replace latch terminator destinations.
+    FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
+    FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
+
+    // If FC0.Latch and FC0.ExitingBlock are the same then we have already
+    // performed the updates above.
+    if (FC0.Latch != FC0.ExitingBlock)
+      TreeUpdates.emplace_back(DominatorTree::UpdateType(
+          DominatorTree::Insert, FC0.Latch, FC1.Header));
+
+    TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
+                                                       FC0.Latch, FC0.Header));
+    TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert,
+                                                       FC1.Latch, FC0.Header));
+    TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
+                                                       FC1.Latch, FC1.Header));
+
+    // All done
+    // Apply the updates to the Dominator Tree and cleanup.
+
+    assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) &&
+           "FC1GuardBlock has successors!!");
+    assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) &&
+           "FC1GuardBlock has predecessors!!");
+
+    // Update DT/PDT
+    DTU.applyUpdates(TreeUpdates);
+
+    LI.removeBlock(FC1.Preheader);
+    DTU.deleteBB(FC1.Preheader);
+    DTU.deleteBB(FC0.ExitBlock);
+    DTU.flush();
+
+    // Is there a way to keep SE up-to-date so we don't need to forget the loops
+    // and rebuild the information in subsequent passes of fusion?
+    SE.forgetLoop(FC1.L);
+    SE.forgetLoop(FC0.L);
+
+    // Merge the loops.
+    SmallVector<BasicBlock *, 8> Blocks(FC1.L->block_begin(),
+                                        FC1.L->block_end());
+    for (BasicBlock *BB : Blocks) {
+      FC0.L->addBlockEntry(BB);
+      FC1.L->removeBlockFromLoop(BB);
+      if (LI.getLoopFor(BB) != FC1.L)
+        continue;
+      LI.changeLoopFor(BB, FC0.L);
+    }
+    while (!FC1.L->empty()) {
+      const auto &ChildLoopIt = FC1.L->begin();
+      Loop *ChildLoop = *ChildLoopIt;
+      FC1.L->removeChildLoop(ChildLoopIt);
+      FC0.L->addChildLoop(ChildLoop);
+    }
+
+    // Delete the now empty loop L1.
+    LI.erase(FC1.L);
+
+#ifndef NDEBUG
+    assert(!verifyFunction(*FC0.Header->getParent(), &errs()));
+    assert(DT.verify(DominatorTree::VerificationLevel::Fast));
+    assert(PDT.verify());
+    LI.verify(DT);
+    SE.verify();
+#endif
+
+    LLVM_DEBUG(dbgs() << "Fusion done:\n");
+
+    return FC0.L;
+  }
 };
 
 struct LoopFuseLegacy : public FunctionPass {

Added: llvm/trunk/test/Transforms/LoopFusion/guarded.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopFusion/guarded.ll?rev=373018&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopFusion/guarded.ll (added)
+++ llvm/trunk/test/Transforms/LoopFusion/guarded.ll Thu Sep 26 14:42:45 2019
@@ -0,0 +1,67 @@
+; RUN: opt -S -loop-fusion < %s | FileCheck %s
+
+ at B = common global [1024 x i32] zeroinitializer, align 16
+
+; CHECK: void @dep_free_parametric
+; CHECK-next: entry:
+; CHECK: br i1 %{{.*}}, label %[[LOOP1PREHEADER:bb[0-9]*]], label %[[LOOP1SUCC:bb[0-9]+]]
+; CHECK: [[LOOP1PREHEADER]]
+; CHECK-NEXT: br label %[[LOOP1BODY:bb[0-9]*]]
+; CHECK: [[LOOP1BODY]]
+; CHECK: br i1 %{{.*}}, label %[[LOOP2BODY:bb[0-9]*]], label %[[LOOP2BODY]]
+; CHECK: [[LOOP2BODY]]
+; CHECK: br i1 %{{.*}}, label %[[LOOP1BODY]], label %[[LOOP2EXIT:bb[0-9]+]]
+; CHECK: [[LOOP2EXIT]]
+; CHECK: br label %[[LOOP1SUCC]]
+; CHECK: [[LOOP1SUCC]]
+; CHECK: ret void
+define void @dep_free_parametric(i32* noalias %A, i64 %N) {
+entry:
+  %cmp4 = icmp slt i64 0, %N
+  br i1 %cmp4, label %bb3, label %bb14
+
+bb3:                               ; preds = %entry
+  br label %bb5
+
+bb5:                                         ; preds = %bb3, %bb5
+  %i.05 = phi i64 [ %inc, %bb5 ], [ 0, %bb3 ]
+  %sub = sub nsw i64 %i.05, 3
+  %add = add nsw i64 %i.05, 3
+  %mul = mul nsw i64 %sub, %add
+  %rem = srem i64 %mul, %i.05
+  %conv = trunc i64 %rem to i32
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %i.05
+  store i32 %conv, i32* %arrayidx, align 4
+  %inc = add nsw i64 %i.05, 1
+  %cmp = icmp slt i64 %inc, %N
+  br i1 %cmp, label %bb5, label %bb10
+
+bb10:                                 ; preds = %bb5
+  br label %bb14
+
+bb14:                                          ; preds = %bb10, %entry
+  %cmp31 = icmp slt i64 0, %N
+  br i1 %cmp31, label %bb8, label %bb12
+
+bb8:                              ; preds = %bb14
+  br label %bb9
+
+bb9:                                        ; preds = %bb8, %bb9
+  %i1.02 = phi i64 [ %inc14, %bb9 ], [ 0, %bb8 ]
+  %sub7 = sub nsw i64 %i1.02, 3
+  %add8 = add nsw i64 %i1.02, 3
+  %mul9 = mul nsw i64 %sub7, %add8
+  %rem10 = srem i64 %mul9, %i1.02
+  %conv11 = trunc i64 %rem10 to i32
+  %arrayidx12 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %i1.02
+  store i32 %conv11, i32* %arrayidx12, align 4
+  %inc14 = add nsw i64 %i1.02, 1
+  %cmp3 = icmp slt i64 %inc14, %N
+  br i1 %cmp3, label %bb9, label %bb15
+
+bb15:                               ; preds = %bb9
+  br label %bb12
+
+bb12:                                        ; preds = %bb15, %bb14
+  ret void
+}