[llvm] [DependenceAnalysis] Extending SIV to handle separate loops (PR #128782)

Alireza Torabian via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 23 08:25:04 PDT 2025


https://github.com/1997alireza updated https://github.com/llvm/llvm-project/pull/128782

>From 442041600c17ee3b2b259e51a74dce9b2e3529a3 Mon Sep 17 00:00:00 2001
From: Alireza Torabian <alireza.torabian at huawei.com>
Date: Mon, 24 Feb 2025 11:53:53 -0500
Subject: [PATCH] [DependenceAnalysis][LoopFusion] Extending SIV to handle
 separate loops

When there is a dependency between two memory instructions in separate
loops, SIV will be able to test them and compute the direction and
the distance of the dependency. Loop fusion pass will uses this
information to detect loop-carried dependencies and fuse the loops
if it is legal.
---
 .../llvm/Analysis/DependenceAnalysis.h        | 193 ++++++---
 llvm/lib/Analysis/DependenceAnalysis.cpp      | 408 +++++++++++-------
 llvm/lib/Transforms/Scalar/LoopFuse.cpp       |  29 ++
 .../PreliminaryNoValidityCheckFixedSize.ll    |   2 +-
 .../DependenceAnalysis/SIVSeparateLoops.ll    | 145 +++++++
 .../LoopFusion/backward_loop_carried.ll       | 185 ++++++++
 llvm/test/Transforms/LoopFusion/simple.ll     |  28 +-
 7 files changed, 760 insertions(+), 230 deletions(-)
 create mode 100644 llvm/test/Analysis/DependenceAnalysis/SIVSeparateLoops.ll
 create mode 100644 llvm/test/Transforms/LoopFusion/backward_loop_carried.ll

diff --git a/llvm/include/llvm/Analysis/DependenceAnalysis.h b/llvm/include/llvm/Analysis/DependenceAnalysis.h
index 426ac757b4b0d..1031e2d33196c 100644
--- a/llvm/include/llvm/Analysis/DependenceAnalysis.h
+++ b/llvm/include/llvm/Analysis/DependenceAnalysis.h
@@ -152,13 +152,28 @@ namespace llvm {
     /// source and destination of the dependence.
     virtual unsigned getLevels() const { return 0; }
 
+    /// getSeparateLevels - Returns the number of separate loops surrounding
+    /// the source and destination of the dependence.
+    virtual unsigned getSeparateLevels() const { return 0; }
+
+    /// getDVEntry - Returns the DV entry associated with a regular or a
+    /// separate level
+    DVEntry getDVEntry(unsigned Level, bool Separate) const;
+
     /// getDirection - Returns the direction associated with a particular
-    /// level.
-    virtual unsigned getDirection(unsigned Level) const { return DVEntry::ALL; }
+    /// level. If Separate is set to true, information about a separate
+    /// level is provided.
+    virtual unsigned getDirection(unsigned Level, bool Separate = false) const {
+      return DVEntry::ALL;
+    }
 
     /// getDistance - Returns the distance (or NULL) associated with a
-    /// particular level.
-    virtual const SCEV *getDistance(unsigned Level) const { return nullptr; }
+    /// particular level. If Separate is set to true, information about
+    /// a separate level is provided.
+    virtual const SCEV *getDistance(unsigned Level,
+                                    bool Separate = false) const {
+      return nullptr;
+    }
 
     /// Check if the direction vector is negative. A negative direction
     /// vector means Src and Dst are reversed in the actual program.
@@ -171,21 +186,35 @@ namespace llvm {
     virtual bool normalize(ScalarEvolution *SE) { return false; }
 
     /// isPeelFirst - Returns true if peeling the first iteration from
-    /// this loop will break this dependence.
-    virtual bool isPeelFirst(unsigned Level) const { return false; }
+    /// this loop will break this dependence. If Separate is set to true,
+    /// information about a separate level is provided.
+    virtual bool isPeelFirst(unsigned Level, bool Separate = false) const {
+      return false;
+    }
 
     /// isPeelLast - Returns true if peeling the last iteration from
-    /// this loop will break this dependence.
-    virtual bool isPeelLast(unsigned Level) const { return false; }
+    /// this loop will break this dependence. If Separate is set to true,
+    /// information about a separate level is provided.
+    virtual bool isPeelLast(unsigned Level, bool Separate = false) const {
+      return false;
+    }
 
     /// isSplitable - Returns true if splitting this loop will break
-    /// the dependence.
-    virtual bool isSplitable(unsigned Level) const { return false; }
+    /// the dependence. If Separate is set to true, information about a
+    /// separate level is provided.
+    virtual bool isSplitable(unsigned Level, bool Separate = false) const {
+      return false;
+    }
+
+    /// inSeparateLoops - Returns true if this level is performed across
+    /// two separate loop nests.
+    virtual bool inSeparateLoops(unsigned Level) const { return false; }
 
     /// isScalar - Returns true if a particular level is scalar; that is,
     /// if no subscript in the source or destination mention the induction
-    /// variable associated with the loop at this level.
-    virtual bool isScalar(unsigned Level) const;
+    /// variable associated with the loop at this level. If Separate is
+    /// set to true, information about a separate level is provided.
+    virtual bool isScalar(unsigned Level, bool Separate = false) const;
 
     /// getNextPredecessor - Returns the value of the NextPredecessor
     /// field.
@@ -245,13 +274,33 @@ namespace llvm {
     /// source and destination of the dependence.
     unsigned getLevels() const override { return Levels; }
 
+    /// getSeparateLevels - Returns the number of separate loops surrounding
+    /// the source and destination of the dependence.
+    unsigned getSeparateLevels() const override { return SeparateLevels; }
+
+    /// getDVEntry - Returns the DV entry associated with a regular or a
+    /// separate level
+    DVEntry getDVEntry(unsigned Level, bool Separate) const {
+      if (!Separate) {
+        assert(0 < Level && Level <= Levels && "Level out of range");
+        return DV[Level - 1];
+      } else {
+        assert(Levels < Level && Level <= Levels + SeparateLevels &&
+               "Separate level out of range");
+        return DVSeparate[Level - Levels - 1];
+      }
+    }
+
     /// getDirection - Returns the direction associated with a particular
-    /// level.
-    unsigned getDirection(unsigned Level) const override;
+    /// level. If Separate is set to true, information about a separate
+    /// level is provided.
+    unsigned getDirection(unsigned Level, bool Separate = false) const override;
 
     /// getDistance - Returns the distance (or NULL) associated with a
-    /// particular level.
-    const SCEV *getDistance(unsigned Level) const override;
+    /// particular level. If Separate is set to true, information about
+    /// a separate level is provided.
+    const SCEV *getDistance(unsigned Level,
+                            bool Separate = false) const override;
 
     /// Check if the direction vector is negative. A negative direction
     /// vector means Src and Dst are reversed in the actual program.
@@ -264,27 +313,37 @@ namespace llvm {
     bool normalize(ScalarEvolution *SE) override;
 
     /// isPeelFirst - Returns true if peeling the first iteration from
-    /// this loop will break this dependence.
-    bool isPeelFirst(unsigned Level) const override;
+    /// this loop will break this dependence. If Separate is set to true,
+    /// information about a separate level is provided.
+    bool isPeelFirst(unsigned Level, bool Separate = false) const override;
 
     /// isPeelLast - Returns true if peeling the last iteration from
-    /// this loop will break this dependence.
-    bool isPeelLast(unsigned Level) const override;
+    /// this loop will break this dependence. If Separate is set to true,
+    /// information about a separate level is provided.
+    bool isPeelLast(unsigned Level, bool Separate = false) const override;
 
     /// isSplitable - Returns true if splitting the loop will break
-    /// the dependence.
-    bool isSplitable(unsigned Level) const override;
+    /// the dependence. If Separate is set to true, information about a
+    /// separate level is provided.
+    bool isSplitable(unsigned Level, bool Separate = false) const override;
+
+    /// inSeparateLoops - Returns true if this level is performed across
+    /// two separate loop nests.
+    bool inSeparateLoops(unsigned Level) const override;
 
     /// isScalar - Returns true if a particular level is scalar; that is,
     /// if no subscript in the source or destination mention the induction
-    /// variable associated with the loop at this level.
-    bool isScalar(unsigned Level) const override;
+    /// variable associated with the loop at this level. If Separate is
+    /// set to true, information about a separate level is provided.
+    bool isScalar(unsigned Level, bool Separate = false) const override;
 
   private:
     unsigned short Levels;
+    unsigned short SeparateLevels;
     bool LoopIndependent;
     bool Consistent; // Init to true, then refine.
     std::unique_ptr<DVEntry[]> DV;
+    std::unique_ptr<DVEntry[]> DVSeparate;
     friend class DependenceInfo;
   };
 
@@ -405,7 +464,8 @@ namespace llvm {
       const SCEV *A;
       const SCEV *B;
       const SCEV *C;
-      const Loop *AssociatedLoop;
+      const Loop *AssociatedSrcLoop;
+      const Loop *AssociatedDstLoop;
 
     public:
       /// isEmpty - Return true if the constraint is of kind Empty.
@@ -449,18 +509,25 @@ namespace llvm {
       /// Otherwise assert.
       const SCEV *getD() const;
 
-      /// getAssociatedLoop - Returns the loop associated with this constraint.
-      const Loop *getAssociatedLoop() const;
+      /// getAssociatedSrcLoop - Returns the source loop associated with this
+      /// constraint.
+      const Loop *getAssociatedSrcLoop() const;
+
+      /// getAssociatedDstLoop - Returns the destination loop associated with
+      /// this constraint.
+      const Loop *getAssociatedDstLoop() const;
 
       /// setPoint - Change a constraint to Point.
-      void setPoint(const SCEV *X, const SCEV *Y, const Loop *CurrentLoop);
+      void setPoint(const SCEV *X, const SCEV *Y, const Loop *CurrentSrcLoop,
+                    const Loop *CurrentDstLoop);
 
       /// setLine - Change a constraint to Line.
-      void setLine(const SCEV *A, const SCEV *B,
-                   const SCEV *C, const Loop *CurrentLoop);
+      void setLine(const SCEV *A, const SCEV *B, const SCEV *C,
+                   const Loop *CurrentSrcLoop, const Loop *CurrentDstLoop);
 
       /// setDistance - Change a constraint to Distance.
-      void setDistance(const SCEV *D, const Loop *CurrentLoop);
+      void setDistance(const SCEV *D, const Loop *CurrentSrcLoop,
+                       const Loop *CurrentDstLoop);
 
       /// setEmpty - Change a constraint to Empty.
       void setEmpty();
@@ -473,6 +540,10 @@ namespace llvm {
       void dump(raw_ostream &OS) const;
     };
 
+    /// Returns true if two loops are the same or they have the same tripcount
+    /// and depth
+    bool areLoopsSimilar(const Loop *SrcLoop, const Loop *DstLoop) const;
+
     /// establishNestingLevels - Examines the loop nesting of the Src and Dst
     /// instructions and establishes their shared loops. Sets the variables
     /// CommonLevels, SrcLevels, and MaxLevels.
@@ -523,10 +594,15 @@ namespace llvm {
     ///     e - 5
     ///     f - 6
     ///     g - 7 = MaxLevels
-    void establishNestingLevels(const Instruction *Src,
-                                const Instruction *Dst);
+    /// SeparateLevels counts the number of loop levels after the common levels
+    /// that are not identical but are considered similar. Two levels are
+    /// considered similar if they have the same trip count and the same
+    /// nesting depth.
+    /// For example, if loops `c` and `e` are similar, then they contribute to
+    /// the SeparateLevels count and SeparateLevels is set to 1.
+    void establishNestingLevels(const Instruction *Src, const Instruction *Dst);
 
-    unsigned CommonLevels, SrcLevels, MaxLevels;
+    unsigned CommonLevels, SrcLevels, MaxLevels, SeparateLevels;
 
     /// mapSrcLoop - Given one of the loops containing the source, return
     /// its level index in our numbering scheme.
@@ -665,13 +741,10 @@ namespace llvm {
     /// Returns true if any possible dependence is disproved.
     /// If there might be a dependence, returns false.
     /// Sets appropriate direction and distance.
-    bool strongSIVtest(const SCEV *Coeff,
-                       const SCEV *SrcConst,
-                       const SCEV *DstConst,
-                       const Loop *CurrentLoop,
-                       unsigned Level,
-                       FullDependence &Result,
-                       Constraint &NewConstraint) const;
+    bool strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst,
+                       const SCEV *DstConst, const Loop *CurrentSrcLoop,
+                       const Loop *CurrentDstLoop, unsigned Level,
+                       FullDependence &Result, Constraint &NewConstraint) const;
 
     /// weakCrossingSIVtest - Tests the weak-crossing SIV subscript pair
     /// (Src and Dst) for dependence.
@@ -683,13 +756,10 @@ namespace llvm {
     /// Sets appropriate direction entry.
     /// Set consistent to false.
     /// Marks the dependence as splitable.
-    bool weakCrossingSIVtest(const SCEV *SrcCoeff,
-                             const SCEV *SrcConst,
-                             const SCEV *DstConst,
-                             const Loop *CurrentLoop,
-                             unsigned Level,
-                             FullDependence &Result,
-                             Constraint &NewConstraint,
+    bool weakCrossingSIVtest(const SCEV *SrcCoeff, const SCEV *SrcConst,
+                             const SCEV *DstConst, const Loop *CurrentSrcLoop,
+                             const Loop *CurrentDstLoop, unsigned Level,
+                             FullDependence &Result, Constraint &NewConstraint,
                              const SCEV *&SplitIter) const;
 
     /// ExactSIVtest - Tests the SIV subscript pair
@@ -701,13 +771,10 @@ namespace llvm {
     /// If there might be a dependence, returns false.
     /// Sets appropriate direction entry.
     /// Set consistent to false.
-    bool exactSIVtest(const SCEV *SrcCoeff,
-                      const SCEV *DstCoeff,
-                      const SCEV *SrcConst,
-                      const SCEV *DstConst,
-                      const Loop *CurrentLoop,
-                      unsigned Level,
-                      FullDependence &Result,
+    bool exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
+                      const SCEV *SrcConst, const SCEV *DstConst,
+                      const Loop *CurrentSrcLoop, const Loop *CurrentDstLoop,
+                      unsigned Level, FullDependence &Result,
                       Constraint &NewConstraint) const;
 
     /// weakZeroSrcSIVtest - Tests the weak-zero SIV subscript pair
@@ -720,11 +787,9 @@ namespace llvm {
     /// Sets appropriate direction entry.
     /// Set consistent to false.
     /// If loop peeling will break the dependence, mark appropriately.
-    bool weakZeroSrcSIVtest(const SCEV *DstCoeff,
-                            const SCEV *SrcConst,
-                            const SCEV *DstConst,
-                            const Loop *CurrentLoop,
-                            unsigned Level,
+    bool weakZeroSrcSIVtest(const SCEV *DstCoeff, const SCEV *SrcConst,
+                            const SCEV *DstConst, const Loop *CurrentSrcLoop,
+                            const Loop *CurrentDstLoop, unsigned Level,
                             FullDependence &Result,
                             Constraint &NewConstraint) const;
 
@@ -738,11 +803,9 @@ namespace llvm {
     /// Sets appropriate direction entry.
     /// Set consistent to false.
     /// If loop peeling will break the dependence, mark appropriately.
-    bool weakZeroDstSIVtest(const SCEV *SrcCoeff,
-                            const SCEV *SrcConst,
-                            const SCEV *DstConst,
-                            const Loop *CurrentLoop,
-                            unsigned Level,
+    bool weakZeroDstSIVtest(const SCEV *SrcCoeff, const SCEV *SrcConst,
+                            const SCEV *DstConst, const Loop *CurrentSrcLoop,
+                            const Loop *CurrentDstLoop, unsigned Level,
                             FullDependence &Result,
                             Constraint &NewConstraint) const;
 
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp
index 580cf43be3cb9..407d103d66c08 100644
--- a/llvm/lib/Analysis/DependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/DependenceAnalysis.cpp
@@ -104,6 +104,7 @@ STATISTIC(GCDindependence, "GCD independence");
 STATISTIC(BanerjeeApplications, "Banerjee applications");
 STATISTIC(BanerjeeIndependence, "Banerjee independence");
 STATISTIC(BanerjeeSuccesses, "Banerjee successes");
+STATISTIC(SeparateLoopsConsidered, "Separate loops considered");
 
 static cl::opt<bool>
     Delinearize("da-delinearize", cl::init(true), cl::Hidden,
@@ -253,10 +254,7 @@ bool Dependence::isAnti() const {
 // if no subscript in the source or destination mention the induction
 // variable associated with the loop at this level.
 // Leave this out of line, so it will serve as a virtual method anchor
-bool Dependence::isScalar(unsigned level) const {
-  return false;
-}
-
+bool Dependence::isScalar(unsigned level, bool Separate) const { return false; }
 
 //===----------------------------------------------------------------------===//
 // FullDependence methods
@@ -330,50 +328,50 @@ bool FullDependence::normalize(ScalarEvolution *SE) {
 // The rest are simple getters that hide the implementation.
 
 // getDirection - Returns the direction associated with a particular level.
-unsigned FullDependence::getDirection(unsigned Level) const {
-  assert(0 < Level && Level <= Levels && "Level out of range");
-  return DV[Level - 1].Direction;
+// If Separate is set to true, information about a separate level is provided.
+unsigned FullDependence::getDirection(unsigned Level, bool Separate) const {
+  return getDVEntry(Level, Separate).Direction;
 }
 
-
-// Returns the distance (or NULL) associated with a particular level.
-const SCEV *FullDependence::getDistance(unsigned Level) const {
-  assert(0 < Level && Level <= Levels && "Level out of range");
-  return DV[Level - 1].Distance;
+// Returns the distance (or NULL) associated with a particular level. If
+// Separate is set to true, information about a separate level is provided.
+const SCEV *FullDependence::getDistance(unsigned Level, bool Separate) const {
+  return getDVEntry(Level, Separate).Distance;
 }
 
-
 // Returns true if a particular level is scalar; that is,
 // if no subscript in the source or destination mention the induction
-// variable associated with the loop at this level.
-bool FullDependence::isScalar(unsigned Level) const {
-  assert(0 < Level && Level <= Levels && "Level out of range");
-  return DV[Level - 1].Scalar;
+// variable associated with the loop at this level. If Separate is set
+// to true, information about a separate level is provided.
+bool FullDependence::isScalar(unsigned Level, bool Separate) const {
+  return getDVEntry(Level, Separate).Scalar;
 }
 
-
 // Returns true if peeling the first iteration from this loop
-// will break this dependence.
-bool FullDependence::isPeelFirst(unsigned Level) const {
-  assert(0 < Level && Level <= Levels && "Level out of range");
-  return DV[Level - 1].PeelFirst;
+// will break this dependence. If Separate is set to true, information
+// about a separate level is provided.
+bool FullDependence::isPeelFirst(unsigned Level, bool Separate) const {
+  return getDVEntry(Level, Separate).PeelFirst;
 }
 
-
 // Returns true if peeling the last iteration from this loop
-// will break this dependence.
-bool FullDependence::isPeelLast(unsigned Level) const {
-  assert(0 < Level && Level <= Levels && "Level out of range");
-  return DV[Level - 1].PeelLast;
+// will break this dependence. If Separate is set to true, information
+// about a separate level is provided.
+bool FullDependence::isPeelLast(unsigned Level, bool Separate) const {
+  return getDVEntry(Level, Separate).PeelLast;
 }
 
-
-// Returns true if splitting this loop will break the dependence.
-bool FullDependence::isSplitable(unsigned Level) const {
-  assert(0 < Level && Level <= Levels && "Level out of range");
-  return DV[Level - 1].Splitable;
+// Returns true if splitting this loop will break the dependence. If
+// Separate is set to true, information about a separate level is provided.
+bool FullDependence::isSplitable(unsigned Level, bool Separate) const {
+  return getDVEntry(Level, Separate).Splitable;
 }
 
+// Returns true if this level is performed across two separate loop nests.
+bool FullDependence::inSeparateLoops(unsigned Level) const {
+  assert(0 < Level && Level <= Levels + SeparateLevels && "Level out of range");
+  return Level > Levels;
+}
 
 //===----------------------------------------------------------------------===//
 // DependenceInfo::Constraint methods
@@ -428,38 +426,50 @@ const SCEV *DependenceInfo::Constraint::getD() const {
   return SE->getNegativeSCEV(C);
 }
 
+// Returns the source loop associated with this constraint.
+const Loop *DependenceInfo::Constraint::getAssociatedSrcLoop() const {
+  assert((Kind == Distance || Kind == Line || Kind == Point) &&
+         "Kind should be Distance, Line, or Point");
+  return AssociatedSrcLoop;
+}
 
-// Returns the loop associated with this constraint.
-const Loop *DependenceInfo::Constraint::getAssociatedLoop() const {
+// Returns the destination loop associated with this constraint.
+const Loop *DependenceInfo::Constraint::getAssociatedDstLoop() const {
   assert((Kind == Distance || Kind == Line || Kind == Point) &&
          "Kind should be Distance, Line, or Point");
-  return AssociatedLoop;
+  return AssociatedDstLoop;
 }
 
 void DependenceInfo::Constraint::setPoint(const SCEV *X, const SCEV *Y,
-                                          const Loop *CurLoop) {
+                                          const Loop *CurSrcLoop,
+                                          const Loop *CurDstLoop) {
   Kind = Point;
   A = X;
   B = Y;
-  AssociatedLoop = CurLoop;
+  AssociatedSrcLoop = CurSrcLoop;
+  AssociatedDstLoop = CurDstLoop;
 }
 
 void DependenceInfo::Constraint::setLine(const SCEV *AA, const SCEV *BB,
-                                         const SCEV *CC, const Loop *CurLoop) {
+                                         const SCEV *CC, const Loop *CurSrcLoop,
+                                         const Loop *CurDstLoop) {
   Kind = Line;
   A = AA;
   B = BB;
   C = CC;
-  AssociatedLoop = CurLoop;
+  AssociatedSrcLoop = CurSrcLoop;
+  AssociatedDstLoop = CurDstLoop;
 }
 
 void DependenceInfo::Constraint::setDistance(const SCEV *D,
-                                             const Loop *CurLoop) {
+                                             const Loop *CurSrcLoop,
+                                             const Loop *CurDstLoop) {
   Kind = Distance;
   A = SE->getOne(D->getType());
   B = SE->getNegativeSCEV(A);
   C = SE->getNegativeSCEV(D);
-  AssociatedLoop = CurLoop;
+  AssociatedSrcLoop = CurSrcLoop;
+  AssociatedDstLoop = CurDstLoop;
 }
 
 void DependenceInfo::Constraint::setEmpty() { Kind = Empty; }
@@ -606,8 +616,8 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) {
         ++DeltaSuccesses;
         return true;
       }
-      if (const SCEVConstant *CUB =
-          collectConstantUpperBound(X->getAssociatedLoop(), Prod1->getType())) {
+      if (const SCEVConstant *CUB = collectConstantUpperBound(
+              X->getAssociatedSrcLoop(), Prod1->getType())) {
         const APInt &UpperBound = CUB->getAPInt();
         LLVM_DEBUG(dbgs() << "\t\tupper bound = " << UpperBound << "\n");
         if (Xq.sgt(UpperBound) || Yq.sgt(UpperBound)) {
@@ -616,9 +626,8 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) {
           return true;
         }
       }
-      X->setPoint(SE->getConstant(Xq),
-                  SE->getConstant(Yq),
-                  X->getAssociatedLoop());
+      X->setPoint(SE->getConstant(Xq), SE->getConstant(Yq),
+                  X->getAssociatedSrcLoop(), X->getAssociatedDstLoop());
       ++DeltaSuccesses;
       return true;
     }
@@ -654,6 +663,7 @@ bool DependenceInfo::intersectConstraints(Constraint *X, const Constraint *Y) {
 // For debugging purposes. Dumps a dependence to OS.
 void Dependence::dump(raw_ostream &OS) const {
   bool Splitable = false;
+  bool SeparatesStarted = false;
   if (isConfused())
     OS << "confused";
   else {
@@ -668,19 +678,24 @@ void Dependence::dump(raw_ostream &OS) const {
     else if (isInput())
       OS << "input";
     unsigned Levels = getLevels();
+    unsigned SeparateLevels = getSeparateLevels();
     OS << " [";
-    for (unsigned II = 1; II <= Levels; ++II) {
-      if (isSplitable(II))
+    for (unsigned II = 1; II <= Levels + SeparateLevels; ++II) {
+      if (!SeparatesStarted && inSeparateLoops(II)) {
+        SeparatesStarted = true;
+        OS << "/ ";
+      }
+      if (isSplitable(II, SeparatesStarted))
         Splitable = true;
-      if (isPeelFirst(II))
+      if (isPeelFirst(II, SeparatesStarted))
         OS << 'p';
-      const SCEV *Distance = getDistance(II);
+      const SCEV *Distance = getDistance(II, SeparatesStarted);
       if (Distance)
         OS << *Distance;
-      else if (isScalar(II))
+      else if (isScalar(II, SeparatesStarted))
         OS << "S";
       else {
-        unsigned Direction = getDirection(II);
+        unsigned Direction = getDirection(II, SeparatesStarted);
         if (Direction == DVEntry::ALL)
           OS << "*";
         else {
@@ -692,9 +707,9 @@ void Dependence::dump(raw_ostream &OS) const {
             OS << ">";
         }
       }
-      if (isPeelLast(II))
+      if (isPeelLast(II, SeparatesStarted))
         OS << 'p';
-      if (II < Levels)
+      if (II < Levels + SeparateLevels)
         OS << " ";
     }
     if (isLoopIndependent())
@@ -756,6 +771,34 @@ bool isLoadOrStore(const Instruction *I) {
   return false;
 }
 
+// Returns true if two loops are the same or they have the same tripcount and
+// depth
+bool DependenceInfo::areLoopsSimilar(const Loop *SrcLoop,
+                                     const Loop *DstLoop) const {
+  if (SrcLoop == DstLoop)
+    return true;
+
+  if (SrcLoop->getLoopDepth() != DstLoop->getLoopDepth())
+    return false;
+
+  if (!SrcLoop || !SrcLoop->getLoopLatch() || !DstLoop ||
+      !DstLoop->getLoopLatch())
+    return false;
+
+  const SCEV *SrcUB, *DstUP;
+  if (SE->hasLoopInvariantBackedgeTakenCount(SrcLoop))
+    SrcUB = SE->getBackedgeTakenCount(SrcLoop);
+  if (SE->hasLoopInvariantBackedgeTakenCount(DstLoop))
+    DstUP = SE->getBackedgeTakenCount(DstLoop);
+
+  if (SrcUB == nullptr || DstUP == nullptr)
+    return false;
+
+  if (SE->isKnownPredicate(ICmpInst::ICMP_EQ, SrcUB, DstUP))
+    return true;
+
+  return false;
+}
 
 // Examines the loop nesting of the Src and Dst
 // instructions and establishes their shared loops. Sets the variables
@@ -807,6 +850,11 @@ bool isLoadOrStore(const Instruction *I) {
 //     e - 5
 //     f - 6
 //     g - 7 = MaxLevels
+// SeparateLevels counts the number of levels after common levels that are
+// not common but are similar, meaning that they have the same tripcount
+// and depth. Assume that in this code fragment, levels c and e are
+// similar. In this case only the loop nests at the next level after
+// common levels are similar, and SeparateLevel is set to 1.
 void DependenceInfo::establishNestingLevels(const Instruction *Src,
                                             const Instruction *Dst) {
   const BasicBlock *SrcBlock = Src->getParent();
@@ -817,6 +865,7 @@ void DependenceInfo::establishNestingLevels(const Instruction *Src,
   const Loop *DstLoop = LI->getLoopFor(DstBlock);
   SrcLevels = SrcLevel;
   MaxLevels = SrcLevel + DstLevel;
+  SeparateLevels = 0;
   while (SrcLevel > DstLevel) {
     SrcLoop = SrcLoop->getParentLoop();
     SrcLevel--;
@@ -825,16 +874,23 @@ void DependenceInfo::establishNestingLevels(const Instruction *Src,
     DstLoop = DstLoop->getParentLoop();
     DstLevel--;
   }
+  // find the first separate similar level
+  while (!areLoopsSimilar(SrcLoop, DstLoop)) {
+    SrcLoop = SrcLoop->getParentLoop();
+    DstLoop = DstLoop->getParentLoop();
+    SrcLevel--;
+  }
+  // continue to find the first common level
   while (SrcLoop != DstLoop) {
     SrcLoop = SrcLoop->getParentLoop();
     DstLoop = DstLoop->getParentLoop();
     SrcLevel--;
+    SeparateLevels++;
   }
   CommonLevels = SrcLevel;
   MaxLevels -= CommonLevels;
 }
 
-
 // Given one of the loops containing the source, return
 // its level index in our numbering scheme.
 unsigned DependenceInfo::mapSrcLoop(const Loop *SrcLoop) const {
@@ -1221,8 +1277,9 @@ bool DependenceInfo::testZIV(const SCEV *Src, const SCEV *Dst,
 //
 // Return true if dependence disproved.
 bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst,
-                                   const SCEV *DstConst, const Loop *CurLoop,
-                                   unsigned Level, FullDependence &Result,
+                                   const SCEV *DstConst, const Loop *CurSrcLoop,
+                                   const Loop *CurDstLoop, unsigned Level,
+                                   FullDependence &Result,
                                    Constraint &NewConstraint) const {
   LLVM_DEBUG(dbgs() << "\tStrong SIV test\n");
   LLVM_DEBUG(dbgs() << "\t    Coeff = " << *Coeff);
@@ -1240,7 +1297,8 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst,
   LLVM_DEBUG(dbgs() << ", " << *Delta->getType() << "\n");
 
   // check that |Delta| < iteration count
-  if (const SCEV *UpperBound = collectUpperBound(CurLoop, Delta->getType())) {
+  if (const SCEV *UpperBound =
+          collectUpperBound(CurSrcLoop, Delta->getType())) {
     LLVM_DEBUG(dbgs() << "\t    UpperBound = " << *UpperBound);
     LLVM_DEBUG(dbgs() << ", " << *UpperBound->getType() << "\n");
     const SCEV *AbsDelta =
@@ -1273,7 +1331,8 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst,
       return true;
     }
     Result.DV[Level].Distance = SE->getConstant(Distance);
-    NewConstraint.setDistance(SE->getConstant(Distance), CurLoop);
+    NewConstraint.setDistance(SE->getConstant(Distance), CurSrcLoop,
+                              CurDstLoop);
     if (Distance.sgt(0))
       Result.DV[Level].Direction &= Dependence::DVEntry::LT;
     else if (Distance.slt(0))
@@ -1285,7 +1344,7 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst,
   else if (Delta->isZero()) {
     // since 0/X == 0
     Result.DV[Level].Distance = Delta;
-    NewConstraint.setDistance(Delta, CurLoop);
+    NewConstraint.setDistance(Delta, CurSrcLoop, CurDstLoop);
     Result.DV[Level].Direction &= Dependence::DVEntry::EQ;
     ++StrongSIVsuccesses;
   }
@@ -1293,13 +1352,12 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst,
     if (Coeff->isOne()) {
       LLVM_DEBUG(dbgs() << "\t    Distance = " << *Delta << "\n");
       Result.DV[Level].Distance = Delta; // since X/1 == X
-      NewConstraint.setDistance(Delta, CurLoop);
+      NewConstraint.setDistance(Delta, CurSrcLoop, CurDstLoop);
     }
     else {
       Result.Consistent = false;
-      NewConstraint.setLine(Coeff,
-                            SE->getNegativeSCEV(Coeff),
-                            SE->getNegativeSCEV(Delta), CurLoop);
+      NewConstraint.setLine(Coeff, SE->getNegativeSCEV(Coeff),
+                            SE->getNegativeSCEV(Delta), CurSrcLoop, CurDstLoop);
     }
 
     // maybe we can get a useful direction
@@ -1327,7 +1385,6 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst,
   return false;
 }
 
-
 // weakCrossingSIVtest -
 // From the paper, Practical Dependence Testing, Section 4.2.2
 //
@@ -1358,8 +1415,9 @@ bool DependenceInfo::strongSIVtest(const SCEV *Coeff, const SCEV *SrcConst,
 // Return true if dependence disproved.
 bool DependenceInfo::weakCrossingSIVtest(
     const SCEV *Coeff, const SCEV *SrcConst, const SCEV *DstConst,
-    const Loop *CurLoop, unsigned Level, FullDependence &Result,
-    Constraint &NewConstraint, const SCEV *&SplitIter) const {
+    const Loop *CurSrcLoop, const Loop *CurDstLoop, unsigned Level,
+    FullDependence &Result, Constraint &NewConstraint,
+    const SCEV *&SplitIter) const {
   LLVM_DEBUG(dbgs() << "\tWeak-Crossing SIV test\n");
   LLVM_DEBUG(dbgs() << "\t    Coeff = " << *Coeff << "\n");
   LLVM_DEBUG(dbgs() << "\t    SrcConst = " << *SrcConst << "\n");
@@ -1370,7 +1428,7 @@ bool DependenceInfo::weakCrossingSIVtest(
   Result.Consistent = false;
   const SCEV *Delta = SE->getMinusSCEV(DstConst, SrcConst);
   LLVM_DEBUG(dbgs() << "\t    Delta = " << *Delta << "\n");
-  NewConstraint.setLine(Coeff, Coeff, Delta, CurLoop);
+  NewConstraint.setLine(Coeff, Coeff, Delta, CurSrcLoop, CurDstLoop);
   if (Delta->isZero()) {
     Result.DV[Level].Direction &= ~Dependence::DVEntry::LT;
     Result.DV[Level].Direction &= ~Dependence::DVEntry::GT;
@@ -1418,7 +1476,8 @@ bool DependenceInfo::weakCrossingSIVtest(
 
   // We're certain that Delta > 0 and ConstCoeff > 0.
   // Check Delta/(2*ConstCoeff) against upper loop bound
-  if (const SCEV *UpperBound = collectUpperBound(CurLoop, Delta->getType())) {
+  if (const SCEV *UpperBound =
+          collectUpperBound(CurSrcLoop, Delta->getType())) {
     LLVM_DEBUG(dbgs() << "\t    UpperBound = " << *UpperBound << "\n");
     const SCEV *ConstantTwo = SE->getConstant(UpperBound->getType(), 2);
     const SCEV *ML = SE->getMulExpr(SE->getMulExpr(ConstCoeff, UpperBound),
@@ -1472,7 +1531,6 @@ bool DependenceInfo::weakCrossingSIVtest(
   return false;
 }
 
-
 // Kirch's algorithm, from
 //
 //        Optimizing Supercompilers for Supercomputers
@@ -1558,7 +1616,8 @@ static APInt ceilingOfQuotient(const APInt &A, const APInt &B) {
 // returns all the dependencies that exist between Dst and Src.
 bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
                                   const SCEV *SrcConst, const SCEV *DstConst,
-                                  const Loop *CurLoop, unsigned Level,
+                                  const Loop *CurSrcLoop,
+                                  const Loop *CurDstLoop, unsigned Level,
                                   FullDependence &Result,
                                   Constraint &NewConstraint) const {
   LLVM_DEBUG(dbgs() << "\tExact SIV test\n");
@@ -1573,7 +1632,7 @@ bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
   const SCEV *Delta = SE->getMinusSCEV(DstConst, SrcConst);
   LLVM_DEBUG(dbgs() << "\t    Delta = " << *Delta << "\n");
   NewConstraint.setLine(SrcCoeff, SE->getNegativeSCEV(DstCoeff), Delta,
-                        CurLoop);
+                        CurSrcLoop, CurDstLoop);
   const SCEVConstant *ConstDelta = dyn_cast<SCEVConstant>(Delta);
   const SCEVConstant *ConstSrcCoeff = dyn_cast<SCEVConstant>(SrcCoeff);
   const SCEVConstant *ConstDstCoeff = dyn_cast<SCEVConstant>(DstCoeff);
@@ -1600,7 +1659,7 @@ bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
   bool UMValid = false;
   // UM is perhaps unavailable, let's check
   if (const SCEVConstant *CUB =
-          collectConstantUpperBound(CurLoop, Delta->getType())) {
+          collectConstantUpperBound(CurSrcLoop, Delta->getType())) {
     UM = CUB->getAPInt();
     LLVM_DEBUG(dbgs() << "\t    UM = " << UM << "\n");
     UMValid = true;
@@ -1707,7 +1766,6 @@ bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
   return Result.DV[Level].Direction == Dependence::DVEntry::NONE;
 }
 
-
 // Return true if the divisor evenly divides the dividend.
 static
 bool isRemainderZero(const SCEVConstant *Dividend,
@@ -1749,12 +1807,10 @@ bool isRemainderZero(const SCEVConstant *Dividend,
 // (see also weakZeroDstSIVtest)
 //
 // Return true if dependence disproved.
-bool DependenceInfo::weakZeroSrcSIVtest(const SCEV *DstCoeff,
-                                        const SCEV *SrcConst,
-                                        const SCEV *DstConst,
-                                        const Loop *CurLoop, unsigned Level,
-                                        FullDependence &Result,
-                                        Constraint &NewConstraint) const {
+bool DependenceInfo::weakZeroSrcSIVtest(
+    const SCEV *DstCoeff, const SCEV *SrcConst, const SCEV *DstConst,
+    const Loop *CurSrcLoop, const Loop *CurDstLoop, unsigned Level,
+    FullDependence &Result, Constraint &NewConstraint) const {
   // For the WeakSIV test, it's possible the loop isn't common to
   // the Src and Dst loops. If it isn't, then there's no need to
   // record a direction.
@@ -1768,7 +1824,7 @@ bool DependenceInfo::weakZeroSrcSIVtest(const SCEV *DstCoeff,
   Result.Consistent = false;
   const SCEV *Delta = SE->getMinusSCEV(SrcConst, DstConst);
   NewConstraint.setLine(SE->getZero(Delta->getType()), DstCoeff, Delta,
-                        CurLoop);
+                        CurSrcLoop, CurDstLoop);
   LLVM_DEBUG(dbgs() << "\t    Delta = " << *Delta << "\n");
   if (isKnownPredicate(CmpInst::ICMP_EQ, SrcConst, DstConst)) {
     if (Level < CommonLevels) {
@@ -1789,7 +1845,8 @@ bool DependenceInfo::weakZeroSrcSIVtest(const SCEV *DstCoeff,
 
   // check that Delta/SrcCoeff < iteration count
   // really check NewDelta < count*AbsCoeff
-  if (const SCEV *UpperBound = collectUpperBound(CurLoop, Delta->getType())) {
+  if (const SCEV *UpperBound =
+          collectUpperBound(CurSrcLoop, Delta->getType())) {
     LLVM_DEBUG(dbgs() << "\t    UpperBound = " << *UpperBound << "\n");
     const SCEV *Product = SE->getMulExpr(AbsCoeff, UpperBound);
     if (isKnownPredicate(CmpInst::ICMP_SGT, NewDelta, Product)) {
@@ -1827,7 +1884,6 @@ bool DependenceInfo::weakZeroSrcSIVtest(const SCEV *DstCoeff,
   return false;
 }
 
-
 // weakZeroDstSIVtest -
 // From the paper, Practical Dependence Testing, Section 4.2.2
 //
@@ -1859,12 +1915,10 @@ bool DependenceInfo::weakZeroSrcSIVtest(const SCEV *DstCoeff,
 // (see also weakZeroSrcSIVtest)
 //
 // Return true if dependence disproved.
-bool DependenceInfo::weakZeroDstSIVtest(const SCEV *SrcCoeff,
-                                        const SCEV *SrcConst,
-                                        const SCEV *DstConst,
-                                        const Loop *CurLoop, unsigned Level,
-                                        FullDependence &Result,
-                                        Constraint &NewConstraint) const {
+bool DependenceInfo::weakZeroDstSIVtest(
+    const SCEV *SrcCoeff, const SCEV *SrcConst, const SCEV *DstConst,
+    const Loop *CurSrcLoop, const Loop *CurDstLoop, unsigned Level,
+    FullDependence &Result, Constraint &NewConstraint) const {
   // For the WeakSIV test, it's possible the loop isn't common to the
   // Src and Dst loops. If it isn't, then there's no need to record a direction.
   LLVM_DEBUG(dbgs() << "\tWeak-Zero (dst) SIV test\n");
@@ -1877,7 +1931,7 @@ bool DependenceInfo::weakZeroDstSIVtest(const SCEV *SrcCoeff,
   Result.Consistent = false;
   const SCEV *Delta = SE->getMinusSCEV(DstConst, SrcConst);
   NewConstraint.setLine(SrcCoeff, SE->getZero(Delta->getType()), Delta,
-                        CurLoop);
+                        CurSrcLoop, CurDstLoop);
   LLVM_DEBUG(dbgs() << "\t    Delta = " << *Delta << "\n");
   if (isKnownPredicate(CmpInst::ICMP_EQ, DstConst, SrcConst)) {
     if (Level < CommonLevels) {
@@ -1898,7 +1952,8 @@ bool DependenceInfo::weakZeroDstSIVtest(const SCEV *SrcCoeff,
 
   // check that Delta/SrcCoeff < iteration count
   // really check NewDelta < count*AbsCoeff
-  if (const SCEV *UpperBound = collectUpperBound(CurLoop, Delta->getType())) {
+  if (const SCEV *UpperBound =
+          collectUpperBound(CurSrcLoop, Delta->getType())) {
     LLVM_DEBUG(dbgs() << "\t    UpperBound = " << *UpperBound << "\n");
     const SCEV *Product = SE->getMulExpr(AbsCoeff, UpperBound);
     if (isKnownPredicate(CmpInst::ICMP_SGT, NewDelta, Product)) {
@@ -1936,7 +1991,6 @@ bool DependenceInfo::weakZeroDstSIVtest(const SCEV *SrcCoeff,
   return false;
 }
 
-
 // exactRDIVtest - Tests the RDIV subscript pair for dependence.
 // Things of the form [c1 + a*i] and [c2 + b*j],
 // where i and j are induction variable, c1 and c2 are loop invariant,
@@ -2228,43 +2282,47 @@ bool DependenceInfo::testSIV(const SCEV *Src, const SCEV *Dst, unsigned &Level,
     const SCEV *DstConst = DstAddRec->getStart();
     const SCEV *SrcCoeff = SrcAddRec->getStepRecurrence(*SE);
     const SCEV *DstCoeff = DstAddRec->getStepRecurrence(*SE);
-    const Loop *CurLoop = SrcAddRec->getLoop();
-    assert(CurLoop == DstAddRec->getLoop() &&
-           "both loops in SIV should be same");
-    Level = mapSrcLoop(CurLoop);
+    const Loop *CurSrcLoop = SrcAddRec->getLoop();
+    const Loop *CurDstLoop = DstAddRec->getLoop();
+    assert(areLoopsSimilar(CurSrcLoop, CurDstLoop) &&
+           "both loops in SIV should be the same or have the same tripcount "
+           "and depth");
+    Level = mapSrcLoop(CurSrcLoop);
     bool disproven;
     if (SrcCoeff == DstCoeff)
-      disproven = strongSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop,
-                                Level, Result, NewConstraint);
+      disproven = strongSIVtest(SrcCoeff, SrcConst, DstConst, CurSrcLoop,
+                                CurDstLoop, Level, Result, NewConstraint);
     else if (SrcCoeff == SE->getNegativeSCEV(DstCoeff))
-      disproven = weakCrossingSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop,
-                                      Level, Result, NewConstraint, SplitIter);
+      disproven = weakCrossingSIVtest(SrcCoeff, SrcConst, DstConst, CurSrcLoop,
+                                      CurDstLoop, Level, Result, NewConstraint,
+                                      SplitIter);
     else
-      disproven = exactSIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, CurLoop,
-                               Level, Result, NewConstraint);
-    return disproven ||
-      gcdMIVtest(Src, Dst, Result) ||
-      symbolicRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, CurLoop, CurLoop);
+      disproven =
+          exactSIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, CurSrcLoop,
+                       CurDstLoop, Level, Result, NewConstraint);
+    return disproven || gcdMIVtest(Src, Dst, Result) ||
+           symbolicRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, CurSrcLoop,
+                            CurDstLoop);
   }
   if (SrcAddRec) {
     const SCEV *SrcConst = SrcAddRec->getStart();
     const SCEV *SrcCoeff = SrcAddRec->getStepRecurrence(*SE);
     const SCEV *DstConst = Dst;
-    const Loop *CurLoop = SrcAddRec->getLoop();
-    Level = mapSrcLoop(CurLoop);
-    return weakZeroDstSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop,
-                              Level, Result, NewConstraint) ||
-      gcdMIVtest(Src, Dst, Result);
+    const Loop *CurSrcLoop = SrcAddRec->getLoop();
+    Level = mapSrcLoop(CurSrcLoop);
+    return weakZeroDstSIVtest(SrcCoeff, SrcConst, DstConst, CurSrcLoop,
+                              CurSrcLoop, Level, Result, NewConstraint) ||
+           gcdMIVtest(Src, Dst, Result);
   }
   if (DstAddRec) {
     const SCEV *DstConst = DstAddRec->getStart();
     const SCEV *DstCoeff = DstAddRec->getStepRecurrence(*SE);
     const SCEV *SrcConst = Src;
-    const Loop *CurLoop = DstAddRec->getLoop();
-    Level = mapDstLoop(CurLoop);
-    return weakZeroSrcSIVtest(DstCoeff, SrcConst, DstConst,
-                              CurLoop, Level, Result, NewConstraint) ||
-      gcdMIVtest(Src, Dst, Result);
+    const Loop *CurDstLoop = DstAddRec->getLoop();
+    Level = mapDstLoop(CurDstLoop);
+    return weakZeroSrcSIVtest(DstCoeff, SrcConst, DstConst, CurDstLoop,
+                              CurDstLoop, Level, Result, NewConstraint) ||
+           gcdMIVtest(Src, Dst, Result);
   }
   llvm_unreachable("SIV test expected at least one AddRec");
   return false;
@@ -3169,19 +3227,20 @@ bool DependenceInfo::propagate(const SCEV *&Src, const SCEV *&Dst,
 bool DependenceInfo::propagateDistance(const SCEV *&Src, const SCEV *&Dst,
                                        Constraint &CurConstraint,
                                        bool &Consistent) {
-  const Loop *CurLoop = CurConstraint.getAssociatedLoop();
+  const Loop *CurSrcLoop = CurConstraint.getAssociatedSrcLoop();
+  const Loop *CurDstLoop = CurConstraint.getAssociatedDstLoop();
   LLVM_DEBUG(dbgs() << "\t\tSrc is " << *Src << "\n");
-  const SCEV *A_K = findCoefficient(Src, CurLoop);
+  const SCEV *A_K = findCoefficient(Src, CurSrcLoop);
   if (A_K->isZero())
     return false;
   const SCEV *DA_K = SE->getMulExpr(A_K, CurConstraint.getD());
   Src = SE->getMinusSCEV(Src, DA_K);
-  Src = zeroCoefficient(Src, CurLoop);
+  Src = zeroCoefficient(Src, CurSrcLoop);
   LLVM_DEBUG(dbgs() << "\t\tnew Src is " << *Src << "\n");
   LLVM_DEBUG(dbgs() << "\t\tDst is " << *Dst << "\n");
-  Dst = addToCoefficient(Dst, CurLoop, SE->getNegativeSCEV(A_K));
+  Dst = addToCoefficient(Dst, CurDstLoop, SE->getNegativeSCEV(A_K));
   LLVM_DEBUG(dbgs() << "\t\tnew Dst is " << *Dst << "\n");
-  if (!findCoefficient(Dst, CurLoop)->isZero())
+  if (!findCoefficient(Dst, CurDstLoop)->isZero())
     Consistent = false;
   return true;
 }
@@ -3195,7 +3254,8 @@ bool DependenceInfo::propagateDistance(const SCEV *&Src, const SCEV *&Dst,
 bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst,
                                    Constraint &CurConstraint,
                                    bool &Consistent) {
-  const Loop *CurLoop = CurConstraint.getAssociatedLoop();
+  const Loop *CurSrcLoop = CurConstraint.getAssociatedSrcLoop();
+  const Loop *CurDstLoop = CurConstraint.getAssociatedDstLoop();
   const SCEV *A = CurConstraint.getA();
   const SCEV *B = CurConstraint.getB();
   const SCEV *C = CurConstraint.getC();
@@ -3211,11 +3271,11 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst,
     APInt Charlie = Cconst->getAPInt();
     APInt CdivB = Charlie.sdiv(Beta);
     assert(Charlie.srem(Beta) == 0 && "C should be evenly divisible by B");
-    const SCEV *AP_K = findCoefficient(Dst, CurLoop);
+    const SCEV *AP_K = findCoefficient(Dst, CurDstLoop);
     //    Src = SE->getAddExpr(Src, SE->getMulExpr(AP_K, SE->getConstant(CdivB)));
     Src = SE->getMinusSCEV(Src, SE->getMulExpr(AP_K, SE->getConstant(CdivB)));
-    Dst = zeroCoefficient(Dst, CurLoop);
-    if (!findCoefficient(Src, CurLoop)->isZero())
+    Dst = zeroCoefficient(Dst, CurDstLoop);
+    if (!findCoefficient(Src, CurSrcLoop)->isZero())
       Consistent = false;
   }
   else if (B->isZero()) {
@@ -3226,10 +3286,10 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst,
     APInt Charlie = Cconst->getAPInt();
     APInt CdivA = Charlie.sdiv(Alpha);
     assert(Charlie.srem(Alpha) == 0 && "C should be evenly divisible by A");
-    const SCEV *A_K = findCoefficient(Src, CurLoop);
+    const SCEV *A_K = findCoefficient(Src, CurSrcLoop);
     Src = SE->getAddExpr(Src, SE->getMulExpr(A_K, SE->getConstant(CdivA)));
-    Src = zeroCoefficient(Src, CurLoop);
-    if (!findCoefficient(Dst, CurLoop)->isZero())
+    Src = zeroCoefficient(Src, CurSrcLoop);
+    if (!findCoefficient(Dst, CurDstLoop)->isZero())
       Consistent = false;
   }
   else if (isKnownPredicate(CmpInst::ICMP_EQ, A, B)) {
@@ -3240,22 +3300,22 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst,
     APInt Charlie = Cconst->getAPInt();
     APInt CdivA = Charlie.sdiv(Alpha);
     assert(Charlie.srem(Alpha) == 0 && "C should be evenly divisible by A");
-    const SCEV *A_K = findCoefficient(Src, CurLoop);
+    const SCEV *A_K = findCoefficient(Src, CurSrcLoop);
     Src = SE->getAddExpr(Src, SE->getMulExpr(A_K, SE->getConstant(CdivA)));
-    Src = zeroCoefficient(Src, CurLoop);
-    Dst = addToCoefficient(Dst, CurLoop, A_K);
-    if (!findCoefficient(Dst, CurLoop)->isZero())
+    Src = zeroCoefficient(Src, CurSrcLoop);
+    Dst = addToCoefficient(Dst, CurDstLoop, A_K);
+    if (!findCoefficient(Dst, CurDstLoop)->isZero())
       Consistent = false;
   }
   else {
     // paper is incorrect here, or perhaps just misleading
-    const SCEV *A_K = findCoefficient(Src, CurLoop);
+    const SCEV *A_K = findCoefficient(Src, CurSrcLoop);
     Src = SE->getMulExpr(Src, A);
     Dst = SE->getMulExpr(Dst, A);
     Src = SE->getAddExpr(Src, SE->getMulExpr(A_K, C));
-    Src = zeroCoefficient(Src, CurLoop);
-    Dst = addToCoefficient(Dst, CurLoop, SE->getMulExpr(A_K, B));
-    if (!findCoefficient(Dst, CurLoop)->isZero())
+    Src = zeroCoefficient(Src, CurSrcLoop);
+    Dst = addToCoefficient(Dst, CurDstLoop, SE->getMulExpr(A_K, B));
+    if (!findCoefficient(Dst, CurDstLoop)->isZero())
       Consistent = false;
   }
   LLVM_DEBUG(dbgs() << "\t\tnew Src = " << *Src << "\n");
@@ -3269,17 +3329,18 @@ bool DependenceInfo::propagateLine(const SCEV *&Src, const SCEV *&Dst,
 // Return true if some simplification occurs.
 bool DependenceInfo::propagatePoint(const SCEV *&Src, const SCEV *&Dst,
                                     Constraint &CurConstraint) {
-  const Loop *CurLoop = CurConstraint.getAssociatedLoop();
-  const SCEV *A_K = findCoefficient(Src, CurLoop);
-  const SCEV *AP_K = findCoefficient(Dst, CurLoop);
+  const Loop *CurSrcLoop = CurConstraint.getAssociatedSrcLoop();
+  const Loop *CurDstLoop = CurConstraint.getAssociatedDstLoop();
+  const SCEV *A_K = findCoefficient(Src, CurSrcLoop);
+  const SCEV *AP_K = findCoefficient(Dst, CurDstLoop);
   const SCEV *XA_K = SE->getMulExpr(A_K, CurConstraint.getX());
   const SCEV *YAP_K = SE->getMulExpr(AP_K, CurConstraint.getY());
   LLVM_DEBUG(dbgs() << "\t\tSrc is " << *Src << "\n");
   Src = SE->getAddExpr(Src, SE->getMinusSCEV(XA_K, YAP_K));
-  Src = zeroCoefficient(Src, CurLoop);
+  Src = zeroCoefficient(Src, CurSrcLoop);
   LLVM_DEBUG(dbgs() << "\t\tnew Src is " << *Src << "\n");
   LLVM_DEBUG(dbgs() << "\t\tDst is " << *Dst << "\n");
-  Dst = zeroCoefficient(Dst, CurLoop);
+  Dst = zeroCoefficient(Dst, CurDstLoop);
   LLVM_DEBUG(dbgs() << "\t\tnew Dst is " << *Dst << "\n");
   return true;
 }
@@ -3615,14 +3676,6 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst) {
     break; // The underlying objects alias; test accesses for dependence.
   }
 
-  // establish loop nesting levels
-  establishNestingLevels(Src, Dst);
-  LLVM_DEBUG(dbgs() << "    common nesting levels = " << CommonLevels << "\n");
-  LLVM_DEBUG(dbgs() << "    maximum nesting levels = " << MaxLevels << "\n");
-
-  FullDependence Result(Src, Dst, PossiblyLoopIndependent, CommonLevels);
-  ++TotalArrayPairs;
-
   unsigned Pairs = 1;
   SmallVector<Subscript, 2> Pair(Pairs);
   const SCEV *SrcSCEV = SE->getSCEV(SrcPtr);
@@ -3649,6 +3702,46 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst) {
     }
   }
 
+  // Establish loop nesting levels considering separate but similar loops as
+  // common
+  establishNestingLevels(Src, Dst);
+
+  LLVM_DEBUG(dbgs() << "    common nesting levels = " << CommonLevels << "\n");
+  LLVM_DEBUG(dbgs() << "    maximum nesting levels = " << MaxLevels << "\n");
+  LLVM_DEBUG(dbgs() << "    separate nesting levels = " << SeparateLevels
+                    << "\n");
+
+  // Modify common levels to consider the similar separate levels in the tests
+  CommonLevels += SeparateLevels;
+  MaxLevels -= SeparateLevels;
+  if (SeparateLevels > 0) {
+    // Not all tests are handled yet over separate loops
+    // Revoke if there are any tests other than ZIV, SIV or RDIV
+    for (unsigned P = 0; P < Pairs; ++P) {
+      Pair[P].Loops.resize(MaxLevels + 1);
+      Subscript::ClassificationKind TestClass = classifyPair(
+          Pair[P].Src, LI->getLoopFor(Src->getParent()), Pair[P].Dst,
+          LI->getLoopFor(Dst->getParent()), Pair[P].Loops);
+
+      if (TestClass != Subscript::ZIV && TestClass != Subscript::SIV &&
+          TestClass != Subscript::RDIV) {
+        // Revert the levels to not consider the separate levels
+        CommonLevels -= SeparateLevels;
+        MaxLevels += SeparateLevels;
+        SeparateLevels = 0;
+        break;
+      }
+    }
+  }
+
+  FullDependence Result(Src, Dst, PossiblyLoopIndependent, CommonLevels);
+  ++TotalArrayPairs;
+
+  if (SeparateLevels > 0) {
+    Result.Consistent = false;
+    SeparateLoopsConsidered++;
+  }
+
   for (unsigned P = 0; P < Pairs; ++P) {
     Pair[P].Loops.resize(MaxLevels + 1);
     Pair[P].GroupLoops.resize(MaxLevels + 1);
@@ -3938,6 +4031,25 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst) {
     if (CompleteLoops[II])
       Result.DV[II - 1].Scalar = false;
 
+  if (SeparateLevels > 0) {
+    // Extracting separate levels from the common levels
+    // Reverting CommonLevels and MaxLevels to their original values
+    assert(CommonLevels >= SeparateLevels);
+    CommonLevels -= SeparateLevels;
+    MaxLevels += SeparateLevels;
+    std::unique_ptr<FullDependence::DVEntry[]> DV, DVSeparate;
+    DV = std::make_unique<FullDependence::DVEntry[]>(CommonLevels);
+    DVSeparate = std::make_unique<FullDependence::DVEntry[]>(SeparateLevels);
+    for (unsigned level = 0; level < CommonLevels; ++level)
+      DV[level] = Result.DV[level];
+    for (unsigned level = 0; level < SeparateLevels; ++level)
+      DVSeparate[level] = Result.DV[CommonLevels + level];
+    Result.DV = std::move(DV);
+    Result.DVSeparate = std::move(DVSeparate);
+    Result.Levels = CommonLevels;
+    Result.SeparateLevels = SeparateLevels;
+  }
+
   if (PossiblyLoopIndependent) {
     // Make sure the LoopIndependent flag is set correctly.
     // All directions must include equal, otherwise no
diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index 5bba3016ba4a1..03ee90807c64f 100644
--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -100,6 +100,8 @@ STATISTIC(OnlySecondCandidateIsGuarded,
           "The second candidate is guarded while the first one is not");
 STATISTIC(NumHoistedInsts, "Number of hoisted preheader instructions.");
 STATISTIC(NumSunkInsts, "Number of hoisted preheader instructions.");
+STATISTIC(NumDepSafeFused, "Number of fused loops with dependencies "
+                           "proven safe based on the dependence direction");
 
 enum FusionDependenceAnalysisChoice {
   FUSION_DEPENDENCE_ANALYSIS_SCEV,
@@ -1348,6 +1350,33 @@ struct LoopFuser {
                           << "\n");
       }
 #endif
+      unsigned Levels = DepResult->getLevels();
+      unsigned SeparateLevels = DepResult->getSeparateLevels();
+      unsigned CurLoopLevel = FC0.L->getLoopDepth();
+
+      bool OuterEqDir = true;
+      for (unsigned II = 1; II <= std::min(CurLoopLevel - 1, Levels); ++II) {
+        unsigned Direction = DepResult->getDirection(II, II > Levels);
+        if (!(Direction & Dependence::DVEntry::EQ)) {
+          // Different accesses in the outer levels of CurLoopLevel
+          OuterEqDir = false;
+          break;
+        }
+      }
+      if (!OuterEqDir || CurLoopLevel > Levels + SeparateLevels) {
+        LLVM_DEBUG(dbgs() << "Safe to fuse with no dependency\n");
+        NumDepSafeFused++;
+        return true;
+      }
+
+      assert(CurLoopLevel > Levels && "Fusion candidates are not separated");
+      unsigned CurDir = DepResult->getDirection(CurLoopLevel, true);
+      if (!(CurDir & Dependence::DVEntry::GT)) {
+        LLVM_DEBUG(dbgs() << "Safe to fuse with backward loop-carried "
+                             "dependency\n");
+        NumDepSafeFused++;
+        return true;
+      }
 
       if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor())
         LLVM_DEBUG(
diff --git a/llvm/test/Analysis/DependenceAnalysis/PreliminaryNoValidityCheckFixedSize.ll b/llvm/test/Analysis/DependenceAnalysis/PreliminaryNoValidityCheckFixedSize.ll
index 404018707c0a5..0811a56b495c4 100644
--- a/llvm/test/Analysis/DependenceAnalysis/PreliminaryNoValidityCheckFixedSize.ll
+++ b/llvm/test/Analysis/DependenceAnalysis/PreliminaryNoValidityCheckFixedSize.ll
@@ -20,7 +20,7 @@ define void @p2(i64 %n, ptr %A, ptr %B) nounwind uwtable ssp {
 ; CHECK-NEXT:  Src: store i64 %i.011, ptr %arrayidx8, align 8 --> Dst: store i64 %i.011, ptr %arrayidx8, align 8
 ; CHECK-NEXT:    da analyze - none!
 ; CHECK-NEXT:  Src: store i64 %i.011, ptr %arrayidx8, align 8 --> Dst: %0 = load i64, ptr %arrayidx17, align 8
-; CHECK-NEXT:    da analyze - flow [-3 -2]!
+; CHECK-NEXT:    da analyze - flow [-3 -2 / -1]!
 ; CHECK-NEXT:  Src: store i64 %i.011, ptr %arrayidx8, align 8 --> Dst: store i64 %0, ptr %B.addr.24, align 8
 ; CHECK-NEXT:    da analyze - confused!
 ; CHECK-NEXT:  Src: %0 = load i64, ptr %arrayidx17, align 8 --> Dst: %0 = load i64, ptr %arrayidx17, align 8
diff --git a/llvm/test/Analysis/DependenceAnalysis/SIVSeparateLoops.ll b/llvm/test/Analysis/DependenceAnalysis/SIVSeparateLoops.ll
new file mode 100644
index 0000000000000..484997b431996
--- /dev/null
+++ b/llvm/test/Analysis/DependenceAnalysis/SIVSeparateLoops.ll
@@ -0,0 +1,145 @@
+; RUN: opt < %s -disable-output "-passes=print<da>" -aa-pipeline=basic-aa 2>&1 \
+; RUN:   -da-disable-delinearization-checks | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+;;  for (long int i = 0; i < n; i++) {
+;;    for (long int j = 0; j < n; j++) {
+;;      for (long int k = 0; k < n; k++) {
+;;        for (long int l = 0; l < n; l++)
+;;          A[i][j][k][l] = i;
+;;      }
+;;      for (long int k = 1; k < n+1; k++) {
+;;        for (long int l = 0; l < n; l++)
+;;          *B++ = A[i + 4][j + 3][k + 2][l + 1];
+
+define void @SIVSeparate(i64 %n, ptr %A, ptr %B) nounwind uwtable ssp {
+entry:
+  %cmp10 = icmp sgt i64 %n, 0
+  br i1 %cmp10, label %for.cond1.preheader.preheader, label %for.end35
+  
+; CHECK-LABEL: SIVSeparate
+; CHECK: da analyze - none!
+; CHECK: da analyze - flow [-4 -3 / -3 -1]!   
+; CHECK: da analyze - confused!
+; CHECK: da analyze - none!
+; CHECK: da analyze - confused!
+; CHECK: da analyze - output [* * * *]!
+  
+for.cond1.preheader.preheader:                    ; preds = %entry
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond1.preheader.preheader, %for.inc33
+  %B.addr.014 = phi ptr [ %B.addr.1.lcssa, %for.inc33 ], [ %B, %for.cond1.preheader.preheader ]
+  %i.013 = phi i64 [ %inc34, %for.inc33 ], [ 0, %for.cond1.preheader.preheader ]
+  %cmp28 = icmp sgt i64 %n, 0
+  br i1 %cmp28, label %for.cond4.preheader.preheader, label %for.inc33
+
+for.cond4.preheader.preheader:                    ; preds = %for.cond1.preheader
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %for.cond4.preheader.preheader, %for.inc30
+  %B.addr.110 = phi ptr [ %B.addr.2.lcssa, %for.inc30 ], [ %B.addr.014, %for.cond4.preheader.preheader ]
+  %j.09 = phi i64 [ %inc31, %for.inc30 ], [ 0, %for.cond4.preheader.preheader ]
+  %cmp53 = icmp sgt i64 %n, 0
+  br i1 %cmp53, label %for.cond7.preheader.preheader, label %for.cond15.loopexit
+
+for.cond7.preheader.preheader:                    ; preds = %for.cond4.preheader
+  br label %for.cond7.preheader
+
+for.cond7.preheader:                              ; preds = %for.cond7.preheader.preheader, %for.inc12
+  %k.07 = phi i64 [ %inc13, %for.inc12 ], [ 0, %for.cond7.preheader.preheader ]
+  %cmp81 = icmp sgt i64 %n, 0
+  br i1 %cmp81, label %for.body9.preheader, label %for.inc12
+
+for.body9.preheader:                              ; preds = %for.cond7.preheader
+  br label %for.body9
+
+for.body9:                                        ; preds = %for.body9.preheader, %for.body9
+  %l.02 = phi i64 [ %inc11, %for.body9 ], [ 0, %for.body9.preheader ]
+  %arrayidx12 = getelementptr inbounds [100 x [100 x [100 x i64]]], ptr %A, i64 %i.013, i64 %j.09, i64 %k.07, i64 %l.02
+  store i64 %i.013, ptr %arrayidx12, align 8
+  %inc11 = add nsw i64 %l.02, 1
+  %exitcond15 = icmp ne i64 %inc11, %n
+  br i1 %exitcond15, label %for.body9, label %for.inc12.loopexit
+
+for.inc12.loopexit:                               ; preds = %for.body9
+  br label %for.inc12
+
+for.inc12:                                        ; preds = %for.inc12.loopexit, %for.cond7.preheader
+  %inc13 = add nsw i64 %k.07, 1
+  %exitcond16 = icmp ne i64 %inc13, %n
+  br i1 %exitcond16, label %for.cond7.preheader, label %for.cond15.loopexit.loopexit
+
+for.cond15.loopexit.loopexit:                     ; preds = %for.inc12
+  br label %for.cond15.loopexit
+
+for.cond15.loopexit:                              ; preds = %for.cond15.loopexit.loopexit, %for.cond4.preheader
+  %cmp163 = icmp sgt i64 %n, 0
+  br i1 %cmp163, label %for.cond18.preheader.preheader, label %for.inc30
+
+for.cond18.preheader.preheader:                   ; preds = %for.cond15.loopexit
+  br label %for.cond18.preheader
+
+for.cond18.preheader:                             ; preds = %for.cond18.preheader.preheader, %for.inc27
+  %k14.06 = phi i64 [ %inc28, %for.inc27 ], [ 1, %for.cond18.preheader.preheader ]
+  %B.addr.25 = phi ptr [ %B.addr.3.lcssa, %for.inc27 ], [ %B.addr.110, %for.cond18.preheader.preheader ]
+  %cmp191 = icmp sgt i64 %n, 0
+  br i1 %cmp191, label %for.body20.preheader, label %for.inc27
+
+for.body20.preheader:                             ; preds = %for.cond18.preheader
+  br label %for.body20
+
+for.body20:                                       ; preds = %for.body20.preheader, %for.body20
+  %l17.04 = phi i64 [ %inc25, %for.body20 ], [ 0, %for.body20.preheader ]
+  %B.addr.34 = phi ptr [ %incdec.ptr, %for.body20 ], [ %B.addr.25, %for.body20.preheader ]
+  %add = add nsw i64 %l17.04, 1
+  %add21 = add nsw i64 %k14.06, 2
+  %add22 = add nsw i64 %j.09, 3
+  %add23 = add nsw i64 %i.013, 4
+  %arrayidx24 = getelementptr inbounds [100 x [100 x [100 x i64]]], ptr %A, i64 %add23, i64 %add22, i64 %add21, i64 %add
+  %0 = load i64, ptr %arrayidx24, align 8
+  %incdec.ptr = getelementptr inbounds i64, ptr %B.addr.34, i64 1
+  store i64 %0, ptr %B.addr.34, align 8
+  %inc25 = add nsw i64 %l17.04, 1
+  %exitcond = icmp ne i64 %inc25, %n
+  br i1 %exitcond, label %for.body20, label %for.inc27.loopexit
+
+for.inc27.loopexit:                               ; preds = %for.body20
+  %scevgep = getelementptr i64, ptr %B.addr.25, i64 %n
+  br label %for.inc27
+
+for.inc27:                                        ; preds = %for.inc27.loopexit, %for.cond18.preheader
+  %B.addr.3.lcssa = phi ptr [ %B.addr.25, %for.cond18.preheader ], [ %scevgep, %for.inc27.loopexit ]
+  %inc28 = add nsw i64 %k14.06, 1
+  %inc29 = add nsw i64 %n, 1
+  %exitcond17 = icmp ne i64 %inc28, %inc29
+  br i1 %exitcond17, label %for.cond18.preheader, label %for.inc30.loopexit
+
+for.inc30.loopexit:                               ; preds = %for.inc27
+  %B.addr.3.lcssa.lcssa = phi ptr [ %B.addr.3.lcssa, %for.inc27 ]
+  br label %for.inc30
+
+for.inc30:                                        ; preds = %for.inc30.loopexit, %for.cond15.loopexit
+  %B.addr.2.lcssa = phi ptr [ %B.addr.110, %for.cond15.loopexit ], [ %B.addr.3.lcssa.lcssa, %for.inc30.loopexit ]
+  %inc31 = add nsw i64 %j.09, 1
+  %exitcond18 = icmp ne i64 %inc31, %n
+  br i1 %exitcond18, label %for.cond4.preheader, label %for.inc33.loopexit
+
+for.inc33.loopexit:                               ; preds = %for.inc30
+  %B.addr.2.lcssa.lcssa = phi ptr [ %B.addr.2.lcssa, %for.inc30 ]
+  br label %for.inc33
+
+for.inc33:                                        ; preds = %for.inc33.loopexit, %for.cond1.preheader
+  %B.addr.1.lcssa = phi ptr [ %B.addr.014, %for.cond1.preheader ], [ %B.addr.2.lcssa.lcssa, %for.inc33.loopexit ]
+  %inc34 = add nsw i64 %i.013, 1
+  %exitcond19 = icmp ne i64 %inc34, %n
+  br i1 %exitcond19, label %for.cond1.preheader, label %for.end35.loopexit
+
+for.end35.loopexit:                               ; preds = %for.inc33
+  br label %for.end35
+
+for.end35:                                        ; preds = %for.end35.loopexit, %entry
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopFusion/backward_loop_carried.ll b/llvm/test/Transforms/LoopFusion/backward_loop_carried.ll
new file mode 100644
index 0000000000000..d9759f7840862
--- /dev/null
+++ b/llvm/test/Transforms/LoopFusion/backward_loop_carried.ll
@@ -0,0 +1,185 @@
+; RUN: opt -S -passes=loop-fusion -da-disable-delinearization-checks < %s | FileCheck %s
+
+; The two inner loops have no dependency and are allowed to be fused as in the
+; outer loops, different levels are accessed to.
+
+; C Code
+;
+;;  for (long int i = 0; i < n; i++) {
+;;    for (long int j = 0; j < n; j++) {
+;;      for (long int k = 0; k < n; k++) {
+;;        A[i][j][k] = i;
+;;      }
+;;      for (long int k = 0; k < n; k++) {
+;;        temp = A[i + 3][j + 2][k + 1];
+
+define void @backward_dep0(i64 %n, ptr %A) nounwind uwtable ssp {
+entry:
+  %cmp10 = icmp sgt i64 %n, 0
+  br i1 %cmp10, label %for.cond1.preheader.preheader, label %for.end26
+
+; CHECK-LABEL: backward_dep
+; CHECK-COUNT-1: for.body{{[0-9]+}}:
+; CHECK-NOT:     for.body{{[0-9]+}}:
+
+for.cond1.preheader.preheader:                    ; preds = %entry
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond1.preheader.preheader, %for.inc24
+  %i.011 = phi i64 [ %inc25, %for.inc24 ], [ 0, %for.cond1.preheader.preheader ]
+  %cmp26 = icmp sgt i64 %n, 0
+  br i1 %cmp26, label %for.cond4.preheader.preheader, label %for.inc24
+
+for.cond4.preheader.preheader:                    ; preds = %for.cond1.preheader
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %for.cond4.preheader.preheader, %for.inc21
+  %j.07 = phi i64 [ %inc22, %for.inc21 ], [ 0, %for.cond4.preheader.preheader ]
+  %cmp51 = icmp sgt i64 %n, 0
+  br i1 %cmp51, label %for.body6.preheader, label %for.cond10.loopexit
+
+for.body6.preheader:                              ; preds = %for.cond4.preheader
+  br label %for.body6
+
+for.body6:                                        ; preds = %for.body6.preheader, %for.body6
+  %k.02 = phi i64 [ %inc, %for.body6 ], [ 0, %for.body6.preheader ]
+  %arrayidx8 = getelementptr inbounds [100 x [100 x i64]], ptr %A, i64 %i.011, i64 %j.07, i64 %k.02
+  store i64 %i.011, ptr %arrayidx8, align 8
+  %inc = add nsw i64 %k.02, 1
+  %exitcond13 = icmp ne i64 %inc, %n
+  br i1 %exitcond13, label %for.body6, label %for.cond10.loopexit.loopexit
+
+for.cond10.loopexit.loopexit:                     ; preds = %for.body6
+  br label %for.cond10.loopexit
+
+for.cond10.loopexit:                              ; preds = %for.cond10.loopexit.loopexit, %for.cond4.preheader
+  %cmp113 = icmp sgt i64 %n, 0
+  br i1 %cmp113, label %for.body12.preheader, label %for.inc21
+
+for.body12.preheader:                             ; preds = %for.cond10.loopexit
+  br label %for.body12
+
+for.body12:                                       ; preds = %for.body12.preheader, %for.body12
+  %k9.05 = phi i64 [ %inc19, %for.body12 ], [ 0, %for.body12.preheader ]
+  %add = add nsw i64 %k9.05, 1
+  %add13 = add nsw i64 %j.07, 2
+  %add14 = add nsw i64 %i.011, 3
+  %arrayidx17 = getelementptr inbounds [100 x [100 x i64]], ptr %A, i64 %add14, i64 %add13, i64 %add
+  %0 = load i64, ptr %arrayidx17, align 8
+  %inc19 = add nsw i64 %k9.05, 1
+  %exitcond = icmp ne i64 %inc19, %n
+  br i1 %exitcond, label %for.body12, label %for.inc21.loopexit
+
+for.inc21.loopexit:                               ; preds = %for.body12
+  br label %for.inc21
+
+for.inc21:                                        ; preds = %for.inc21.loopexit, %for.cond10.loopexit
+  %inc22 = add nsw i64 %j.07, 1
+  %exitcond14 = icmp ne i64 %inc22, %n
+  br i1 %exitcond14, label %for.cond4.preheader, label %for.inc24.loopexit
+
+for.inc24.loopexit:                               ; preds = %for.inc21
+  br label %for.inc24
+
+for.inc24:                                        ; preds = %for.inc24.loopexit, %for.cond1.preheader
+  %inc25 = add nsw i64 %i.011, 1
+  %exitcond15 = icmp ne i64 %inc25, %n
+  br i1 %exitcond15, label %for.cond1.preheader, label %for.end26.loopexit
+
+for.end26.loopexit:                               ; preds = %for.inc24
+  br label %for.end26
+
+for.end26:                                        ; preds = %for.end26.loopexit, %entry
+  ret void
+}
+
+; The two inner loops have a backward loop-carried dependency, allowing them
+; to be fused.
+
+; C Code
+;
+;;  for (long int i = 0; i < n; i++) {
+;;    for (long int j = 0; j < n; j++) {
+;;      for (long int k = 0; k < n; k++) {
+;;        A[i][j][k] = i;
+;;      }
+;;      for (long int k = 0; k < n; k++) {
+;;        temp = A[i][j][k - 1];
+
+define void @backward_dep1(i64 %n, ptr %A) nounwind uwtable ssp {
+entry:
+  %cmp10 = icmp sgt i64 %n, 0
+  br i1 %cmp10, label %for.cond1.preheader.preheader, label %for.end26
+
+; CHECK-LABEL: backward_dep
+; CHECK-COUNT-1: for.body{{[0-9]+}}:
+; CHECK-NOT:     for.body{{[0-9]+}}:
+
+for.cond1.preheader.preheader:                    ; preds = %entry
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond1.preheader.preheader, %for.inc24
+  %i.011 = phi i64 [ %inc25, %for.inc24 ], [ 0, %for.cond1.preheader.preheader ]
+  %cmp26 = icmp sgt i64 %n, 0
+  br i1 %cmp26, label %for.cond4.preheader.preheader, label %for.inc24
+
+for.cond4.preheader.preheader:                    ; preds = %for.cond1.preheader
+  br label %for.cond4.preheader
+
+for.cond4.preheader:                              ; preds = %for.cond4.preheader.preheader, %for.inc21
+  %j.07 = phi i64 [ %inc22, %for.inc21 ], [ 0, %for.cond4.preheader.preheader ]
+  %cmp51 = icmp sgt i64 %n, 0
+  br i1 %cmp51, label %for.body6.preheader, label %for.cond10.loopexit
+
+for.body6.preheader:                              ; preds = %for.cond4.preheader
+  br label %for.body6
+
+for.body6:                                        ; preds = %for.body6.preheader, %for.body6
+  %k.02 = phi i64 [ %inc, %for.body6 ], [ 0, %for.body6.preheader ]
+  %arrayidx8 = getelementptr inbounds [100 x [100 x i64]], ptr %A, i64 %i.011, i64 %j.07, i64 %k.02
+  store i64 %i.011, ptr %arrayidx8, align 8
+  %inc = add nsw i64 %k.02, 1
+  %exitcond13 = icmp ne i64 %inc, %n
+  br i1 %exitcond13, label %for.body6, label %for.cond10.loopexit.loopexit
+
+for.cond10.loopexit.loopexit:                     ; preds = %for.body6
+  br label %for.cond10.loopexit
+
+for.cond10.loopexit:                              ; preds = %for.cond10.loopexit.loopexit, %for.cond4.preheader
+  %cmp113 = icmp sgt i64 %n, 0
+  br i1 %cmp113, label %for.body12.preheader, label %for.inc21
+
+for.body12.preheader:                             ; preds = %for.cond10.loopexit
+  br label %for.body12
+
+for.body12:                                       ; preds = %for.body12.preheader, %for.body12
+  %k9.05 = phi i64 [ %inc19, %for.body12 ], [ 0, %for.body12.preheader ]
+  %add = add nsw i64 %k9.05, -1
+  %arrayidx17 = getelementptr inbounds [100 x [100 x i64]], ptr %A, i64 %i.011, i64 %j.07, i64 %add
+  %0 = load i64, ptr %arrayidx17, align 8
+  %inc19 = add nsw i64 %k9.05, 1
+  %exitcond = icmp ne i64 %inc19, %n
+  br i1 %exitcond, label %for.body12, label %for.inc21.loopexit
+
+for.inc21.loopexit:                               ; preds = %for.body12
+  br label %for.inc21
+
+for.inc21:                                        ; preds = %for.inc21.loopexit, %for.cond10.loopexit
+  %inc22 = add nsw i64 %j.07, 1
+  %exitcond14 = icmp ne i64 %inc22, %n
+  br i1 %exitcond14, label %for.cond4.preheader, label %for.inc24.loopexit
+
+for.inc24.loopexit:                               ; preds = %for.inc21
+  br label %for.inc24
+
+for.inc24:                                        ; preds = %for.inc24.loopexit, %for.cond1.preheader
+  %inc25 = add nsw i64 %i.011, 1
+  %exitcond15 = icmp ne i64 %inc25, %n
+  br i1 %exitcond15, label %for.cond1.preheader, label %for.end26.loopexit
+
+for.end26.loopexit:                               ; preds = %for.inc24
+  br label %for.end26
+
+for.end26:                                        ; preds = %for.end26.loopexit, %entry
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopFusion/simple.ll b/llvm/test/Transforms/LoopFusion/simple.ll
index d63890df14461..dfb3d13b56f04 100644
--- a/llvm/test/Transforms/LoopFusion/simple.ll
+++ b/llvm/test/Transforms/LoopFusion/simple.ll
@@ -300,40 +300,36 @@ bb23:                                             ; preds = %bb17, %bb
 
 define void @forward_dep(ptr noalias %arg) {
 ; CHECK-LABEL: @forward_dep(
-; CHECK-NEXT:  bb:
-; CHECK-NEXT:    br label [[BB7:%.*]]
+; CHECK-NEXT:  [[BB:.*]]:
+; CHECK-NEXT:    br label %[[BB7:.*]]
 ; CHECK:       bb7:
-; CHECK-NEXT:    [[DOT013:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[TMP15:%.*]], [[BB14:%.*]] ]
-; CHECK-NEXT:    [[INDVARS_IV22:%.*]] = phi i64 [ 0, [[BB]] ], [ [[INDVARS_IV_NEXT3:%.*]], [[BB14]] ]
+; CHECK-NEXT:    [[DOT013:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP15:%.*]], %[[BB25:.*]] ]
+; CHECK-NEXT:    [[INDVARS_IV22:%.*]] = phi i64 [ 0, %[[BB]] ], [ [[INDVARS_IV_NEXT3:%.*]], %[[BB25]] ]
+; CHECK-NEXT:    [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[BB25]] ], [ 0, %[[BB]] ]
 ; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[DOT013]], -3
 ; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[INDVARS_IV22]], 3
 ; CHECK-NEXT:    [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32
 ; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw i32 [[TMP]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = trunc i64 [[INDVARS_IV22]] to i32
 ; CHECK-NEXT:    [[TMP12:%.*]] = srem i32 [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARG:%.*]], i64 [[INDVARS_IV22]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 [[INDVARS_IV22]]
 ; CHECK-NEXT:    store i32 [[TMP12]], ptr [[TMP13]], align 4
-; CHECK-NEXT:    br label [[BB14]]
+; CHECK-NEXT:    br label %[[BB14:.*]]
 ; CHECK:       bb14:
-; CHECK-NEXT:    [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV22]], 1
-; CHECK-NEXT:    [[TMP15]] = add nuw nsw i32 [[DOT013]], 1
-; CHECK-NEXT:    [[EXITCOND4:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT3]], 100
-; CHECK-NEXT:    br i1 [[EXITCOND4]], label [[BB7]], label [[BB19_PREHEADER:%.*]]
-; CHECK:       bb19.preheader:
-; CHECK-NEXT:    br label [[BB19:%.*]]
-; CHECK:       bb19:
-; CHECK-NEXT:    [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[BB25:%.*]] ], [ 0, [[BB19_PREHEADER]] ]
 ; CHECK-NEXT:    [[TMP20:%.*]] = add nsw i64 [[INDVARS_IV1]], -3
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 [[TMP20]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4
 ; CHECK-NEXT:    [[TMP23:%.*]] = mul nsw i32 [[TMP22]], 3
 ; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 [[INDVARS_IV1]]
 ; CHECK-NEXT:    store i32 [[TMP23]], ptr [[TMP24]], align 4
-; CHECK-NEXT:    br label [[BB25]]
+; CHECK-NEXT:    br label %[[BB25]]
 ; CHECK:       bb25:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV22]], 1
+; CHECK-NEXT:    [[TMP15]] = add nuw nsw i32 [[DOT013]], 1
+; CHECK-NEXT:    [[EXITCOND4:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT3]], 100
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV1]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], 100
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[BB19]], label [[BB26:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[BB7]], label %[[BB26:.*]]
 ; CHECK:       bb26:
 ; CHECK-NEXT:    ret void
 ;



More information about the llvm-commits mailing list