[llvm] [MachinePipeliner] Add validation for missed dependencies (PR #135148)

Mon Apr 21 04:55:46 PDT 2025

https://github.com/kasuga-fj updated https://github.com/llvm/llvm-project/pull/135148

>From 9ae49bbf6ebba3d2b13bab434d033e8ff81eb95b Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Tue, 26 Nov 2024 14:45:58 +0900
Subject: [PATCH 1/6] [MachinePipeliner] Add validation for missed dependencies

---
 llvm/include/llvm/CodeGen/MachinePipeliner.h  | 127 ++-
 llvm/lib/CodeGen/MachinePipeliner.cpp         | 799 +++++++++++++-----
 ...instruction-scheduled-at-correct-cycle.mir |   7 +-
 .../sms-loop-carried-fp-exceptions1.mir       | 107 +++
 .../sms-loop-carried-fp-exceptions2.mir       | 100 +++
 .../test/CodeGen/Hexagon/swp-carried-dep1.mir |  29 +-
 .../test/CodeGen/Hexagon/swp-carried-dep2.mir |  28 +-
 .../test/CodeGen/Hexagon/swp-carried-dep3.mir |  14 +-
 llvm/test/CodeGen/Hexagon/swp-epilog-phi9.ll  |   8 +-
 .../Hexagon/swp-loop-carried-order-dep1.mir   | 110 +++
 .../Hexagon/swp-loop-carried-order-dep2.mir   | 104 +++
 .../Hexagon/swp-loop-carried-order-dep3.mir   | 108 +++
 .../Hexagon/swp-loop-carried-order-dep4.mir   | 107 +++
 .../Hexagon/swp-loop-carried-order-dep5.mir   | 106 +++
 .../Hexagon/swp-loop-carried-order-dep6.mir   | 153 ++++
 .../Hexagon/swp-loop-carried-unknown.ll       |   2 +-
 llvm/test/CodeGen/Hexagon/swp-phi-start.ll    |   2 +-
 llvm/test/CodeGen/Hexagon/swp-resmii-1.ll     |   2 +-
 .../test/CodeGen/Hexagon/vect/vect-shuffle.ll |   5 +-
 .../CodeGen/PowerPC/sms-store-dependence.ll   |  49 +-
 .../Thumb2/pipeliner-preserve-ties.mir        |   1 -
 21 files changed, 1645 insertions(+), 323 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sms-loop-carried-fp-exceptions1.mir
 create mode 100644 llvm/test/CodeGen/AArch64/sms-loop-carried-fp-exceptions2.mir
 create mode 100644 llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep1.mir
 create mode 100644 llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep2.mir
 create mode 100644 llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep3.mir
 create mode 100644 llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep4.mir
 create mode 100644 llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep5.mir
 create mode 100644 llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep6.mir

diff --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h
index fee6937e7d502..c8a5240447363 100644
--- a/llvm/include/llvm/CodeGen/MachinePipeliner.h
+++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h
@@ -42,6 +42,7 @@
 
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/DFAPacketizer.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
@@ -120,14 +121,17 @@ class SwingSchedulerDDGEdge {
   SUnit *Dst = nullptr;
   SDep Pred;
   unsigned Distance = 0;
+  bool IsValidationOnly = false;
 
 public:
   /// Creates an edge corresponding to an edge represented by \p PredOrSucc and
   /// \p Dep in the original DAG. This pair has no information about the
   /// direction of the edge, so we need to pass an additional argument \p
   /// IsSucc.
-  SwingSchedulerDDGEdge(SUnit *PredOrSucc, const SDep &Dep, bool IsSucc)
-      : Dst(PredOrSucc), Pred(Dep), Distance(0u) {
+  SwingSchedulerDDGEdge(SUnit *PredOrSucc, const SDep &Dep, bool IsSucc,
+                        bool IsValidationOnly)
+      : Dst(PredOrSucc), Pred(Dep), Distance(0u),
+        IsValidationOnly(IsValidationOnly) {
     SUnit *Src = Dep.getSUnit();
 
     if (IsSucc) {
@@ -188,6 +192,50 @@ class SwingSchedulerDDGEdge {
   /// functions. We ignore the back-edge recurrence in order to avoid unbounded
   /// recursion in the calculation of the ASAP, ALAP, etc functions.
   bool ignoreDependence(bool IgnoreAnti) const;
+
+  bool isValidationOnly() const { return IsValidationOnly; }
+};
+
+struct LoopCarriedEdges {
+  using OutputDep = SmallDenseMap<Register, SmallSetVector<SUnit *, 4>>;
+  using OrderDep = SmallSetVector<SUnit *, 8>;
+  using OutputDepsType = DenseMap<SUnit *, OutputDep>;
+  using OrderDepsType = DenseMap<SUnit *, OrderDep>;
+
+  OutputDepsType OutputDeps;
+  OrderDepsType OrderDeps;
+
+private:
+  DenseMap<const SUnit *, SmallPtrSet<const SUnit *, 4>> BackEdges;
+
+public:
+  const OutputDep *getOutputDepOrNull(SUnit *Key) const {
+    auto Ite = OutputDeps.find(Key);
+    if (Ite == OutputDeps.end())
+      return nullptr;
+    return &Ite->second;
+  }
+
+  const OrderDep *getOrderDepOrNull(SUnit *Key) const {
+    auto Ite = OrderDeps.find(Key);
+    if (Ite == OrderDeps.end())
+      return nullptr;
+    return &Ite->second;
+  }
+
+  bool shouldAddBackEdge(const SUnit *From, const SUnit *To) const {
+    if (From->NodeNum < To->NodeNum)
+      return false;
+    auto Ite = BackEdges.find(From);
+    if (Ite == BackEdges.end())
+      return false;
+    return Ite->second.contains(To);
+  }
+
+  void modifySUnits(std::vector<SUnit> &SUnits);
+
+  void dump(SUnit *SU, const TargetRegisterInfo *TRI,
+            const MachineRegisterInfo *MRI) const;
 };
 
 /// Represents dependencies between instructions. This class is a wrapper of
@@ -196,11 +244,33 @@ class SwingSchedulerDDGEdge {
 /// anti-dependencies in the original DAG.
 /// FIXME: Support any other loop-carried dependencies
 class SwingSchedulerDDG {
-  using EdgesType = SmallVector<SwingSchedulerDDGEdge, 4>;
+  class EdgesType {
+    SmallVector<SwingSchedulerDDGEdge, 4> Underlying;
+    unsigned LoopCarriedOrderDepsCount = 0;
+
+  public:
+    void append(const SwingSchedulerDDGEdge &Edge) {
+      bool LoopCarriedOrderDep = Edge.isOrderDep() && Edge.getDistance() != 0;
+      assert(!(LoopCarriedOrderDepsCount != 0 && !LoopCarriedOrderDep) &&
+             "Loop-carried edges should not be added to the underlying edges");
+      Underlying.push_back(Edge);
+      if (LoopCarriedOrderDep)
+        ++LoopCarriedOrderDepsCount;
+    }
+
+    ArrayRef<SwingSchedulerDDGEdge> get(bool UseLoopCarriedEdges) const {
+      ArrayRef<SwingSchedulerDDGEdge> Res = Underlying;
+      if (!UseLoopCarriedEdges)
+        Res = Res.slice(0, Underlying.size() - LoopCarriedOrderDepsCount);
+      return Res;
+    }
+  };
 
   struct SwingSchedulerDDGEdges {
     EdgesType Preds;
     EdgesType Succs;
+
+    SmallVector<SwingSchedulerDDGEdge, 4> ValidationOnlyPreds;
   };
 
   void initEdges(SUnit *SU);
@@ -211,6 +281,7 @@ class SwingSchedulerDDG {
   std::vector<SwingSchedulerDDGEdges> EdgesVec;
   SwingSchedulerDDGEdges EntrySUEdges;
   SwingSchedulerDDGEdges ExitSUEdges;
+  bool UseLoopCarriedEdges = false;
 
   void addEdge(const SUnit *SU, const SwingSchedulerDDGEdge &Edge);
 
@@ -218,11 +289,18 @@ class SwingSchedulerDDG {
   const SwingSchedulerDDGEdges &getEdges(const SUnit *SU) const;
 
 public:
-  SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU, SUnit *ExitSU);
+  SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU, SUnit *ExitSU,
+                    const LoopCarriedEdges &LCE);
+
+  ArrayRef<SwingSchedulerDDGEdge> getInEdges(const SUnit *SU) const;
 
-  const EdgesType &getInEdges(const SUnit *SU) const;
+  ArrayRef<SwingSchedulerDDGEdge> getOutEdges(const SUnit *SU) const;
 
-  const EdgesType &getOutEdges(const SUnit *SU) const;
+  bool isValidSchedule(std::vector<SUnit> &SUnits,
+                       const SMSchedule &Schedule) const;
+
+  void applyLoopCarriedEdges() { UseLoopCarriedEdges = true; }
+  void removeLoopCarriedEdges() { UseLoopCarriedEdges = false; }
 };
 
 /// This class builds the dependence graph for the instructions in a loop,
@@ -279,7 +357,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
   std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations;
 
   /// Helper class to implement Johnson's circuit finding algorithm.
-  class Circuits {
+  struct Circuits {
     std::vector<SUnit> &SUnits;
     SetVector<SUnit *> Stack;
     BitVector Blocked;
@@ -288,7 +366,6 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
     // Node to Index from ScheduleDAGTopologicalSort
     std::vector<int> *Node2Idx;
     unsigned NumPaths = 0u;
-    static unsigned MaxPaths;
 
   public:
     Circuits(std::vector<SUnit> &SUs, ScheduleDAGTopologicalSort &Topo)
@@ -300,7 +377,6 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
     }
     Circuits &operator=(const Circuits &other) = delete;
     Circuits(const Circuits &other) = delete;
-    ~Circuits() { delete Node2Idx; }
 
     /// Reset the data structures used in the circuit algorithm.
     void reset() {
@@ -310,9 +386,9 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
       NumPaths = 0;
     }
 
-    void createAdjacencyStructure(SwingSchedulerDAG *DAG);
+    void createAdjacencyStructure(const SwingSchedulerDDG *DDG);
     bool circuit(int V, int S, NodeSetType &NodeSets,
-                 const SwingSchedulerDAG *DAG, bool HasBackedge = false);
+                 const SwingSchedulerDDG *DDG, bool HasBackedge = false);
     void unblock(int U);
   };
 
@@ -366,7 +442,8 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
     return ScheduleInfo[Node->NodeNum].ZeroLatencyHeight;
   }
 
-  bool isLoopCarriedDep(const SwingSchedulerDDGEdge &Edge) const;
+  bool hasLoopCarriedMemDep(const MachineInstr *Src, const MachineInstr *Dst,
+                            BatchAAResults *BAA) const;
 
   void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule);
 
@@ -390,11 +467,11 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
 
   const SwingSchedulerDDG *getDDG() const { return DDG.get(); }
 
-  bool mayOverlapInLaterIter(const MachineInstr *BaseMI,
-                             const MachineInstr *OtherMI) const;
+  AliasResult::Kind mayOverlapInLaterIter(const MachineInstr *BaseMI,
+                                          const MachineInstr *OtherMI) const;
 
 private:
-  void addLoopCarriedDependences(AAResults *AA);
+  LoopCarriedEdges addLoopCarriedDependences(AAResults *AA);
   void updatePhiDependences();
   void changeDependences();
   unsigned calculateResMII();
@@ -440,7 +517,7 @@ class NodeSet {
   using iterator = SetVector<SUnit *>::const_iterator;
 
   NodeSet() = default;
-  NodeSet(iterator S, iterator E, const SwingSchedulerDAG *DAG)
+  NodeSet(iterator S, iterator E, const SwingSchedulerDDG *DDG)
       : Nodes(S, E), HasRecurrence(true) {
     // Calculate the latency of this node set.
     // Example to demonstrate the calculation:
@@ -456,7 +533,6 @@ class NodeSet {
     //
     // Hold a map from each SUnit in the circle to the maximum distance from the
     // source node by only considering the nodes.
-    const SwingSchedulerDDG *DDG = DAG->getDDG();
     DenseMap<SUnit *, unsigned> SUnitToDistance;
     for (auto *Node : Nodes)
       SUnitToDistance[Node] = 0;
@@ -474,23 +550,6 @@ class NodeSet {
           DV = DU + Succ.getLatency();
       }
     }
-    // Handle a back-edge in loop carried dependencies
-    SUnit *FirstNode = Nodes[0];
-    SUnit *LastNode = Nodes[Nodes.size() - 1];
-
-    for (auto &PI : DDG->getInEdges(LastNode)) {
-      // If we have an order dep that is potentially loop carried then a
-      // back-edge exists between the last node and the first node that isn't
-      // modeled in the DAG. Handle it manually by adding 1 to the distance of
-      // the last node.
-      if (PI.getSrc() != FirstNode || !PI.isOrderDep() ||
-          !DAG->isLoopCarriedDep(PI))
-        continue;
-      unsigned &First = SUnitToDistance[FirstNode];
-      unsigned Last = SUnitToDistance[LastNode];
-      First = std::max(First, Last + 1);
-    }
-
     // The latency is the distance from the source node to itself.
     Latency = SUnitToDistance[Nodes.front()];
   }
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 6cb0299a30d7a..675149a992a7d 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -194,6 +194,10 @@ static cl::opt<bool>
     MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false),
                cl::desc("Use the MVE code generator for software pipelining"));
 
+static cl::opt<unsigned> MaxCircuitPaths(
+    "pipeliner-max-circuit-paths", cl::Hidden, cl::init(5),
+    cl::desc("Maximum number of circles to be detected for each vertex"));
+
 namespace llvm {
 
 // A command line option to enable the CopyToPhi DAG mutation.
@@ -221,7 +225,6 @@ static cl::opt<WindowSchedulingFlag> WindowSchedulingOption(
 
 } // end namespace llvm
 
-unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
 char MachinePipeliner::ID = 0;
 #ifndef NDEBUG
 int MachinePipeliner::NumTries = 0;
@@ -562,17 +565,27 @@ void SwingSchedulerDAG::setMAX_II() {
 void SwingSchedulerDAG::schedule() {
   AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
   buildSchedGraph(AA);
-  addLoopCarriedDependences(AA);
+  auto LCE = addLoopCarriedDependences(AA);
+  LCE.modifySUnits(SUnits);
   updatePhiDependences();
   Topo.InitDAGTopologicalSorting();
   changeDependences();
   postProcessDAG();
-  DDG = std::make_unique<SwingSchedulerDDG>(SUnits, &EntrySU, &ExitSU);
-  LLVM_DEBUG(dump());
 
+  LLVM_DEBUG({
+    dump();
+    dbgs() << "Loop Carried Edges:\n";
+    for (SUnit &SU : SUnits)
+      LCE.dump(&SU, TRI, &MRI);
+  });
+
+  DDG = std::make_unique<SwingSchedulerDDG>(SUnits, &EntrySU, &ExitSU, LCE);
+
+  DDG->applyLoopCarriedEdges();
   NodeSetType NodeSets;
   findCircuits(NodeSets);
   NodeSetType Circuits = NodeSets;
+  DDG->removeLoopCarriedEdges();
 
   // Calculate the MII.
   unsigned ResMII = calculateResMII();
@@ -651,6 +664,7 @@ void SwingSchedulerDAG::schedule() {
   // check for node order issues
   checkValidNodeOrder(Circuits);
 
+  DDG->applyLoopCarriedEdges();
   SMSchedule Schedule(Pass.MF, this);
   Scheduled = schedulePipeline(Schedule);
 
@@ -803,18 +817,16 @@ static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
 
 /// Return true if the instruction causes a chain between memory
 /// references before and after it.
-static bool isDependenceBarrier(MachineInstr &MI) {
-  return MI.isCall() || MI.mayRaiseFPException() ||
-         MI.hasUnmodeledSideEffects() ||
-         (MI.hasOrderedMemoryRef() &&
-          (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad()));
+static bool isGlobalMemoryObject(MachineInstr &MI) {
+  return MI.isCall() || MI.hasUnmodeledSideEffects() ||
+         (MI.hasOrderedMemoryRef() && !MI.isDereferenceableInvariantLoad());
 }
 
 /// Return the underlying objects for the memory references of an instruction.
 /// This function calls the code in ValueTracking, but first checks that the
 /// instruction has a memory operand.
-static void getUnderlyingObjects(const MachineInstr *MI,
-                                 SmallVectorImpl<const Value *> &Objs) {
+static void getUnderlyingObjectsForInstr(const MachineInstr *MI,
+                                         SmallVectorImpl<const Value *> &Objs) {
   if (!MI->hasOneMemOperand())
     return;
   MachineMemOperand *MM = *MI->memoperands_begin();
@@ -829,97 +841,63 @@ static void getUnderlyingObjects(const MachineInstr *MI,
   }
 }
 
-/// Add a chain edge between a load and store if the store can be an
-/// alias of the load on a subsequent iteration, i.e., a loop carried
-/// dependence. This code is very similar to the code in ScheduleDAGInstrs
-/// but that code doesn't create loop carried dependences.
-void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
-  MapVector<const Value *, SmallVector<SUnit *, 4>> PendingLoads;
-  Value *UnknownValue =
-    UndefValue::get(Type::getVoidTy(MF.getFunction().getContext()));
-  for (auto &SU : SUnits) {
-    MachineInstr &MI = *SU.getInstr();
-    if (isDependenceBarrier(MI))
-      PendingLoads.clear();
-    else if (MI.mayLoad()) {
-      SmallVector<const Value *, 4> Objs;
-      ::getUnderlyingObjects(&MI, Objs);
-      if (Objs.empty())
-        Objs.push_back(UnknownValue);
-      for (const auto *V : Objs) {
-        SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
-        SUs.push_back(&SU);
-      }
-    } else if (MI.mayStore()) {
-      SmallVector<const Value *, 4> Objs;
-      ::getUnderlyingObjects(&MI, Objs);
-      if (Objs.empty())
-        Objs.push_back(UnknownValue);
-      for (const auto *V : Objs) {
-        MapVector<const Value *, SmallVector<SUnit *, 4>>::iterator I =
-            PendingLoads.find(V);
-        if (I == PendingLoads.end())
-          continue;
-        for (auto *Load : I->second) {
-          if (isSuccOrder(Load, &SU))
-            continue;
-          MachineInstr &LdMI = *Load->getInstr();
-          // First, perform the cheaper check that compares the base register.
-          // If they are the same and the load offset is less than the store
-          // offset, then mark the dependence as loop carried potentially.
-          const MachineOperand *BaseOp1, *BaseOp2;
-          int64_t Offset1, Offset2;
-          bool Offset1IsScalable, Offset2IsScalable;
-          if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1,
-                                           Offset1IsScalable, TRI) &&
-              TII->getMemOperandWithOffset(MI, BaseOp2, Offset2,
-                                           Offset2IsScalable, TRI)) {
-            if (BaseOp1->isIdenticalTo(*BaseOp2) &&
-                Offset1IsScalable == Offset2IsScalable &&
-                (int)Offset1 < (int)Offset2) {
-              assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI) &&
-                     "What happened to the chain edge?");
-              SDep Dep(Load, SDep::Barrier);
-              Dep.setLatency(1);
-              SU.addPred(Dep);
-              continue;
-            }
-          }
-          // Second, the more expensive check that uses alias analysis on the
-          // base registers. If they alias, and the load offset is less than
-          // the store offset, the mark the dependence as loop carried.
-          if (!AA) {
-            SDep Dep(Load, SDep::Barrier);
-            Dep.setLatency(1);
-            SU.addPred(Dep);
-            continue;
-          }
-          MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
-          MachineMemOperand *MMO2 = *MI.memoperands_begin();
-          if (!MMO1->getValue() || !MMO2->getValue()) {
-            SDep Dep(Load, SDep::Barrier);
-            Dep.setLatency(1);
-            SU.addPred(Dep);
-            continue;
-          }
-          if (MMO1->getValue() == MMO2->getValue() &&
-              MMO1->getOffset() <= MMO2->getOffset()) {
-            SDep Dep(Load, SDep::Barrier);
-            Dep.setLatency(1);
-            SU.addPred(Dep);
-            continue;
-          }
-          if (!AA->isNoAlias(
-                  MemoryLocation::getAfter(MMO1->getValue(), MMO1->getAAInfo()),
-                  MemoryLocation::getAfter(MMO2->getValue(),
-                                           MMO2->getAAInfo()))) {
-            SDep Dep(Load, SDep::Barrier);
-            Dep.setLatency(1);
-            SU.addPred(Dep);
-          }
-        }
-      }
-    }
+static std::optional<MemoryLocation>
+getMemoryLocationForAA(const MachineInstr *MI) {
+  const MachineMemOperand *MMO = *MI->memoperands_begin();
+  const Value *Val = MMO->getValue();
+  if (!Val)
+    return std::nullopt;
+  auto MemLoc = MemoryLocation::getBeforeOrAfter(Val, MMO->getAAInfo());
+
+  // Peel off noalias information from `AATags` because it might be valid only
+  // in single iteration.
+  // FIXME: This is too conservative. Checking
+  // `llvm.experimental.noalias.scope.decl` instrinsics in the original LLVM IR
+  // can perform more accuurately.
+  // MemLoc.AATags.NoAlias = nullptr;
+  return MemLoc;
+}
+
+/// Return true for an memory dependence that is loop carried
+/// potentially. A dependence is loop carried if the destination defines a value
+/// that may be used or defined by the source in a subsequent iteration.
+bool SwingSchedulerDAG::hasLoopCarriedMemDep(const MachineInstr *Src,
+                                             const MachineInstr *Dst,
+                                             BatchAAResults *BAA) const {
+  if (!SwpPruneLoopCarried)
+    return true;
+
+  // First, check the dependence by comparing base register, offset, and
+  // step value of the loop.
+  switch (mayOverlapInLaterIter(Src, Dst)) {
+  case AliasResult::Kind::MustAlias:
+    return true;
+  case AliasResult::Kind::NoAlias:
+    return false;
+  case AliasResult::Kind::MayAlias:
+    break;
+  default:
+    llvm_unreachable("Unexpected alias");
+  }
+
+  // If we cannot determine the dependence by previouse check, then
+  // check by using alias analysis.
+  if (!BAA)
+    return true;
+
+  const auto MemLoc1 = getMemoryLocationForAA(Src);
+  const auto MemLoc2 = getMemoryLocationForAA(Dst);
+  if (!MemLoc1.has_value() || !MemLoc2.has_value())
+    return true;
+  switch (BAA->alias(*MemLoc1, *MemLoc2)) {
+  case AliasResult::Kind::MayAlias:
+  case AliasResult::Kind::MustAlias:
+  case AliasResult::Kind::PartialAlias:
+    return true;
+  case AliasResult::Kind::NoAlias:
+    return false;
+  default:
+    llvm_unreachable("Unexpected alias");
   }
 }
 
@@ -1545,8 +1523,311 @@ class HighRegisterPressureDetector {
   }
 };
 
+/// Add loop-carried chain dependencies. This class handles the same type of
+/// dependencies added by `ScheduleDAGInstrs::buildSchedGraph`, but takes into
+/// account dependencies across iterations.
+class LoopCarriedOrderDepsTracker {
+  // Type of instruction that is relevant to order-dependencies
+  enum class InstrTag {
+    // Instruction related to global memory objects. There are order
+    // dependencies between instructions that may load or store or raise
+    // floating-point exception before and after this one.
+    GlobalMemoryObject = 0,
+
+    // Instruction that may load or store memory, but does not form a global
+    // barrier.
+    LoadOrStore = 1,
+
+    // Instruction that does not match above, but may raise floatin-point
+    // exceptions.
+    FPExceptions = 2,
+  };
+
+  struct TaggedSUnit : PointerIntPair<SUnit *, 2> {
+    TaggedSUnit(SUnit *SU, InstrTag Tag)
+        : PointerIntPair<SUnit *, 2>(SU, unsigned(Tag)) {}
+
+    InstrTag getTag() const { return InstrTag(getInt()); }
+  };
+
+  using SUsType = SmallVector<SUnit *, 4>;
+  using Value2SUs = MapVector<const Value *, SUsType>;
+
+  // Retains loads and stores classified by the underlying objects.
+  struct LoadStoreChunk {
+    Value2SUs Loads, Stores;
+    SUsType UnknownLoads, UnknownStores;
+  };
+
+  SwingSchedulerDAG *DAG;
+  std::unique_ptr<BatchAAResults> BAA;
+  const Value *UnknownValue;
+  std::vector<SUnit> &SUnits;
+
+  // The size of SUnits, for convenience.
+  const unsigned N;
+
+  // Adjacency matrix consisiting of order dependencies of the original DAG.
+  std::vector<BitVector> AdjMatrix;
+
+  // Loop-carried Edges.
+  std::vector<BitVector> LoopCarried;
+
+  // Instructions related to chain dependencies. They are one of the following.
+  //
+  //   1. Global memory object.
+  //   2. Load, but not a global memory object, not invariant, or may load trap
+  //      value.
+  //   3. Store, but not global memory object.
+  //   4. None of them, but may raise floating-point exceptions.
+  //
+  // This is used when analyzing loop-carried dependencies that access global
+  // barrier instructions.
+  std::vector<TaggedSUnit> TaggedSUnits;
+
+public:
+  LoopCarriedOrderDepsTracker(SwingSchedulerDAG *SSD, AAResults *AA)
+      : DAG(SSD), BAA(nullptr), SUnits(DAG->SUnits), N(SUnits.size()),
+        AdjMatrix(N, BitVector(N)), LoopCarried(N, BitVector(N)) {
+    UnknownValue =
+        UndefValue::get(Type::getVoidTy(DAG->MF.getFunction().getContext()));
+    if (AA) {
+      BAA = std::make_unique<BatchAAResults>(*AA);
+      BAA->enableCrossIterationMode();
+    }
+    initAdjMatrix();
+  }
+
+  void computeDependencies() {
+    // Traverse all instructions and extract only what we are targetting.
+    for (auto &SU : SUnits) {
+      auto Tagged = checkInstrType(&SU);
+
+      // This instruction has no loop-carried order-dependencies.
+      if (!Tagged)
+        continue;
+
+      TaggedSUnits.push_back(*Tagged);
+    }
+
+    addLoopCarriedDependencies();
+
+    // Finalize the results.
+    for (int I = 0; I != int(N); I++) {
+      // If the dependence between two instructions already exists in the
+      // original DAG, then loop-carried dependence of the same instructions is
+      // unnecessary because the original one expresses stricter
+      // constraint than loop-carried one.
+      LoopCarried[I].reset(AdjMatrix[I]);
+
+      // Self-loops are noisy.
+      LoopCarried[I].reset(I);
+    }
+  }
+
+  const BitVector &getLoopCarried(unsigned Idx) const {
+    return LoopCarried[Idx];
+  }
+
+private:
+  // Calculate reachability induced by the adjacency matrix. The original graph
+  // is DAG, so we can compute them from bottom to top.
+  void initAdjMatrix() {
+    for (int RI = 0; RI != int(N); RI++) {
+      int I = SUnits.size() - (RI + 1);
+      for (const auto &Succ : SUnits[I].Succs)
+        if (Succ.isNormalMemoryOrBarrier()) {
+          SUnit *SSU = Succ.getSUnit();
+          if (SSU->isBoundaryNode())
+            continue;
+          // `updatePhiDependences` may add barrier-dependencies between PHIs,
+          // which don't make sense in this case.
+          if (SSU->getInstr()->isPHI())
+            continue;
+          int J = SSU->NodeNum;
+          AdjMatrix[I].set(J);
+        }
+    }
+  }
+
+  // Tags to \p SU if the instruction may affect the order-dependencies.
+  std::optional<TaggedSUnit> checkInstrType(SUnit *SU) const {
+    MachineInstr *MI = SU->getInstr();
+    if (isGlobalMemoryObject(*MI))
+      return TaggedSUnit(SU, InstrTag::GlobalMemoryObject);
+
+    if (MI->mayStore() ||
+        (MI->mayLoad() && !MI->isDereferenceableInvariantLoad()))
+      return TaggedSUnit(SU, InstrTag::LoadOrStore);
+
+    if (MI->mayRaiseFPException())
+      return TaggedSUnit(SU, InstrTag::FPExceptions);
+
+    return std::nullopt;
+  }
+
+  void addDependencesBetweenSUs(const SUsType &From, const SUsType &To) {
+    for (SUnit *SUa : From)
+      for (SUnit *SUb : To)
+        if (DAG->hasLoopCarriedMemDep(SUa->getInstr(), SUb->getInstr(),
+                                      BAA.get()))
+          LoopCarried[SUa->NodeNum].set(SUb->NodeNum);
+  }
+
+  void addDependenciesOfObj(const SUsType &From, const Value *Obj,
+                            const Value2SUs &To) {
+    auto *Ite = To.find(Obj);
+    if (Ite != To.end())
+      addDependencesBetweenSUs(From, Ite->second);
+  }
+
+  void addDependencesBetweenChunks(const LoadStoreChunk &From,
+                                   const LoadStoreChunk &To) {
+    // Add dependencies from store with known object
+    for (auto &[Obj, Stores] : From.Stores) {
+      addDependenciesOfObj(Stores, Obj, To.Stores);
+      addDependenciesOfObj(Stores, Obj, To.Loads);
+      addDependencesBetweenSUs(Stores, To.UnknownStores);
+      addDependencesBetweenSUs(Stores, To.UnknownLoads);
+    }
+
+    // Add dependencies from load with known object
+    for (auto &[Obj, Loads] : From.Loads) {
+      addDependenciesOfObj(Loads, Obj, To.Stores);
+      addDependencesBetweenSUs(Loads, To.UnknownStores);
+    }
+
+    // Add dependencies from load/store with unknown object
+    for ([[maybe_unused]] auto &[Obj, Stores] : To.Stores) {
+      addDependencesBetweenSUs(From.UnknownStores, Stores);
+      addDependencesBetweenSUs(From.UnknownLoads, Stores);
+    }
+    for ([[maybe_unused]] auto &[Obj, Loads] : To.Loads)
+      addDependencesBetweenSUs(From.UnknownStores, Loads);
+    addDependencesBetweenSUs(From.UnknownStores, To.UnknownStores);
+    addDependencesBetweenSUs(From.UnknownStores, To.UnknownLoads);
+    addDependencesBetweenSUs(From.UnknownLoads, To.UnknownStores);
+  }
+
+  void updateLoadStoreChunk(SUnit *SU, LoadStoreChunk &Chunk) {
+    const MachineInstr *MI = SU->getInstr();
+    if (!MI->mayLoadOrStore())
+      return;
+    SmallVector<const Value *, 4> Objs;
+    getUnderlyingObjectsForInstr(MI, Objs);
+    for (auto &Obj : Objs) {
+      if (Obj == UnknownValue) {
+        Objs.clear();
+        break;
+      }
+    }
+
+    if (Objs.empty()) {
+      (MI->mayStore() ? Chunk.UnknownStores : Chunk.UnknownLoads).push_back(SU);
+    } else {
+      auto &Map = (MI->mayStore() ? Chunk.Stores : Chunk.Loads);
+      for (const auto *Obj : Objs)
+        Map[Obj].push_back(SU);
+    }
+  }
+
+  void addLoopCarriedDependencies() {
+    // Collect instructions until a first instruction for global memory object
+    // is found
+    LoadStoreChunk FirstChunk;
+    std::vector<SUnit *> FirstSUs;
+    SUnit *FirstBarrier = nullptr;
+    for (const auto &TSU : TaggedSUnits) {
+      SUnit *SU = TSU.getPointer();
+      FirstSUs.push_back(SU);
+      if (TSU.getTag() == InstrTag::GlobalMemoryObject) {
+        FirstBarrier = SU;
+        break;
+      }
+      updateLoadStoreChunk(SU, FirstChunk);
+    }
+
+    // If there are no instructions related to global memory object, then check
+    // loop-carried dependencies for all load/store pairs.
+    if (FirstBarrier == nullptr) {
+      addDependencesBetweenChunks(FirstChunk, FirstChunk);
+      return;
+    }
+
+    // The instructions sequence is as follows.
+    //
+    // ```
+    // Some loads/stores/fp-exceptions (FirstSUs)
+    // Global memory object (FirstBarrier)
+    // ...
+    // Global memory object (LastBarrier)
+    // Some loads/stores/fp-exceptions (LastSUs)
+    // ```
+    //
+    // At this point, add the following loop-carried dependencies.
+    //
+    //   - From LastBarrier to FirstSUs and FirstBarrier
+    //   - From LastSUs to FirstBarrier
+    //   - From loads/stores in LastSUs to loads/stores in FirstSUs
+    //     if they can overlap
+    //
+    // Other loop-carried dependencies, such as LastSUs to load/store between
+    // FirstBarrier and LastBarrier, are implied by the above and existing
+    // dependencies, so we don't add them explicitly.
+    LoadStoreChunk LastChunk;
+    std::vector<SUnit *> LastSUs;
+    SUnit *LastBarrier = nullptr;
+    for (const auto &TSU : reverse(TaggedSUnits)) {
+      SUnit *SU = TSU.getPointer();
+      LastSUs.push_back(SU);
+      if (TSU.getTag() == InstrTag::GlobalMemoryObject) {
+        LastBarrier = SU;
+        break;
+      }
+      updateLoadStoreChunk(SU, LastChunk);
+    }
+
+    for (SUnit *SU : FirstSUs)
+      LoopCarried[LastBarrier->NodeNum].set(SU->NodeNum);
+    for (SUnit *SU : LastSUs)
+      LoopCarried[SU->NodeNum].set(FirstBarrier->NodeNum);
+    LoopCarried[FirstBarrier->NodeNum].reset(LastBarrier->NodeNum);
+    addDependencesBetweenChunks(LastChunk, FirstChunk);
+  }
+};
+
 } // end anonymous namespace
 
+/// Add dependencies across iterations.
+LoopCarriedEdges SwingSchedulerDAG::addLoopCarriedDependences(AAResults *AA) {
+  LoopCarriedEdges LCE;
+  const unsigned N = SUnits.size();
+
+  // Add loop-carried output-dependencies
+  for (SUnit &SU : SUnits) {
+    for (const auto &Pred : SU.Preds) {
+      if (Pred.getKind() != SDep::Output)
+        continue;
+      SUnit *PredSU = Pred.getSUnit();
+      if (PredSU->isBoundaryNode())
+        continue;
+      Register Reg = Pred.getReg();
+      for (const auto &E : LCE.OutputDeps[PredSU][Reg])
+        LCE.OutputDeps[&SU][Reg].insert(E);
+      LCE.OutputDeps[&SU][Reg].insert(PredSU);
+    }
+  }
+
+  // Add loop-carried order-dependencies
+  LoopCarriedOrderDepsTracker LCODTracker(this, AA);
+  LCODTracker.computeDependencies();
+  for (int I = 0; I != int(N); I++)
+    for (const int Succ : LCODTracker.getLoopCarried(I).set_bits())
+      LCE.OrderDeps[&SUnits[I]].insert(&SUnits[Succ]);
+
+  return LCE;
+}
+
 /// Calculate the resource constrained minimum initiation interval for the
 /// specified loop. We use the DFA to model the resources needed for
 /// each instruction, and we ignore dependences. A different DFA is created
@@ -1587,25 +1868,12 @@ unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
 
 /// Create the adjacency structure of the nodes in the graph.
 void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
-    SwingSchedulerDAG *DAG) {
+    const SwingSchedulerDDG *DDG) {
   BitVector Added(SUnits.size());
-  DenseMap<int, int> OutputDeps;
-  for (int i = 0, e = SUnits.size(); i != e; ++i) {
+  for (int I = 0, E = SUnits.size(); I != E; ++I) {
     Added.reset();
     // Add any successor to the adjacency matrix and exclude duplicates.
-    for (auto &OE : DAG->DDG->getOutEdges(&SUnits[i])) {
-      // Only create a back-edge on the first and last nodes of a dependence
-      // chain. This records any chains and adds them later.
-      if (OE.isOutputDep()) {
-        int N = OE.getDst()->NodeNum;
-        int BackEdge = i;
-        auto Dep = OutputDeps.find(BackEdge);
-        if (Dep != OutputDeps.end()) {
-          BackEdge = Dep->second;
-          OutputDeps.erase(Dep);
-        }
-        OutputDeps[N] = BackEdge;
-      }
+    for (const auto &OE : DDG->getOutEdges(&SUnits[I])) {
       // Do not process a boundary node, an artificial node.
       if (OE.getDst()->isBoundaryNode() || OE.isArtificial())
         continue;
@@ -1619,40 +1887,29 @@ void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
       if (OE.isAntiDep())
         continue;
 
+      // Ignore store-store dependencies when finding circuits for historical
+      // reasons. Adding these edges causes regressions in some important cases.
+      // FIXME: This could lead to an inaccurate estimation of RecMII. By
+      // improving the heuristics after circuit detection, this may not be
+      // necessary.
+      if (OE.isOrderDep() && OE.getSrc()->getInstr()->mayStore() &&
+          OE.getDst()->getInstr()->mayStore())
+        continue;
+
       int N = OE.getDst()->NodeNum;
+
       if (!Added.test(N)) {
-        AdjK[i].push_back(N);
+        AdjK[I].push_back(N);
         Added.set(N);
       }
     }
-    // A chain edge between a store and a load is treated as a back-edge in the
-    // adjacency matrix.
-    for (auto &IE : DAG->DDG->getInEdges(&SUnits[i])) {
-      SUnit *Src = IE.getSrc();
-      SUnit *Dst = IE.getDst();
-      if (!Dst->getInstr()->mayStore() || !DAG->isLoopCarriedDep(IE))
-        continue;
-      if (IE.isOrderDep() && Src->getInstr()->mayLoad()) {
-        int N = Src->NodeNum;
-        if (!Added.test(N)) {
-          AdjK[i].push_back(N);
-          Added.set(N);
-        }
-      }
-    }
   }
-  // Add back-edges in the adjacency matrix for the output dependences.
-  for (auto &OD : OutputDeps)
-    if (!Added.test(OD.second)) {
-      AdjK[OD.first].push_back(OD.second);
-      Added.set(OD.second);
-    }
 }
 
 /// Identify an elementary circuit in the dependence graph starting at the
 /// specified node.
 bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
-                                          const SwingSchedulerDAG *DAG,
+                                          const SwingSchedulerDDG *DDG,
                                           bool HasBackedge) {
   SUnit *SV = &SUnits[V];
   bool F = false;
@@ -1660,19 +1917,19 @@ bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
   Blocked.set(V);
 
   for (auto W : AdjK[V]) {
-    if (NumPaths > MaxPaths)
+    if (NumPaths > MaxCircuitPaths)
       break;
     if (W < S)
       continue;
     if (W == S) {
       if (!HasBackedge)
-        NodeSets.push_back(NodeSet(Stack.begin(), Stack.end(), DAG));
+        NodeSets.push_back(NodeSet(Stack.begin(), Stack.end(), DDG));
       F = true;
       ++NumPaths;
       break;
     }
     if (!Blocked.test(W)) {
-      if (circuit(W, S, NodeSets, DAG,
+      if (circuit(W, S, NodeSets, DDG,
                   Node2Idx->at(W) < Node2Idx->at(V) ? true : HasBackedge))
         F = true;
     }
@@ -1710,10 +1967,10 @@ void SwingSchedulerDAG::Circuits::unblock(int U) {
 void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
   Circuits Cir(SUnits, Topo);
   // Create the adjacency structure.
-  Cir.createAdjacencyStructure(this);
+  Cir.createAdjacencyStructure(getDDG());
   for (int I = 0, E = SUnits.size(); I != E; ++I) {
     Cir.reset();
-    Cir.circuit(I, I, NodeSets, this);
+    Cir.circuit(I, I, NodeSets, getDDG());
   }
 }
 
@@ -2479,6 +2736,9 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
       });
     } while (++NI != NE && scheduleFound);
 
+    if (scheduleFound)
+      scheduleFound = DDG->isValidSchedule(SUnits, Schedule);
+
     // If a schedule is found, ensure non-pipelined instructions are in stage 0
     if (scheduleFound)
       scheduleFound =
@@ -2753,12 +3013,19 @@ MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
 
 /// Return false if there is no overlap between the region accessed by BaseMI in
 /// an iteration and the region accessed by OtherMI in subsequent iterations.
-bool SwingSchedulerDAG::mayOverlapInLaterIter(
-    const MachineInstr *BaseMI, const MachineInstr *OtherMI) const {
+AliasResult::Kind
+SwingSchedulerDAG::mayOverlapInLaterIter(const MachineInstr *BaseMI,
+                                         const MachineInstr *OtherMI) const {
+  if (!BaseMI->mayLoadOrStore() || !OtherMI->mayLoadOrStore())
+    return AliasResult::Kind::NoAlias;
+
+  // The conservative assumption is that a dependence between memory operations
+  // may be loop carried. The following code checks when it can be proved that
+  // there is no loop carried dependence.
   int DeltaB, DeltaO, Delta;
   if (!computeDelta(*BaseMI, DeltaB) || !computeDelta(*OtherMI, DeltaO) ||
       DeltaB != DeltaO)
-    return true;
+    return AliasResult::MayAlias;
   Delta = DeltaB;
 
   const MachineOperand *BaseOpB, *BaseOpO;
@@ -2769,25 +3036,25 @@ bool SwingSchedulerDAG::mayOverlapInLaterIter(
                                     OffsetBIsScalable, TRI) ||
       !TII->getMemOperandWithOffset(*OtherMI, BaseOpO, OffsetO,
                                     OffsetOIsScalable, TRI))
-    return true;
+    return AliasResult::MayAlias;
 
   if (OffsetBIsScalable || OffsetOIsScalable)
-    return true;
+    return AliasResult::MayAlias;
 
   if (!BaseOpB->isIdenticalTo(*BaseOpO)) {
     // Pass cases with different base operands but same initial values.
     // Typically for when pre/post increment is used.
 
     if (!BaseOpB->isReg() || !BaseOpO->isReg())
-      return true;
+      return AliasResult::MayAlias;
     Register RegB = BaseOpB->getReg(), RegO = BaseOpO->getReg();
     if (!RegB.isVirtual() || !RegO.isVirtual())
-      return true;
+      return AliasResult::MayAlias;
 
     MachineInstr *DefB = MRI.getVRegDef(BaseOpB->getReg());
     MachineInstr *DefO = MRI.getVRegDef(BaseOpO->getReg());
     if (!DefB || !DefO || !DefB->isPHI() || !DefO->isPHI())
-      return true;
+      return AliasResult::MayAlias;
 
     Register InitValB;
     Register LoopValB;
@@ -2799,7 +3066,7 @@ bool SwingSchedulerDAG::mayOverlapInLaterIter(
     MachineInstr *InitDefO = MRI.getVRegDef(InitValO);
 
     if (!InitDefB->isIdenticalTo(*InitDefO))
-      return true;
+      return AliasResult::MayAlias;
   }
 
   LocationSize AccessSizeB = (*BaseMI->memoperands_begin())->getSize();
@@ -2808,7 +3075,7 @@ bool SwingSchedulerDAG::mayOverlapInLaterIter(
   // This is the main test, which checks the offset values and the loop
   // increment value to determine if the accesses may be loop carried.
   if (!AccessSizeB.hasValue() || !AccessSizeO.hasValue())
-    return true;
+    return AliasResult::MayAlias;
 
   LLVM_DEBUG({
     dbgs() << "Overlap check:\n";
@@ -2831,52 +3098,18 @@ bool SwingSchedulerDAG::mayOverlapInLaterIter(
     int64_t OhterNextIterMaxAddr = OffsetO + Delta + AccessSizeO.getValue() - 1;
     if (BaseMinAddr > OhterNextIterMaxAddr) {
       LLVM_DEBUG(dbgs() << "  Result: No overlap\n");
-      return false;
+      return AliasResult::NoAlias;
     }
   } else {
     int64_t BaseMaxAddr = OffsetB + AccessSizeB.getValue() - 1;
     int64_t OtherNextIterMinAddr = OffsetO + Delta;
     if (BaseMaxAddr < OtherNextIterMinAddr) {
       LLVM_DEBUG(dbgs() << "  Result: No overlap\n");
-      return false;
+      return AliasResult::NoAlias;
     }
   }
   LLVM_DEBUG(dbgs() << "  Result: Overlap\n");
-  return true;
-}
-
-/// Return true for an order or output dependence that is loop carried
-/// potentially. A dependence is loop carried if the destination defines a value
-/// that may be used or defined by the source in a subsequent iteration.
-bool SwingSchedulerDAG::isLoopCarriedDep(
-    const SwingSchedulerDDGEdge &Edge) const {
-  if ((!Edge.isOrderDep() && !Edge.isOutputDep()) || Edge.isArtificial() ||
-      Edge.getDst()->isBoundaryNode())
-    return false;
-
-  if (!SwpPruneLoopCarried)
-    return true;
-
-  if (Edge.isOutputDep())
-    return true;
-
-  MachineInstr *SI = Edge.getSrc()->getInstr();
-  MachineInstr *DI = Edge.getDst()->getInstr();
-  assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
-
-  // Assume ordered loads and stores may have a loop carried dependence.
-  if (SI->hasUnmodeledSideEffects() || DI->hasUnmodeledSideEffects() ||
-      SI->mayRaiseFPException() || DI->mayRaiseFPException() ||
-      SI->hasOrderedMemoryRef() || DI->hasOrderedMemoryRef())
-    return true;
-
-  if (!DI->mayLoadOrStore() || !SI->mayLoadOrStore())
-    return false;
-
-  // The conservative assumption is that a dependence between memory operations
-  // may be loop carried. The following code checks when it can be proved that
-  // there is no loop carried dependence.
-  return mayOverlapInLaterIter(DI, SI);
+  return AliasResult::MustAlias;
 }
 
 void SwingSchedulerDAG::postProcessDAG() {
@@ -3001,9 +3234,7 @@ void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
     for (SUnit *I : getInstructions(cycle)) {
       for (const auto &IE : DDG->getInEdges(SU)) {
         if (IE.getSrc() == I) {
-          // FIXME: Add reverse edge to `DDG` instead of calling
-          // `isLoopCarriedDep`
-          if (DAG->isLoopCarriedDep(IE)) {
+          if (IE.getDistance() != 0 && IE.isOrderDep()) {
             int End = earliestCycleInChain(IE, DDG) + (II - 1);
             *MinLateStart = std::min(*MinLateStart, End);
           }
@@ -3014,9 +3245,7 @@ void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
 
       for (const auto &OE : DDG->getOutEdges(SU)) {
         if (OE.getDst() == I) {
-          // FIXME: Add reverse edge to `DDG` instead of calling
-          // `isLoopCarriedDep`
-          if (DAG->isLoopCarriedDep(OE)) {
+          if (OE.getDistance() != 0 && OE.isOrderDep()) {
             int Start = latestCycleInChain(OE, DDG) + 1 - II;
             *MaxEarlyStart = std::max(*MaxEarlyStart, Start);
           }
@@ -3110,7 +3339,8 @@ void SMSchedule::orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU,
     for (auto &OE : DDG->getOutEdges(SU)) {
       if (OE.getDst() != *I)
         continue;
-      if (OE.isOrderDep() && stageScheduled(*I) == StageInst1) {
+      if (OE.isOrderDep() && OE.getDistance() == 0 &&
+          stageScheduled(*I) == StageInst1) {
         OrderBeforeUse = true;
         if (Pos < MoveUse)
           MoveUse = Pos;
@@ -3118,7 +3348,8 @@ void SMSchedule::orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU,
       // We did not handle HW dependences in previous for loop,
       // and we normally set Latency = 0 for Anti/Output deps,
       // so may have nodes in same cycle with Anti/Output dependent on HW regs.
-      else if ((OE.isAntiDep() || OE.isOutputDep()) &&
+      else if ((OE.isAntiDep() ||
+                (OE.isOutputDep() && OE.getDistance() == 0)) &&
                stageScheduled(*I) == StageInst1) {
         OrderBeforeUse = true;
         if ((MoveUse == 0) || (Pos < MoveUse))
@@ -3129,7 +3360,7 @@ void SMSchedule::orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU,
       if (IE.getSrc() != *I)
         continue;
       if ((IE.isAntiDep() || IE.isOutputDep() || IE.isOrderDep()) &&
-          stageScheduled(*I) == StageInst1) {
+          IE.getDistance() == 0 && stageScheduled(*I) == StageInst1) {
         OrderAfterDef = true;
         MoveDef = Pos;
       }
@@ -3224,9 +3455,12 @@ bool SMSchedule::isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD,
 /// dependencies.
 bool SMSchedule::onlyHasLoopCarriedOutputOrOrderPreds(
     SUnit *SU, const SwingSchedulerDDG *DDG) const {
-  for (const auto &IE : DDG->getInEdges(SU))
+  for (const auto &IE : DDG->getInEdges(SU)) {
+    if (IE.getDistance() != 0 && !IE.getDst()->getInstr()->isPHI())
+      continue;
     if (InstrToCycle.count(IE.getSrc()))
       return false;
+  }
   return true;
 }
 
@@ -3367,6 +3601,10 @@ bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
 /// The reason is that although an invalid node order may prevent
 /// the pipeliner from finding a pipelined schedule for arbitrary II,
 /// it does not lead to the generation of incorrect code.
+/// FIXME: Currently, we don't search all circuits. There is an upper limit to
+/// the number of circuits that can be searched. Also, there may be some that
+/// are pruned by heuristics. Therefore, this function may generate false
+/// positives.
 void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
 
   // a sorted vector that maps each SUnit to its index in the NodeOrder
@@ -3922,26 +4160,32 @@ SwingSchedulerDDG::getEdges(const SUnit *SU) const {
 void SwingSchedulerDDG::addEdge(const SUnit *SU,
                                 const SwingSchedulerDDGEdge &Edge) {
   auto &Edges = getEdges(SU);
-  if (Edge.getSrc() == SU)
-    Edges.Succs.push_back(Edge);
-  else
-    Edges.Preds.push_back(Edge);
+  if (Edge.isValidationOnly()) {
+    assert(SU == Edge.getDst() && "Validation only edges must be added to the "
+                                  "destination node");
+    Edges.ValidationOnlyPreds.push_back(Edge);
+  } else {
+    if (Edge.getSrc() == SU)
+      Edges.Succs.append(Edge);
+    else
+      Edges.Preds.append(Edge);
+  }
 }
 
 void SwingSchedulerDDG::initEdges(SUnit *SU) {
   for (const auto &PI : SU->Preds) {
-    SwingSchedulerDDGEdge Edge(SU, PI, false);
+    SwingSchedulerDDGEdge Edge(SU, PI, false, false);
     addEdge(SU, Edge);
   }
 
   for (const auto &SI : SU->Succs) {
-    SwingSchedulerDDGEdge Edge(SU, SI, true);
+    SwingSchedulerDDGEdge Edge(SU, SI, true, false);
     addEdge(SU, Edge);
   }
 }
 
 SwingSchedulerDDG::SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU,
-                                     SUnit *ExitSU)
+                                     SUnit *ExitSU, const LoopCarriedEdges &LCE)
     : EntrySU(EntrySU), ExitSU(ExitSU) {
   EdgesVec.resize(SUnits.size());
 
@@ -3949,14 +4193,141 @@ SwingSchedulerDDG::SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU,
   initEdges(ExitSU);
   for (auto &SU : SUnits)
     initEdges(&SU);
+
+  for (SUnit &SU : SUnits) {
+    SUnit *Src = &SU;
+
+    if (auto *OutputDep = LCE.getOutputDepOrNull(Src))
+      for (const auto &[Reg, Set] : *OutputDep) {
+        SDep Dep(Src, SDep::Output, Reg);
+        Dep.setLatency(1);
+        for (SUnit *Dst : Set) {
+          SwingSchedulerDDGEdge Edge(Dst, Dep, false, true);
+          Edge.setDistance(1);
+          // addEdge(Src, Edge);
+          addEdge(Dst, Edge);
+        }
+      }
+
+    if (auto *OrderDep = LCE.getOrderDepOrNull(Src)) {
+      SDep Dep(Src, SDep::Barrier);
+      Dep.setLatency(1);
+      for (SUnit *Dst : *OrderDep) {
+        SwingSchedulerDDGEdge Edge(Dst, Dep, false,
+                                   !LCE.shouldAddBackEdge(Src, Dst));
+        Edge.setDistance(1);
+        if (!Edge.isValidationOnly())
+          addEdge(Src, Edge);
+        addEdge(Dst, Edge);
+      }
+    }
+  }
 }
 
-const SwingSchedulerDDG::EdgesType &
+static bool shouldUseInScheduling(SUnit *DstSU, SDep &Pred) {
+  SUnit *SrcSU = Pred.getSUnit();
+  assert(SrcSU->NodeNum < DstSU->NodeNum && "Invalid order");
+  MachineInstr *SrcMI = SrcSU->getInstr();
+  MachineInstr *DstMI = DstSU->getInstr();
+  return SrcMI->mayLoad() && !DstMI->mayLoad() && DstMI->mayStore() &&
+         !isGlobalMemoryObject(*SrcMI) && !isGlobalMemoryObject(*DstMI) &&
+         !isSuccOrder(SrcSU, DstSU);
+}
+
+ArrayRef<SwingSchedulerDDGEdge>
 SwingSchedulerDDG::getInEdges(const SUnit *SU) const {
-  return getEdges(SU).Preds;
+  return getEdges(SU).Preds.get(UseLoopCarriedEdges);
 }
 
-const SwingSchedulerDDG::EdgesType &
+ArrayRef<SwingSchedulerDDGEdge>
 SwingSchedulerDDG::getOutEdges(const SUnit *SU) const {
-  return getEdges(SU).Succs;
+  return getEdges(SU).Succs.get(UseLoopCarriedEdges);
+}
+
+bool SwingSchedulerDDG::isValidSchedule(std::vector<SUnit> &SUnits,
+                                        const SMSchedule &Schedule) const {
+  unsigned II = Schedule.getInitiationInterval();
+
+  auto ExpandedCycle = [&](SUnit *SU) {
+    int Stage = Schedule.stageScheduled(SU);
+    int Cycle = Schedule.cycleScheduled(SU);
+    return Cycle + (Stage * II);
+  };
+
+  for (SUnit &Dst : SUnits) {
+    if (!Dst.isInstr())
+      continue;
+    int CycleDst = ExpandedCycle(&Dst);
+    for (const SwingSchedulerDDGEdge &Edge :
+         getEdges(&Dst).ValidationOnlyPreds) {
+      SUnit *Src = Edge.getSrc();
+      if (!Src->isInstr())
+        continue;
+      int CycleSrc = ExpandedCycle(Src);
+      int MaxLateStart = CycleDst + Edge.getDistance() * II - Edge.getLatency();
+      if (CycleSrc > MaxLateStart) {
+        LLVM_DEBUG({
+          dbgs() << "Validation failed for edge from " << Src->NodeNum << " to "
+                 << Dst.NodeNum << "\n";
+        });
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+void LoopCarriedEdges::modifySUnits(std::vector<SUnit> &SUnits) {
+  for (SUnit &SU : SUnits) {
+    SUnit *Src = &SU;
+    if (auto *OrderDep = getOrderDepOrNull(Src)) {
+      SDep Dep(Src, SDep::Barrier);
+      Dep.setLatency(1);
+      for (SUnit *Dst : *OrderDep) {
+        SUnit *From = Src;
+        SUnit *To = Dst;
+        bool IsBackEdge = Src->NodeNum > Dst->NodeNum;
+        if (IsBackEdge) {
+          BackEdges[From].insert(To);
+          std::swap(From, To);
+        }
+        SDep Pred = Dep;
+        Pred.setSUnit(From);
+        if (shouldUseInScheduling(To, Pred))
+          To->addPred(Pred);
+      }
+    }
+  }
+}
+
+void LoopCarriedEdges::dump(SUnit *SU, const TargetRegisterInfo *TRI,
+                            const MachineRegisterInfo *MRI) const {
+  const auto *Output = getOutputDepOrNull(SU);
+  const auto *Order = getOrderDepOrNull(SU);
+
+  if (!Output && !Order)
+    return;
+
+  const auto DumpSU = [](const SUnit *SU) {
+    std::ostringstream OSS;
+    OSS << "SU(" << SU->NodeNum << ")";
+    return OSS.str();
+  };
+
+  dbgs() << "  Loop carried edges from " << DumpSU(SU) << "\n";
+
+  if (Output) {
+    dbgs() << "    Output\n";
+    for (const auto &[Reg, Set] : *Output) {
+      const auto PReg = printReg(Reg, TRI, 0, MRI);
+      for (SUnit *Dst : Set)
+        dbgs() << "      " << DumpSU(Dst) << " Reg=" << PReg << "\n";
+    }
+  }
+
+  if (Order) {
+    dbgs() << "    Order\n";
+    for (SUnit *Dst : *Order)
+      dbgs() << "      " << DumpSU(Dst) << "\n";
+  }
 }
diff --git a/llvm/test/CodeGen/AArch64/sms-instruction-scheduled-at-correct-cycle.mir b/llvm/test/CodeGen/AArch64/sms-instruction-scheduled-at-correct-cycle.mir
index c1014b296cad3..2e7f72241f0cb 100644
--- a/llvm/test/CodeGen/AArch64/sms-instruction-scheduled-at-correct-cycle.mir
+++ b/llvm/test/CodeGen/AArch64/sms-instruction-scheduled-at-correct-cycle.mir
@@ -1,7 +1,12 @@
 # RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -debug-only=pipeliner -pipeliner-max-stages=50 -pipeliner-max-mii=50 -pipeliner-enable-copytophi=0 -pipeliner-ii-search-range=30 2>&1 | FileCheck %s
 # REQUIRES: asserts
 
-# Test that each instruction must be scheduled between the early cycle and the late cycle. Previously there were cases where an instruction is scheduled outside of the valid range. See issue #93936 for details.
+# This test is strongly depends on the process of the scheduling and too fragile.
+# XFAIL: *
+
+# Test that each instruction must be scheduled between the early cycle and the late cycle.
+# Previously there were cases where an instruction is scheduled outside of the valid range.
+# See issue #93936 for details.
 
 # CHECK: {{^ *}}Try to schedule with 47
 # CHECK: {{^ *}}Inst (11)   %48:fpr128 = LDRQui %35:gpr64sp, 0 :: (load (s128) from %ir.lsr.iv63, align 4, !tbaa !0)
diff --git a/llvm/test/CodeGen/AArch64/sms-loop-carried-fp-exceptions1.mir b/llvm/test/CodeGen/AArch64/sms-loop-carried-fp-exceptions1.mir
new file mode 100644
index 0000000000000..089006e4dedc6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-loop-carried-fp-exceptions1.mir
@@ -0,0 +1,107 @@
+# RUN: llc -mtriple=aarch64 -run-pass=pipeliner -debug-only=pipeliner -aarch64-enable-pipeliner -pipeliner-mve-cg %s -o /dev/null 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# Test a case where fenv is enabled and there are an instruction forming a
+# barrier. The order between the instruction and instructions that may raise
+# exceptions must not be changed.
+
+# CHECK: Loop Carried Edges:
+# CHECK-NEXT:   Loop carried edges from SU(7)
+# CHECK-NEXT:     Order
+# CHECK-NEXT:       SU(2)
+# CHECK-NEXT:       SU(3)
+# CHECK-NEXT:       SU(4)
+# CHECK-NEXT:       SU(5)
+# CHECK-NEXT: calculateResMII:
+
+--- |
+  @x = dso_local global i32 0, align 4
+  
+  define dso_local void @f(ptr nocapture noundef writeonly %a, float noundef %y, i32 noundef %n) {
+  entry:
+    %cmp6 = icmp sgt i32 %n, 0
+    br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+  
+  for.body.preheader:
+    %wide.trip.count = zext nneg i32 %n to i64
+    br label %for.body
+  
+  for.cond.cleanup:
+    ret void
+  
+  for.body:
+    %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+    %tmp9 = trunc i64 %indvars.iv to i32
+    %conv = tail call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %tmp9, metadata !"round.dynamic", metadata !"fpexcept.strict") #2
+    %add = tail call float @llvm.experimental.constrained.fadd.f32(float %conv, float %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #2
+    %0 = shl nuw nsw i64 %indvars.iv, 2
+    %scevgep = getelementptr i8, ptr %a, i64 %0
+    store float %add, ptr %scevgep, align 4, !tbaa !6
+    %1 = load volatile i32, ptr @x, align 4, !tbaa !10
+    %2 = zext i32 %1 to i64
+    %3 = add i64 %indvars.iv, %2
+    %tmp = trunc i64 %3 to i32
+    store volatile i32 %tmp, ptr @x, align 4, !tbaa !10
+    %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+    %exitcond.not = icmp eq i64 %wide.trip.count, %indvars.iv.next
+    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  }
+  
+  declare float @llvm.experimental.constrained.sitofp.f32.i32(i32, metadata, metadata)
+  
+  declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
+  
+  attributes #2 = { strictfp }
+  
+  !6 = !{!7, !7, i64 0}
+  !7 = !{!"float", !8, i64 0}
+  !8 = !{!"omnipotent char", !9, i64 0}
+  !9 = !{!"Simple C/C++ TBAA"}
+  !10 = !{!11, !11, i64 0}
+  !11 = !{!"int", !8, i64 0}
+
+...
+---
+name:            f
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: $x0, $s0, $w1
+  
+    %5:gpr32common = COPY $w1
+    %4:fpr32 = COPY $s0
+    %3:gpr64common = COPY $x0
+    dead $wzr = SUBSWri %5, 1, 0, implicit-def $nzcv
+    Bcc 11, %bb.2, implicit $nzcv
+    B %bb.1
+  
+  bb.1.for.body.preheader:
+    %8:gpr32 = ORRWrs $wzr, %5, 0
+    %0:gpr64 = SUBREG_TO_REG 0, killed %8, %subreg.sub_32
+    %9:gpr64all = COPY $xzr
+    %7:gpr64all = COPY %9
+    %13:gpr64common = ADRP target-flags(aarch64-page) @x
+    B %bb.3
+  
+  bb.2.for.cond.cleanup:
+    RET_ReallyLR
+  
+  bb.3.for.body:
+    successors: %bb.2, %bb.3
+  
+    %1:gpr64common = PHI %7, %bb.1, %2, %bb.3
+    %10:gpr32 = COPY %1.sub_32
+    %11:fpr32 = SCVTFUWSri %10, implicit $fpcr
+    %12:fpr32 = FADDSrr killed %11, %4, implicit $fpcr
+    STRSroX killed %12, %3, %1, 0, 1 :: (store (s32) into %ir.scevgep, !tbaa !6)
+    %14:gpr32 = LDRWui %13, target-flags(aarch64-pageoff, aarch64-nc) @x :: (volatile dereferenceable load (s32) from @x, !tbaa !10)
+    %15:gpr32 = ADDWrr %10, killed %14
+    STRWui killed %15, %13, target-flags(aarch64-pageoff, aarch64-nc) @x :: (volatile store (s32) into @x, !tbaa !10)
+    %16:gpr64common = nuw nsw ADDXri %1, 1, 0
+    %2:gpr64all = COPY %16
+    dead $xzr = SUBSXrr %0, %16, implicit-def $nzcv
+    Bcc 0, %bb.2, implicit $nzcv
+    B %bb.3
+
+...
diff --git a/llvm/test/CodeGen/AArch64/sms-loop-carried-fp-exceptions2.mir b/llvm/test/CodeGen/AArch64/sms-loop-carried-fp-exceptions2.mir
new file mode 100644
index 0000000000000..85127fcc1c491
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sms-loop-carried-fp-exceptions2.mir
@@ -0,0 +1,100 @@
+# RUN: llc -mtriple=aarch64 -run-pass=pipeliner -debug-only=pipeliner -aarch64-enable-pipeliner -pipeliner-mve-cg %s -o /dev/null 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# Test a case where fenv is enabled and there are no instructions forming a
+# barrier. Some instructions may raise floating-point exceptions, but no
+# loop-carried dependencies are added between them.
+
+# CHECK: Loop Carried Edges:
+# CHECK-NEXT: calculateResMII:
+
+--- |
+  define dso_local float @f(ptr nocapture noundef writeonly %a, float noundef %y, i32 noundef %n) local_unnamed_addr {
+  entry:
+    %conv = tail call float @llvm.experimental.constrained.fptrunc.f32.f64(double 1.000000e+00, metadata !"round.dynamic", metadata !"fpexcept.strict")
+    %cmp8 = icmp sgt i32 %n, 0
+    br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+  
+  for.body.preheader:
+    %wide.trip.count = zext nneg i32 %n to i64
+    br label %for.body
+  
+  for.cond.cleanup:
+    %acc.0.lcssa = phi float [ %conv, %entry ], [ %mul, %for.body ]
+    ret float %acc.0.lcssa
+  
+  for.body:
+    %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+    %acc.010 = phi float [ %conv, %for.body.preheader ], [ %mul, %for.body ]
+    %tmp = trunc i64 %indvars.iv to i32
+    %conv2 = tail call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %tmp, metadata !"round.dynamic", metadata !"fpexcept.strict")
+    %add = tail call float @llvm.experimental.constrained.fadd.f32(float %conv2, float %y, metadata !"round.dynamic", metadata !"fpexcept.strict")
+    %mul = tail call float @llvm.experimental.constrained.fmul.f32(float %acc.010, float %add, metadata !"round.dynamic", metadata !"fpexcept.strict")
+    %0 = shl nuw nsw i64 %indvars.iv, 2
+    %scevgep = getelementptr i8, ptr %a, i64 %0
+    store float %add, ptr %scevgep, align 4, !tbaa !6
+    %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+    %exitcond.not = icmp eq i64 %wide.trip.count, %indvars.iv.next
+    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  }
+  
+  declare float @llvm.experimental.constrained.fptrunc.f32.f64(double, metadata, metadata)
+  
+  declare float @llvm.experimental.constrained.sitofp.f32.i32(i32, metadata, metadata)
+  
+  declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
+  
+  declare float @llvm.experimental.constrained.fmul.f32(float, float, metadata, metadata)
+  
+  !6 = !{!7, !7, i64 0}
+  !7 = !{!"float", !8, i64 0}
+  !8 = !{!"omnipotent char", !9, i64 0}
+  !9 = !{!"Simple C/C++ TBAA"}
+
+...
+---
+name:            f
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: $x0, $s0, $w1
+  
+    %9:gpr32common = COPY $w1
+    %8:fpr32 = COPY $s0
+    %7:gpr64common = COPY $x0
+    %10:fpr64 = FMOVDi 112
+    %0:fpr32 = FCVTSDr killed %10, implicit $fpcr
+    dead $wzr = SUBSWri %9, 1, 0, implicit-def $nzcv
+    Bcc 11, %bb.2, implicit $nzcv
+    B %bb.1
+  
+  bb.1.for.body.preheader:
+    %13:gpr32 = ORRWrs $wzr, %9, 0
+    %1:gpr64 = SUBREG_TO_REG 0, killed %13, %subreg.sub_32
+    %14:gpr64all = COPY $xzr
+    %12:gpr64all = COPY %14
+    B %bb.3
+  
+  bb.2.for.cond.cleanup:
+    %2:fpr32 = PHI %0, %bb.0, %5, %bb.3
+    $s0 = COPY %2
+    RET_ReallyLR implicit $s0
+  
+  bb.3.for.body:
+    successors: %bb.2, %bb.3
+  
+    %3:gpr64common = PHI %12, %bb.1, %6, %bb.3
+    %4:fpr32 = PHI %0, %bb.1, %5, %bb.3
+    %15:gpr32 = COPY %3.sub_32
+    %16:fpr32 = SCVTFUWSri killed %15, implicit $fpcr
+    %17:fpr32 = FADDSrr killed %16, %8, implicit $fpcr
+    %5:fpr32 = FMULSrr %4, %17, implicit $fpcr
+    STRSroX %17, %7, %3, 0, 1 :: (store (s32) into %ir.scevgep, !tbaa !6)
+    %18:gpr64common = nuw nsw ADDXri %3, 1, 0
+    %6:gpr64all = COPY %18
+    dead $xzr = SUBSXrr %1, %18, implicit-def $nzcv
+    Bcc 0, %bb.2, implicit $nzcv
+    B %bb.3
+
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-carried-dep1.mir b/llvm/test/CodeGen/Hexagon/swp-carried-dep1.mir
index afc989cbc6921..b42f98c44845e 100644
--- a/llvm/test/CodeGen/Hexagon/swp-carried-dep1.mir
+++ b/llvm/test/CodeGen/Hexagon/swp-carried-dep1.mir
@@ -3,19 +3,30 @@
 
 # Test that the loop carried dependence check correctly identifies a recurrence.
 
-# CHECK: Overlap check:
-# CHECK-NEXT:   BaseMI:   S2_storerh_io %{{[0-9]+}}:intregs, 0, %{{[0-9]+}}:intregs :: (store (s16) into %ir.lsr.iv24)
+# CHECK:      Overlap check:
+# CHECK-NEXT:   BaseMI:   S2_storerh_io %12:intregs, 0, %18:intregs :: (store (s16) into %ir.lsr.iv24)
 # CHECK-NEXT:     Base + 0 + I * 4, Len: 2
-# CHECK-NEXT:   OtherMI:   %{{[0-9]+}}:intregs = L2_loadrh_io %{{[0-9]+}}:intregs, -8 :: (load (s16) from %ir.cgep10)
+# CHECK-NEXT:   OtherMI:   S2_storerh_io %12:intregs, 0, %18:intregs :: (store (s16) into %ir.lsr.iv24)
+# CHECK-NEXT:     Base + 0 + I * 4, Len: 2
+# CHECK-NEXT:   Result: No overlap
+# CHECK-NEXT: Overlap check:
+# CHECK-NEXT:   BaseMI:   S2_storerh_io %12:intregs, 0, %18:intregs :: (store (s16) into %ir.lsr.iv24)
+# CHECK-NEXT:     Base + 0 + I * 4, Len: 2
+# CHECK-NEXT:   OtherMI:   %17:intregs = L2_loadrh_io %12:intregs, -8 :: (load (s16) from %ir.cgep10)
 # CHECK-NEXT:     Base + -8 + I * 4, Len: 2
 # CHECK-NEXT:   Result: Overlap
+# CHECK-NEXT: Overlap check:
+# CHECK-NEXT:   BaseMI:   %17:intregs = L2_loadrh_io %12:intregs, -8 :: (load (s16) from %ir.cgep10)
+# CHECK-NEXT:     Base + -8 + I * 4, Len: 2
+# CHECK-NEXT:   OtherMI:   S2_storerh_io %12:intregs, 0, %18:intregs :: (store (s16) into %ir.lsr.iv24)
+# CHECK-NEXT:     Base + 0 + I * 4, Len: 2
+# CHECK-NEXT:   Result: No overlap
 
-# CHECK: Rec NodeSet
-# CHECK: Rec NodeSet
-# CHECK: Rec NodeSet
-# CHECK: Rec NodeSet
-# CHECK-NEXT: SU(4)
-# CHECK-NEXT: SU(6)
+# CHECK: Loop Carried Edges:
+# CHECK-NEXT:   Loop carried edges from SU(6)
+# CHECK-NEXT:     Order
+# CHECK-NEXT:       SU(4)
+# CHECK-NEXT: calculateResMII:
 
 --- |
 
diff --git a/llvm/test/CodeGen/Hexagon/swp-carried-dep2.mir b/llvm/test/CodeGen/Hexagon/swp-carried-dep2.mir
index e16334ba7978f..cd5fb06ec411d 100644
--- a/llvm/test/CodeGen/Hexagon/swp-carried-dep2.mir
+++ b/llvm/test/CodeGen/Hexagon/swp-carried-dep2.mir
@@ -4,23 +4,25 @@
 # Test that the loop carried dependence check correctly identifies dependences
 # when the loop variable decreases and the array index offset is negative.
 
-# No dependence from the store to the load.
-# CHECK: Overlap check:
-# CHECK-NEXT:   BaseMI:   S2_storeri_io %{{[0-9]+}}:intregs, 0, %{{[0-9]+}}:intregs :: (store (s32) into %ir.lsr.iv1)
+# CHECK:      Overlap check:
+# CHECK-NEXT:   BaseMI:   S2_storeri_io %2:intregs, 0, %7:intregs :: (store (s32) into %ir.lsr.iv1)
 # CHECK-NEXT:     Base + 0 + I * -4, Len: 4
-# CHECK-NEXT:   OtherMI:   %{{[0-9]+}}:intregs = L2_loadri_io %{{[0-9]+}}:intregs, -8 :: (load (s32) from %ir.cgep)
+# CHECK-NEXT:   OtherMI:   S2_storeri_io %2:intregs, 0, %7:intregs :: (store (s32) into %ir.lsr.iv1)
+# CHECK-NEXT:     Base + 0 + I * -4, Len: 4
+# CHECK-NEXT:   Result: No overlap
+# CHECK-NEXT: Overlap check:
+# CHECK-NEXT:   BaseMI:   S2_storeri_io %2:intregs, 0, %7:intregs :: (store (s32) into %ir.lsr.iv1)
+# CHECK-NEXT:     Base + 0 + I * -4, Len: 4
+# CHECK-NEXT:   OtherMI:   %5:intregs = L2_loadri_io %2:intregs, -8 :: (load (s32) from %ir.cgep)
 # CHECK-NEXT:     Base + -8 + I * -4, Len: 4
 # CHECK-NEXT:   Result: No overlap
+# CHECK-NEXT: Overlap check:
+# CHECK-NEXT:   BaseMI:   %5:intregs = L2_loadri_io %2:intregs, -8 :: (load (s32) from %ir.cgep)
+# CHECK-NEXT:     Base + -8 + I * -4, Len: 4
+# CHECK-NEXT:   OtherMI:   S2_storeri_io %2:intregs, 0, %7:intregs :: (store (s32) into %ir.lsr.iv1)
+# CHECK-NEXT:     Base + 0 + I * -4, Len: 4
+# CHECK-NEXT:   Result: Overlap
 
-# TODO: There is a loop carried dependence from the load to the store but it
-#   is not recognised. addLoopCarriedDependences() should be modified to
-#   recognise the dependence and enable the following checks.
-# CHECK-AFTER-FIX: Overlap check:
-# CHECK-AFTER-FIX-NEXT:   BaseMI:   %{{[0-9]+}}:intregs = L2_loadri_io %{{[0-9]+}}:intregs, -8 :: (load (s32) from %ir.cgep)
-# CHECK-AFTER-FIX-NEXT:     Base + -8 + I * -4, Len: 4
-# CHECK-AFTER-FIX-NEXT:   OtherMI:   S2_storeri_io %{{[0-9]+}}:intregs, 0, %{{[0-9]+}}:intregs :: (store (s32) into %ir.lsr.iv1)
-# CHECK-AFTER-FIX-NEXT:     Base + 0 + I * -4, Len: 4
-# CHECK-AFTER-FIX-NEXT:   Result: Overlap!
 
 --- |
 
diff --git a/llvm/test/CodeGen/Hexagon/swp-carried-dep3.mir b/llvm/test/CodeGen/Hexagon/swp-carried-dep3.mir
index 91eb225580910..f47cafdde4731 100644
--- a/llvm/test/CodeGen/Hexagon/swp-carried-dep3.mir
+++ b/llvm/test/CodeGen/Hexagon/swp-carried-dep3.mir
@@ -7,12 +7,24 @@
 # requires to use a single CHECK-NOT to match such a Rec NodeSet. Fortunately
 # the atom '.' does not match a newline but anything else on a line.
 
-# CHECK: Overlap check:
+# CHECK:      Overlap check:
+# CHECK-NEXT:   BaseMI:   %13:intregs = S2_storerh_pi %12:intregs(tied-def 0), 2, %20:intregs :: (store (s16))
+# CHECK-NEXT:     Base + 0 + I * 2, Len: 2
+# CHECK-NEXT:   OtherMI:   %13:intregs = S2_storerh_pi %12:intregs(tied-def 0), 2, %20:intregs :: (store (s16))
+# CHECK-NEXT:     Base + 0 + I * 2, Len: 2
+# CHECK-NEXT:   Result: No overlap
+# CHECK-NEXT: Overlap check:
 # CHECK-NEXT:   BaseMI:   %13:intregs = S2_storerh_pi %12:intregs(tied-def 0), 2, %20:intregs :: (store (s16))
 # CHECK-NEXT:     Base + 0 + I * 2, Len: 2
 # CHECK-NEXT:   OtherMI:   %19:intregs, %15:intregs = L2_loadrh_pi %14:intregs(tied-def 1), 2 :: (load (s16))
 # CHECK-NEXT:     Base + 0 + I * 2, Len: 2
 # CHECK-NEXT:   Result: No overlap
+# CHECK-NEXT: Overlap check:
+# CHECK-NEXT:   BaseMI:   %19:intregs, %15:intregs = L2_loadrh_pi %14:intregs(tied-def 1), 2 :: (load (s16))
+# CHECK-NEXT:     Base + 0 + I * 2, Len: 2
+# CHECK-NEXT:   OtherMI:   %13:intregs = S2_storerh_pi %12:intregs(tied-def 0), 2, %20:intregs :: (store (s16))
+# CHECK-NEXT:     Base + 0 + I * 2, Len: 2
+# CHECK-NEXT:   Result: No overlap
 
 # CHECK-NOT: Rec NodeSet{{.+[[:space:]]}} SU(5){{.+[[:space:]]}} SU(7)
 
diff --git a/llvm/test/CodeGen/Hexagon/swp-epilog-phi9.ll b/llvm/test/CodeGen/Hexagon/swp-epilog-phi9.ll
index af1b848a8cf2d..33421ce4b40e7 100644
--- a/llvm/test/CodeGen/Hexagon/swp-epilog-phi9.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-epilog-phi9.ll
@@ -12,7 +12,7 @@
 ; CHECK: [[REG0]] = add(r{{[0-9]+}},#8)
 
 ; Function Attrs: nounwind
-define void @f0(ptr nocapture readonly %a0, i32 %a1) #0 {
+define void @f0(ptr noalias nocapture readonly %a0, i32 %a1, ptr noalias %a2) #0 {
 b0:
   %v0 = alloca [129 x i32], align 8
   br i1 undef, label %b1, label %b3
@@ -22,9 +22,9 @@ b1:                                               ; preds = %b0
 
 b2:                                               ; preds = %b2, %b1
   %v1 = phi ptr [ %a0, %b1 ], [ %v2, %b2 ]
-  %v2 = phi ptr [ undef, %b1 ], [ %v15, %b2 ]
-  %v3 = phi ptr [ null, %b1 ], [ %v4, %b2 ]
-  %v4 = phi ptr [ null, %b1 ], [ %v14, %b2 ]
+  %v2 = phi ptr [ %a0, %b1 ], [ %v15, %b2 ]
+  %v3 = phi ptr [ %a2, %b1 ], [ %v4, %b2 ]
+  %v4 = phi ptr [ %a2, %b1 ], [ %v14, %b2 ]
   %v5 = phi i32 [ 0, %b1 ], [ %v13, %b2 ]
   %v6 = phi ptr [ undef, %b1 ], [ %v12, %b2 ]
   %v7 = load i16, ptr %v2, align 2
diff --git a/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep1.mir b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep1.mir
new file mode 100644
index 0000000000000..45ebc75171940
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep1.mir
@@ -0,0 +1,110 @@
+# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 -pipeliner-experimental-cg=true | FileCheck %s
+# REQUIRES: asserts
+
+# Test that loop-carried memory dependencies are added correctly.
+# The original code is as follows.
+#
+# ```
+# void f(int *a, int n) {
+#   for (int i = 0; i < n-1; i++) {
+#     a[i] += a[i];
+#     a[i+1] += i;
+#   }
+# }
+# ```
+# 
+# Loop-carried dependencies exist from store of a[i+1] to load/store of a[i], but not vice versa.
+
+# CHECK: Loop Carried Edges:
+# CHECK-NEXT:   Loop carried edges from SU(6)
+# CHECK-NEXT:     Order
+# CHECK-NEXT:       SU(4)
+# CHECK-NEXT:   Loop carried edges from SU(8)
+# CHECK-NEXT:     Order
+# CHECK-NEXT:       SU(4)
+# CHECK-NEXT: calculateResMII:
+
+
+--- |
+  define dso_local void @f(ptr nocapture noundef %a, i32 noundef %n) local_unnamed_addr {
+  entry:
+    %cmp12 = icmp sgt i32 %n, 1
+    br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup
+  
+  for.body.preheader:
+    %.pre = load i32, ptr %a, align 4, !tbaa !5
+    %0 = add i32 %n, -1
+    %cgep = getelementptr i8, ptr %a, i32 4
+    br label %for.body
+  
+  for.cond.cleanup:
+    ret void
+  
+  for.body:
+    %lsr.iv14 = phi ptr [ %cgep, %for.body.preheader ], [ %cgep18, %for.body ]
+    %lsr.iv = phi i32 [ %0, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
+    %1 = phi i32 [ %add4, %for.body ], [ %.pre, %for.body.preheader ]
+    %i.013 = phi i32 [ %add2, %for.body ], [ 0, %for.body.preheader ]
+    %add = shl nsw i32 %1, 1
+    %cgep17 = getelementptr i8, ptr %lsr.iv14, i32 -4
+    store i32 %add, ptr %cgep17, align 4, !tbaa !5
+    %add2 = add nuw nsw i32 %i.013, 1
+    %2 = load i32, ptr %lsr.iv14, align 4, !tbaa !5
+    %add4 = add nsw i32 %2, %i.013
+    %3 = add i32 %i.013, %2
+    store i32 %3, ptr %lsr.iv14, align 4, !tbaa !5
+    %lsr.iv.next = add i32 %lsr.iv, -1
+    %exitcond.not = icmp eq i32 %lsr.iv.next, 0
+    %cgep18 = getelementptr i8, ptr %lsr.iv14, i32 4
+    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  }
+  
+  !5 = !{!6, !6, i64 0}
+  !6 = !{!"int", !7, i64 0}
+  !7 = !{!"omnipotent char", !8, i64 0}
+  !8 = !{!"Simple C/C++ TBAA"}
+
+...
+---
+name:            f
+tracksRegLiveness: true
+machineFunctionInfo: {}
+body:             |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: $r0, $r1
+  
+    %12:intregs = COPY $r1
+    %11:intregs = COPY $r0
+    %13:predregs = C2_cmpgti %12, 1
+    J2_jumpf %13, %bb.2, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+  
+  bb.1.for.body.preheader:
+    %0:intregs, %2:intregs = L2_loadri_pi %11, 4 :: (load (s32) from %ir.a, !tbaa !5)
+    %1:intregs = A2_addi %12, -1
+    %15:intregs = A2_tfrsi 0
+    %19:intregs = COPY %1
+    J2_loop0r %bb.3, %19, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jump %bb.3, implicit-def dead $pc
+  
+  bb.2.for.cond.cleanup:
+    PS_jmpret $r31, implicit-def dead $pc
+  
+  bb.3.for.body:
+    successors: %bb.2, %bb.3
+  
+    %3:intregs = PHI %2, %bb.1, %10, %bb.3
+    %5:intregs = PHI %0, %bb.1, %8, %bb.3
+    %6:intregs = PHI %15, %bb.1, %7, %bb.3
+    %16:intregs = nsw S2_asl_i_r %5, 1
+    S2_storeri_io %3, -4, killed %16 :: (store (s32) into %ir.cgep17, !tbaa !5)
+    %7:intregs = nuw nsw A2_addi %6, 1
+    %17:intregs = L2_loadri_io %3, 0 :: (load (s32) from %ir.lsr.iv14, !tbaa !5)
+    %8:intregs = A2_add killed %17, %6
+    S2_storeri_io %3, 0, %8 :: (store (s32) into %ir.lsr.iv14, !tbaa !5)
+    %10:intregs = A2_addi %3, 4
+    ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def $pc
+
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep2.mir b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep2.mir
new file mode 100644
index 0000000000000..2d02e7e64d4d6
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep2.mir
@@ -0,0 +1,104 @@
+# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 -pipeliner-experimental-cg=true | FileCheck %s
+# REQUIRES: asserts
+
+# Test that loop-carried memory dependencies are added correctly.
+# The original code is as follows.
+#
+# ```
+# void f(int *a, int n) {
+#   for (int i = 1; i < n; i++) {
+#     a[i] += a[i];
+#     a[i-1] += i;
+#   }
+# }
+# ```
+# 
+# Loop-carried dependencies exist from store of load/store of a[i] to store of a[i-1], but not vice versa.
+
+# CHECK: Loop Carried Edges:
+# CHECK-NEXT:   Loop carried edges from SU(3)
+# CHECK-NEXT:     Order
+# CHECK-NEXT:       SU(7)
+# CHECK-NEXT:   Loop carried edges from SU(5)
+# CHECK-NEXT:     Order
+# CHECK-NEXT:       SU(7)
+# CHECK-NEXT: calculateResMII:
+
+--- |
+  define dso_local void @f(ptr nocapture noundef %a, i32 noundef %n) local_unnamed_addr {
+  entry:
+    %cmp11 = icmp sgt i32 %n, 1
+    br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+  
+  for.body.preheader:
+    %load_initial = load i32, ptr %a, align 4
+    %cgep = getelementptr i8, ptr %a, i32 4
+    br label %for.body
+  
+  for.cond.cleanup:
+    ret void
+  
+  for.body:
+    %lsr.iv = phi ptr [ %cgep, %for.body.preheader ], [ %cgep16, %for.body ]
+    %store_forwarded = phi i32 [ %load_initial, %for.body.preheader ], [ %add, %for.body ]
+    %i.012 = phi i32 [ 1, %for.body.preheader ], [ %inc, %for.body ]
+    %0 = load i32, ptr %lsr.iv, align 4, !tbaa !5
+    %add = shl nsw i32 %0, 1
+    store i32 %add, ptr %lsr.iv, align 4, !tbaa !5
+    %1 = add i32 %store_forwarded, %i.012
+    %cgep15 = getelementptr i8, ptr %lsr.iv, i32 -4
+    store i32 %1, ptr %cgep15, align 4, !tbaa !5
+    %inc = add nuw nsw i32 %i.012, 1
+    %exitcond.not = icmp eq i32 %n, %inc
+    %cgep16 = getelementptr i8, ptr %lsr.iv, i32 4
+    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  }
+  
+  !5 = !{!6, !6, i64 0}
+  !6 = !{!"int", !7, i64 0}
+  !7 = !{!"omnipotent char", !8, i64 0}
+  !8 = !{!"Simple C/C++ TBAA"}
+
+...
+---
+name:            f
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: $r0, $r1
+  
+    %9:intregs = COPY $r1
+    %8:intregs = COPY $r0
+    %10:predregs = C2_cmpgti %9, 1
+    J2_jumpf %10, %bb.2, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+  
+  bb.1.for.body.preheader:
+    %0:intregs, %1:intregs = L2_loadri_pi %8, 4 :: (load (s32) from %ir.a)
+    %12:intregs = A2_tfrsi 1
+    %16:intregs = A2_addi %9, -1
+    %17:intregs = COPY %16
+    J2_loop0r %bb.3, %17, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jump %bb.3, implicit-def dead $pc
+  
+  bb.2.for.cond.cleanup:
+    PS_jmpret $r31, implicit-def dead $pc
+  
+  bb.3.for.body (machine-block-address-taken):
+    successors: %bb.2(0x04000000), %bb.3(0x7c000000)
+  
+    %2:intregs = PHI %1, %bb.1, %7, %bb.3
+    %3:intregs = PHI %0, %bb.1, %5, %bb.3
+    %4:intregs = PHI %12, %bb.1, %6, %bb.3
+    %13:intregs = L2_loadri_io %2, 0 :: (load (s32) from %ir.lsr.iv, !tbaa !5)
+    %5:intregs = nsw S2_asl_i_r killed %13, 1
+    S2_storeri_io %2, 0, %5 :: (store (s32) into %ir.lsr.iv, !tbaa !5)
+    %14:intregs = A2_add %3, %4
+    S2_storeri_io %2, -4, killed %14 :: (store (s32) into %ir.cgep15, !tbaa !5)
+    %6:intregs = nuw nsw A2_addi %4, 1
+    %7:intregs = A2_addi %2, 4
+    ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def $pc
+
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep3.mir b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep3.mir
new file mode 100644
index 0000000000000..16559a0230240
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep3.mir
@@ -0,0 +1,108 @@
+# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 -pipeliner-experimental-cg=true | FileCheck %s
+# REQUIRES: asserts
+
+# Test that loop-carried memory dependencies are added correctly.
+# The original code is as follows.
+#
+# ```
+# void f(int * restrict a, int * restrict b, int n) {
+#   for (int i = 0; i < n; i++) {
+#     a[i] += i;
+#     b[i] += a[i+1];
+#   }
+# }
+# ```
+# 
+# Loop-carried dependencies exist from load for a[i+1] to store for a[i].
+
+# CHECK: Loop Carried Edges:
+# CHECK-NEXT:   Loop carried edges from SU(7)
+# CHECK-NEXT:     Order
+# CHECK-NEXT:       SU(5)
+# CHECK-NEXT: calculateResMII:
+
+--- |
+  define dso_local void @f(ptr noalias nocapture noundef %a, ptr noalias nocapture noundef %b, i32 noundef %n) local_unnamed_addr {
+  entry:
+    %cmp11 = icmp sgt i32 %n, 0
+    br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup
+  
+  for.body.preheader:
+    %.pre = load i32, ptr %a, align 4, !tbaa !5
+    %cgep = getelementptr i8, ptr %a, i32 4
+    br label %for.body
+  
+  for.cond.cleanup:
+    ret void
+  
+  for.body:
+    %lsr.iv15 = phi ptr [ %cgep, %for.body.preheader ], [ %cgep20, %for.body ]
+    %lsr.iv13 = phi i32 [ %n, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
+    %lsr.iv = phi ptr [ %b, %for.body.preheader ], [ %cgep19, %for.body ]
+    %0 = phi i32 [ %2, %for.body ], [ %.pre, %for.body.preheader ]
+    %i.012 = phi i32 [ %add1, %for.body ], [ 0, %for.body.preheader ]
+    %1 = add i32 %0, %i.012
+    %cgep18 = getelementptr i8, ptr %lsr.iv15, i32 -4
+    store i32 %1, ptr %cgep18, align 4, !tbaa !5
+    %add1 = add nuw nsw i32 %i.012, 1
+    %2 = load i32, ptr %lsr.iv15, align 4, !tbaa !5
+    %3 = load i32, ptr %lsr.iv, align 4, !tbaa !5
+    %add4 = add nsw i32 %3, %2
+    store i32 %add4, ptr %lsr.iv, align 4, !tbaa !5
+    %lsr.iv.next = add i32 %lsr.iv13, -1
+    %exitcond.not = icmp eq i32 %lsr.iv.next, 0
+    %cgep19 = getelementptr i8, ptr %lsr.iv, i32 4
+    %cgep20 = getelementptr i8, ptr %lsr.iv15, i32 4
+    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  }
+  
+  !5 = !{!6, !6, i64 0}
+  !6 = !{!"int", !7, i64 0}
+  !7 = !{!"omnipotent char", !8, i64 0}
+  !8 = !{!"Simple C/C++ TBAA"}
+
+...
+---
+name:            f
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: $r0, $r1, $r2
+  
+    %14:intregs = COPY $r2
+    %13:intregs = COPY $r1
+    %12:intregs = COPY $r0
+    %15:predregs = C2_cmpgti %14, 0
+    J2_jumpf %15, %bb.2, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+  
+  bb.1.for.body.preheader:
+    %0:intregs, %1:intregs = L2_loadri_pi %12, 4 :: (load (s32) from %ir.a, !tbaa !5)
+    %17:intregs = A2_tfrsi 0
+    %22:intregs = COPY %14
+    J2_loop0r %bb.3, %22, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jump %bb.3, implicit-def dead $pc
+  
+  bb.2.for.cond.cleanup:
+    PS_jmpret $r31, implicit-def dead $pc
+  
+  bb.3.for.body:
+    successors: %bb.2, %bb.3
+  
+    %2:intregs = PHI %1, %bb.1, %11, %bb.3
+    %4:intregs = PHI %13, %bb.1, %10, %bb.3
+    %5:intregs = PHI %0, %bb.1, %8, %bb.3
+    %6:intregs = PHI %17, %bb.1, %7, %bb.3
+    %18:intregs = A2_add %5, %6
+    S2_storeri_io %2, -4, killed %18 :: (store (s32) into %ir.cgep18, !tbaa !5)
+    %7:intregs = nuw nsw A2_addi %6, 1
+    %8:intregs = L2_loadri_io %2, 0 :: (load (s32) from %ir.lsr.iv15, !tbaa !5)
+    %19:intregs = L2_loadri_io %4, 0 :: (load (s32) from %ir.lsr.iv, !tbaa !5)
+    %20:intregs = nsw A2_add killed %19, %8
+    %10:intregs = S2_storeri_pi %4, 4, killed %20 :: (store (s32) into %ir.lsr.iv, !tbaa !5)
+    %11:intregs = A2_addi %2, 4
+    ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def $pc
+
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep4.mir b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep4.mir
new file mode 100644
index 0000000000000..cc85d24e27b37
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep4.mir
@@ -0,0 +1,107 @@
+# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 -pipeliner-experimental-cg=true | FileCheck %s
+# REQUIRES: asserts
+
+# Test that loop carried memory dependences are computed correctly.
+# The original code is as follows.
+#
+# ```
+# void f(int *a, int n) {
+#   for (int i = 0; i < n-2; i++) {
+#     a[i] += a[i+10];
+#     a[i+2] += i;
+#   }
+# }
+# ```
+#
+# Here is what each instruction does.
+# SU(2): Load a[i+10]
+# SU(3): Store it to a[i]
+# SU(4): Load a[i+2], add i, then store it
+
+# CHECK: Loop Carried Edges:
+# CHECK-NEXT:   Loop carried edges from SU(2)
+# CHECK-NEXT:     Order
+# CHECK-NEXT:       SU(3)
+# CHECK-NEXT:       SU(4)
+# CHECK-NEXT:   Loop carried edges from SU(4)
+# CHECK-NEXT:     Order
+# CHECK-NEXT:       SU(3)
+# CHECK-NEXT: calculateResMII:
+
+--- |
+  define dso_local void @f(ptr nocapture noundef %a, i32 noundef %n) {
+  entry:
+    %cmp13 = icmp sgt i32 %n, 2
+    br i1 %cmp13, label %for.body.preheader, label %for.cond.cleanup
+  
+  for.body.preheader:
+    %0 = add i32 %n, -2
+    br label %for.body
+  
+  for.cond.cleanup:
+    ret void
+  
+  for.body:
+    %lsr.iv15 = phi ptr [ %a, %for.body.preheader ], [ %cgep19, %for.body ]
+    %lsr.iv = phi i32 [ %0, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
+    %i.014 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+    %cgep = getelementptr i8, ptr %lsr.iv15, i32 40
+    %1 = load i32, ptr %cgep, align 4, !tbaa !5
+    %2 = load i32, ptr %lsr.iv15, align 4, !tbaa !5
+    %add2 = add nsw i32 %2, %1
+    store i32 %add2, ptr %lsr.iv15, align 4, !tbaa !5
+    %cgep18 = getelementptr i8, ptr %lsr.iv15, i32 8
+    %3 = load i32, ptr %cgep18, align 4, !tbaa !5
+    %4 = add i32 %i.014, %3
+    store i32 %4, ptr %cgep18, align 4, !tbaa !5
+    %inc = add nuw nsw i32 %i.014, 1
+    %lsr.iv.next = add i32 %lsr.iv, -1
+    %exitcond.not = icmp eq i32 %lsr.iv.next, 0
+    %cgep19 = getelementptr i8, ptr %lsr.iv15, i32 4
+    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  }
+  
+  !5 = !{!6, !6, i64 0}
+  !6 = !{!"int", !7, i64 0}
+  !7 = !{!"omnipotent char", !8, i64 0}
+  !8 = !{!"Simple C/C++ TBAA"}
+
+...
+---
+name:            f
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: $r0, $r1
+  
+    %8:intregs = COPY $r1
+    %7:intregs = COPY $r0
+    %9:predregs = C2_cmpgti %8, 2
+    J2_jumpf %9, %bb.2, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+  
+  bb.1.for.body.preheader:
+    %0:intregs = A2_addi %8, -2
+    %11:intregs = A2_tfrsi 0
+    %14:intregs = COPY %0
+    J2_loop0r %bb.3, %14, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jump %bb.3, implicit-def dead $pc
+  
+  bb.2.for.cond.cleanup:
+    PS_jmpret $r31, implicit-def dead $pc
+  
+  bb.3.for.body:
+    successors: %bb.2, %bb.3
+  
+    %1:intregs = PHI %7, %bb.1, %6, %bb.3
+    %3:intregs = PHI %11, %bb.1, %4, %bb.3
+    %12:intregs = L2_loadri_io %1, 40 :: (load (s32) from %ir.cgep, !tbaa !5)
+    L4_add_memopw_io %1, 0, killed %12 :: (store (s32) into %ir.lsr.iv15, !tbaa !5), (load (s32) from %ir.lsr.iv15, !tbaa !5)
+    L4_add_memopw_io %1, 8, %3 :: (store (s32) into %ir.cgep18, !tbaa !5), (load (s32) from %ir.cgep18, !tbaa !5)
+    %4:intregs = nuw nsw A2_addi %3, 1
+    %6:intregs = A2_addi %1, 4
+    ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def $pc
+
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep5.mir b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep5.mir
new file mode 100644
index 0000000000000..a0b26b648c6d4
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep5.mir
@@ -0,0 +1,106 @@
+# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 -pipeliner-experimental-cg=true | FileCheck %s
+# REQUIRES: asserts
+
+# Test that loop carried memory dependencies are correct when there are two arrays 
+# that may point to the same memory location.
+#
+# ```
+# void f(int *a, int *b, int n) {
+#   for (int i = 0; i < n; i++) {
+#     a[i] += b[i];
+#     b[i] += a[i];
+#   }
+# }
+# ```
+#
+# Here is what each instruction does.
+# SU(2): Load b[i]
+# SU(3): Load a[i]
+# SU(5): Store a[i]
+# SU(6): Load b[i]
+# SU(8): Store b[i]
+
+# CHECK: Loop Carried Edges:
+# CHECK-NEXT:   Loop carried edges from SU(5)
+# CHECK-NEXT:     Order
+# CHECK-NEXT:       SU(2)
+# CHECK-NEXT:   Loop carried edges from SU(6)
+# CHECK-NEXT:     Order
+# CHECK-NEXT:       SU(5)
+# CHECK-NEXT:   Loop carried edges from SU(8)
+# CHECK-NEXT:     Order
+# CHECK-NEXT:       SU(3)
+# CHECK-NEXT:       SU(5)
+# CHECK-NEXT: calculateResMII:
+
+--- |
+  define dso_local void @f(ptr nocapture noundef %a, ptr nocapture noundef %b, i32 noundef %n) local_unnamed_addr {
+  entry:
+    %cmp12 = icmp sgt i32 %n, 0
+    br i1 %cmp12, label %for.body, label %for.cond.cleanup
+  
+  for.cond.cleanup:
+    ret void
+  
+  for.body:
+    %lsr.iv15 = phi ptr [ %cgep17, %for.body ], [ %b, %entry ]
+    %lsr.iv14 = phi ptr [ %cgep, %for.body ], [ %a, %entry ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ %n, %entry ]
+    %0 = load i32, ptr %lsr.iv15, align 4, !tbaa !5
+    %1 = load i32, ptr %lsr.iv14, align 4, !tbaa !5
+    %add = add nsw i32 %1, %0
+    store i32 %add, ptr %lsr.iv14, align 4, !tbaa !5
+    %2 = load i32, ptr %lsr.iv15, align 4, !tbaa !5
+    %add4 = add nsw i32 %2, %add
+    store i32 %add4, ptr %lsr.iv15, align 4, !tbaa !5
+    %lsr.iv.next = add i32 %lsr.iv, -1
+    %exitcond.not = icmp eq i32 %lsr.iv.next, 0
+    %cgep = getelementptr i8, ptr %lsr.iv14, i32 4
+    %cgep17 = getelementptr i8, ptr %lsr.iv15, i32 4
+    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  }
+  
+  !5 = !{!6, !6, i64 0}
+  !6 = !{!"int", !7, i64 0}
+  !7 = !{!"omnipotent char", !8, i64 0}
+  !8 = !{!"Simple C/C++ TBAA"}
+
+...
+---
+name:            f
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.3, %bb.1
+    liveins: $r0, $r1, $r2
+  
+    %8:intregs = COPY $r2
+    %7:intregs = COPY $r1
+    %6:intregs = COPY $r0
+    %9:predregs = C2_cmpgti %8, 0
+    J2_jumpf %9, %bb.1, implicit-def $pc
+  
+  bb.3:
+    %16:intregs = COPY %8
+    J2_loop0r %bb.2, %16, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jump %bb.2, implicit-def $pc
+  
+  bb.1.for.cond.cleanup:
+    PS_jmpret $r31, implicit-def dead $pc
+  
+  bb.2.for.body:
+    successors: %bb.1, %bb.2
+  
+    %0:intregs = PHI %7, %bb.3, %5, %bb.2
+    %1:intregs = PHI %6, %bb.3, %4, %bb.2
+    %10:intregs = L2_loadri_io %0, 0 :: (load (s32) from %ir.lsr.iv15, !tbaa !5)
+    %11:intregs = L2_loadri_io %1, 0 :: (load (s32) from %ir.lsr.iv14, !tbaa !5)
+    %12:intregs = nsw A2_add killed %11, killed %10
+    %4:intregs = S2_storeri_pi %1, 4, %12 :: (store (s32) into %ir.lsr.iv14, !tbaa !5)
+    %13:intregs = L2_loadri_io %0, 0 :: (load (s32) from %ir.lsr.iv15, !tbaa !5)
+    %14:intregs = nsw A2_add killed %13, %12
+    %5:intregs = S2_storeri_pi %0, 4, killed %14 :: (store (s32) into %ir.lsr.iv15, !tbaa !5)
+    ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.1, implicit-def $pc
+
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep6.mir b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep6.mir
new file mode 100644
index 0000000000000..57ca3589c6d23
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-loop-carried-order-dep6.mir
@@ -0,0 +1,153 @@
+# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 -pipeliner-experimental-cg=true | FileCheck %s
+# REQUIRES: asserts
+
+# Test that loop carried memory dependencies are computed correctly
+# when instructions related to global memory object exist in the loop.
+# The original code is as follows.
+# 
+# ```
+# volatile int x = 0;
+# void f(int * restrict a, int * restrict b, int * restrict c, int n) {
+#   for (int i = 0; i < n; i++) {
+#     a[i] *= c[i];
+#     b[i] *= c[i];
+#     x += i;
+#     a[i + 1] *= i;
+#     x += i;
+#     b[i + 1] *= i;
+#   }
+# }
+# ```
+
+# CHECK: Loop Carried Edges:
+# CHECK-NEXT:   Loop carried edges from SU(16)
+# CHECK-NEXT:     Order
+# CHECK-NEXT:       SU(6)
+# CHECK-NEXT:       SU(8)
+# CHECK-NEXT:       SU(10)
+# CHECK-NEXT:       SU(11)
+# CHECK-NEXT:   Loop carried edges from SU(17)
+# CHECK-NEXT:     Order
+# CHECK-NEXT:       SU(10)
+# CHECK-NEXT:       SU(11)
+# CHECK-NEXT:   Loop carried edges from SU(19)
+# CHECK-NEXT:     Order
+# CHECK-NEXT:       SU(10)
+# CHECK-NEXT:       SU(11)
+# CHECK-NEXT: calculateResMII:
+
+--- |
+  @x = dso_local global i32 0, align 4
+  
+  define dso_local void @f(ptr noalias nocapture noundef %a, ptr noalias nocapture noundef %b, ptr noalias nocapture noundef readonly %c, i32 noundef %n) {
+  entry:
+    %cmp26 = icmp sgt i32 %n, 0
+    br i1 %cmp26, label %for.body.preheader, label %for.cond.cleanup
+  
+  for.body.preheader:
+    %.pre = load i32, ptr %a, align 4, !tbaa !5
+    %.pre28 = load i32, ptr %b, align 4, !tbaa !5
+    %cgep = getelementptr i8, ptr %b, i32 4
+    %cgep37 = getelementptr i8, ptr %a, i32 4
+    br label %for.body
+  
+  for.cond.cleanup:
+    ret void
+  
+  for.body:
+    %lsr.iv35 = phi ptr [ %c, %for.body.preheader ], [ %cgep42, %for.body ]
+    %lsr.iv31 = phi ptr [ %cgep37, %for.body.preheader ], [ %cgep41, %for.body ]
+    %lsr.iv = phi ptr [ %cgep, %for.body.preheader ], [ %cgep40, %for.body ]
+    %0 = phi i32 [ %mul11, %for.body ], [ %.pre28, %for.body.preheader ]
+    %1 = phi i32 [ %mul7, %for.body ], [ %.pre, %for.body.preheader ]
+    %i.027 = phi i32 [ %add5, %for.body ], [ 0, %for.body.preheader ]
+    %2 = load i32, ptr %lsr.iv35, align 4, !tbaa !5
+    %mul = mul nsw i32 %1, %2
+    %cgep38 = getelementptr i8, ptr %lsr.iv31, i32 -4
+    store i32 %mul, ptr %cgep38, align 4, !tbaa !5
+    %mul4 = mul nsw i32 %0, %2
+    %cgep39 = getelementptr i8, ptr %lsr.iv, i32 -4
+    store i32 %mul4, ptr %cgep39, align 4, !tbaa !5
+    %3 = load volatile i32, ptr @x, align 4, !tbaa !5
+    %4 = add i32 %i.027, %3
+    store volatile i32 %4, ptr @x, align 4, !tbaa !5
+    %add5 = add nuw nsw i32 %i.027, 1
+    %5 = load i32, ptr %lsr.iv31, align 4, !tbaa !5
+    %mul7 = mul nsw i32 %5, %i.027
+    store i32 %mul7, ptr %lsr.iv31, align 4, !tbaa !5
+    %6 = load volatile i32, ptr @x, align 4, !tbaa !5
+    %7 = add i32 %i.027, %6
+    store volatile i32 %7, ptr @x, align 4, !tbaa !5
+    %8 = load i32, ptr %lsr.iv, align 4, !tbaa !5
+    %mul11 = mul nsw i32 %8, %i.027
+    store i32 %mul11, ptr %lsr.iv, align 4, !tbaa !5
+    %exitcond.not = icmp eq i32 %n, %add5
+    %cgep40 = getelementptr i8, ptr %lsr.iv, i32 4
+    %cgep41 = getelementptr i8, ptr %lsr.iv31, i32 4
+    %cgep42 = getelementptr i8, ptr %lsr.iv35, i32 4
+    br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+  }
+  
+  !5 = !{!6, !6, i64 0}
+  !6 = !{!"int", !7, i64 0}
+  !7 = !{!"omnipotent char", !8, i64 0}
+  !8 = !{!"Simple C/C++ TBAA"}
+
+...
+---
+name:            f
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: $r0, $r1, $r2, $r3
+  
+    %19:intregs = COPY $r3
+    %18:intregs = COPY $r2
+    %17:intregs = COPY $r1
+    %16:intregs = COPY $r0
+    %20:predregs = C2_cmpgti %19, 0
+    J2_jumpf %20, %bb.2, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+  
+  bb.1.for.body.preheader:
+    %0:intregs, %3:intregs = L2_loadri_pi %16, 4 :: (load (s32) from %ir.a, !tbaa !5)
+    %1:intregs, %2:intregs = L2_loadri_pi %17, 4 :: (load (s32) from %ir.b, !tbaa !5)
+    %22:intregs = A2_tfrsi 0
+    %26:intregs = C4_addipc target-flags(hexagon-pcrel) @x
+    %30:intregs = COPY %19
+    J2_loop0r %bb.3, %30, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+    J2_jump %bb.3, implicit-def dead $pc
+  
+  bb.2.for.cond.cleanup:
+    PS_jmpret $r31, implicit-def dead $pc
+  
+  bb.3.for.body:
+    successors: %bb.2, %bb.3
+  
+    %4:intregs = PHI %18, %bb.1, %15, %bb.3
+    %5:intregs = PHI %3, %bb.1, %14, %bb.3
+    %6:intregs = PHI %2, %bb.1, %13, %bb.3
+    %7:intregs = PHI %1, %bb.1, %12, %bb.3
+    %8:intregs = PHI %0, %bb.1, %11, %bb.3
+    %9:intregs = PHI %22, %bb.1, %10, %bb.3
+    %23:intregs, %15:intregs = L2_loadri_pi %4, 4 :: (load (s32) from %ir.lsr.iv35, !tbaa !5)
+    %24:intregs = nsw M2_mpyi %8, %23
+    S2_storeri_io %5, -4, killed %24 :: (store (s32) into %ir.cgep38, !tbaa !5)
+    %25:intregs = nsw M2_mpyi %7, %23
+    S2_storeri_io %6, -4, killed %25 :: (store (s32) into %ir.cgep39, !tbaa !5)
+    L4_add_memopw_io %26, 0, %9 :: (volatile store (s32) into @x, !tbaa !5), (volatile dereferenceable load (s32) from @x, !tbaa !5)
+    %10:intregs = nuw nsw A2_addi %9, 1
+    %27:intregs = L2_loadri_io %5, 0 :: (load (s32) from %ir.lsr.iv31, !tbaa !5)
+    %11:intregs = nsw M2_mpyi killed %27, %9
+    S2_storeri_io %5, 0, %11 :: (store (s32) into %ir.lsr.iv31, !tbaa !5)
+    L4_add_memopw_io %26, 0, %9 :: (volatile store (s32) into @x, !tbaa !5), (volatile dereferenceable load (s32) from @x, !tbaa !5)
+    %28:intregs = L2_loadri_io %6, 0 :: (load (s32) from %ir.lsr.iv, !tbaa !5)
+    %12:intregs = nsw M2_mpyi killed %28, %9
+    S2_storeri_io %6, 0, %12 :: (store (s32) into %ir.lsr.iv, !tbaa !5)
+    %13:intregs = A2_addi %6, 4
+    %14:intregs = A2_addi %5, 4
+    ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def $pc
+
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-loop-carried-unknown.ll b/llvm/test/CodeGen/Hexagon/swp-loop-carried-unknown.ll
index 4983af7482508..f4c3133839855 100644
--- a/llvm/test/CodeGen/Hexagon/swp-loop-carried-unknown.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-loop-carried-unknown.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=hexagon -hexagon-initial-cfg-cleanup=0 < %s -pipeliner-experimental-cg=true | FileCheck %s
+; RUN: llc -mtriple=hexagon -hexagon-initial-cfg-cleanup=0 -pipeliner-force-ii=3 < %s -pipeliner-experimental-cg=true | FileCheck %s
 
 ; Test that the pipeliner schedules a store before the load in which there is a
 ; loop carried dependence. Previously, the loop carried dependence wasn't added
diff --git a/llvm/test/CodeGen/Hexagon/swp-phi-start.ll b/llvm/test/CodeGen/Hexagon/swp-phi-start.ll
index 52c258656ec22..6a0a4c8f2f55a 100644
--- a/llvm/test/CodeGen/Hexagon/swp-phi-start.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-phi-start.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=hexagon -enable-pipeliner -pipeliner-max-stages=2 -disable-packetizer < %s | FileCheck %s
+; RUN: llc -mtriple=hexagon -enable-pipeliner -pipeliner-max-stages=2 -pipeliner-force-ii=3 -disable-packetizer < %s | FileCheck %s
 
 ; Test that the early start and late start values are computed correctly
 ; when a Phi depends on another Phi. In this case, they should occur in
diff --git a/llvm/test/CodeGen/Hexagon/swp-resmii-1.ll b/llvm/test/CodeGen/Hexagon/swp-resmii-1.ll
index c6bb4a6d570f4..cd3c5ed58c8b2 100644
--- a/llvm/test/CodeGen/Hexagon/swp-resmii-1.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-resmii-1.ll
@@ -3,7 +3,7 @@
 
 ; Test that checks that we compute the correct ResMII for haar.
 
-; CHECK: MII = 4 MAX_II = 14 (rec=1, res=4)
+; CHECK: MII = {{[0-9]+}} MAX_II = {{[0-9]+}} (rec={{[0-9]+}}, res=4)
 
 ; Function Attrs: nounwind
 define void @f0(ptr noalias nocapture readonly %a0, i32 %a1, i32 %a2, i32 %a3, ptr noalias nocapture %a4, i32 %a5) #0 {
diff --git a/llvm/test/CodeGen/Hexagon/vect/vect-shuffle.ll b/llvm/test/CodeGen/Hexagon/vect/vect-shuffle.ll
index 09746b30a4c18..c41d7b2a4e3ec 100644
--- a/llvm/test/CodeGen/Hexagon/vect/vect-shuffle.ll
+++ b/llvm/test/CodeGen/Hexagon/vect/vect-shuffle.ll
@@ -1,12 +1,13 @@
 ; REQUIRES: asserts
 ; RUN: llc -mtriple=hexagon -mcpu=hexagonv5 -disable-hsdr --stats -o - 2>&1 < %s | FileCheck %s
 
+; NOTE: Node order issues are reported by pipelier, but they are false positives.
+
 ; Check that store is post-incremented.
 ; CHECK-NOT: extractu(r{{[0-9]+}},#32,
 ; CHECK-NOT: insert
-; CHECK-NOT: Number of node order issues found
+; CHECK: Number of node order issues found
 ; CHECK: Number of loops software pipelined
-; CHECK-NOT: Number of node order issues found
 target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
 target triple = "hexagon"
 
diff --git a/llvm/test/CodeGen/PowerPC/sms-store-dependence.ll b/llvm/test/CodeGen/PowerPC/sms-store-dependence.ll
index d1ec320d55680..919a31f35b72b 100644
--- a/llvm/test/CodeGen/PowerPC/sms-store-dependence.ll
+++ b/llvm/test/CodeGen/PowerPC/sms-store-dependence.ll
@@ -1,51 +1,18 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs\
-; RUN:       -mcpu=pwr9 --ppc-enable-pipeliner 2>&1 | FileCheck %s
+; RUN:       -mcpu=pwr9 --ppc-enable-pipeliner --debug-only=pipeliner 2>&1 | FileCheck %s
 
 ; Test that the pipeliner schedules the store instructions correctly. Since
 ; there is a dependence between the store, they cannot be scheduled further than
 ; MII cycles/instructions apart. That is, the first store cannot occur multiple
 ; times before the second ctore in the schedule.
+
+; CHECK: SU([[STORE0:[0-9]+]]): {{.*}} (store (s8) {{.*}})
+; CHECK: SU([[STORE1:[0-9]+]]): {{.*}} (store (s8) {{.*}})
+; CHECK: Schedule Found? 1
+; CHECK: cycle [[#CYCLE0:]] (1) ([[STORE1]])
+; CHECK: cycle [[#CYCLE0+1]]
+; CHECK: cycle {{[0-9]+}} (0) ([[STORE0]])
 define dso_local void @comp_method(ptr noalias nocapture noundef readonly %0, ptr nocapture noundef writeonly %1, ptr nocapture noundef writeonly %2, i32 noundef %3, i32 noundef %4, i32 noundef %5, i32 noundef %6, i64 %v1) local_unnamed_addr {
-; CHECK-LABEL: comp_method:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    extsw 7, 8
-; CHECK-NEXT:    extsw 8, 9
-; CHECK-NEXT:    clrldi 9, 6, 32
-; CHECK-NEXT:    addi 6, 3, -1
-; CHECK-NEXT:    mtctr 9
-; CHECK-NEXT:    li 11, 0
-; CHECK-NEXT:    sradi 12, 11, 2
-; CHECK-NEXT:    add 5, 5, 8
-; CHECK-NEXT:    li 8, 2
-; CHECK-NEXT:    li 3, 8
-; CHECK-NEXT:    addi 11, 7, 0
-; CHECK-NEXT:    std 30, -16(1) # 8-byte Folded Spill
-; CHECK-NEXT:    lbzu 9, 1(6)
-; CHECK-NEXT:    add 12, 12, 10
-; CHECK-NEXT:    extsb 9, 9
-; CHECK-NEXT:    stbx 8, 4, 9
-; CHECK-NEXT:    add 9, 9, 12
-; CHECK-NEXT:    bdz .LBB0_2
-; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  .LBB0_1:
-; CHECK-NEXT:    lbzu 0, 1(6)
-; CHECK-NEXT:    sradi 12, 11, 2
-; CHECK-NEXT:    add 11, 11, 7
-; CHECK-NEXT:    add 12, 12, 10
-; CHECK-NEXT:    sldi 30, 9, 2
-; CHECK-NEXT:    add 9, 9, 30
-; CHECK-NEXT:    extsb 0, 0
-; CHECK-NEXT:    stbx 3, 5, 9
-; CHECK-NEXT:    add 9, 0, 12
-; CHECK-NEXT:    stbx 8, 4, 0
-; CHECK-NEXT:    bdnz .LBB0_1
-; CHECK-NEXT:  .LBB0_2:
-; CHECK-NEXT:    sldi 4, 9, 2
-; CHECK-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
-; CHECK-NEXT:    add 4, 9, 4
-; CHECK-NEXT:    stbx 3, 5, 4
-; CHECK-NEXT:    blr
   %8 = icmp sgt i32 %3, 64
   tail call void @llvm.assume(i1 %8)
   %9 = and i32 %3, 1
diff --git a/llvm/test/CodeGen/Thumb2/pipeliner-preserve-ties.mir b/llvm/test/CodeGen/Thumb2/pipeliner-preserve-ties.mir
index 4554f9f1fa23a..492a4ddfa28d2 100644
--- a/llvm/test/CodeGen/Thumb2/pipeliner-preserve-ties.mir
+++ b/llvm/test/CodeGen/Thumb2/pipeliner-preserve-ties.mir
@@ -2,7 +2,6 @@
 # RUN: llc -mtriple=thumbv7-none-eabi -mcpu=cortex-m7 -O3 -run-pass=tbaa,pipeliner %s -o - | FileCheck %s
 
 --- |
-  ; ModuleID = 'test.ll'
   source_filename = "test.ll"
   target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
   target triple = "thumbv7m-none-unknown-eabi"

>From 28117dbfef5bde36eb64b4757de0dc39b6887370 Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Thu, 17 Apr 2025 12:30:36 +0000
Subject: [PATCH 2/6] Fix the handlings of memoperands

---
 llvm/include/llvm/CodeGen/MachinePipeliner.h  |   3 -
 llvm/lib/CodeGen/MachinePipeliner.cpp         | 237 +++++++++---------
 .../test/CodeGen/Hexagon/swp-carried-dep1.mir |   6 -
 .../test/CodeGen/Hexagon/swp-carried-dep2.mir |   6 -
 .../test/CodeGen/Hexagon/swp-carried-dep3.mir |   6 -
 llvm/test/CodeGen/Hexagon/swp-epilog-phi13.ll |   6 +-
 llvm/test/CodeGen/Hexagon/swp-no-alias.mir    |  88 +++++++
 7 files changed, 205 insertions(+), 147 deletions(-)
 create mode 100644 llvm/test/CodeGen/Hexagon/swp-no-alias.mir

diff --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h
index c8a5240447363..2182cd9a731c6 100644
--- a/llvm/include/llvm/CodeGen/MachinePipeliner.h
+++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h
@@ -442,9 +442,6 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
     return ScheduleInfo[Node->NodeNum].ZeroLatencyHeight;
   }
 
-  bool hasLoopCarriedMemDep(const MachineInstr *Src, const MachineInstr *Dst,
-                            BatchAAResults *BAA) const;
-
   void applyInstrChange(MachineInstr *MI, SMSchedule &Schedule);
 
   void fixupRegisterOverlaps(std::deque<SUnit *> &Instrs);
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 675149a992a7d..8f255717d0b56 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -825,81 +825,38 @@ static bool isGlobalMemoryObject(MachineInstr &MI) {
 /// Return the underlying objects for the memory references of an instruction.
 /// This function calls the code in ValueTracking, but first checks that the
 /// instruction has a memory operand.
-static void getUnderlyingObjectsForInstr(const MachineInstr *MI,
-                                         SmallVectorImpl<const Value *> &Objs) {
+static bool getUnderlyingObjectsForInstr(const MachineInstr *MI,
+                                         SmallVectorImpl<const Value *> &Objs,
+                                         AAMDNodes *AA) {
+  *AA = AAMDNodes();
   if (!MI->hasOneMemOperand())
-    return;
+    return false;
   MachineMemOperand *MM = *MI->memoperands_begin();
   if (!MM->getValue())
-    return;
+    return false;
   getUnderlyingObjects(MM->getValue(), Objs);
-  for (const Value *V : Objs) {
-    if (!isIdentifiedObject(V)) {
-      Objs.clear();
-      return;
-    }
-  }
-}
-
-static std::optional<MemoryLocation>
-getMemoryLocationForAA(const MachineInstr *MI) {
-  const MachineMemOperand *MMO = *MI->memoperands_begin();
-  const Value *Val = MMO->getValue();
-  if (!Val)
-    return std::nullopt;
-  auto MemLoc = MemoryLocation::getBeforeOrAfter(Val, MMO->getAAInfo());
-
-  // Peel off noalias information from `AATags` because it might be valid only
-  // in single iteration.
-  // FIXME: This is too conservative. Checking
-  // `llvm.experimental.noalias.scope.decl` instrinsics in the original LLVM IR
-  // can perform more accuurately.
-  // MemLoc.AATags.NoAlias = nullptr;
-  return MemLoc;
+  *AA = MM->getAAInfo();
+  return true;
 }
 
-/// Return true for an memory dependence that is loop carried
-/// potentially. A dependence is loop carried if the destination defines a value
-/// that may be used or defined by the source in a subsequent iteration.
-bool SwingSchedulerDAG::hasLoopCarriedMemDep(const MachineInstr *Src,
-                                             const MachineInstr *Dst,
-                                             BatchAAResults *BAA) const {
-  if (!SwpPruneLoopCarried)
-    return true;
-
-  // First, check the dependence by comparing base register, offset, and
-  // step value of the loop.
-  switch (mayOverlapInLaterIter(Src, Dst)) {
-  case AliasResult::Kind::MustAlias:
-    return true;
-  case AliasResult::Kind::NoAlias:
-    return false;
-  case AliasResult::Kind::MayAlias:
-    break;
-  default:
-    llvm_unreachable("Unexpected alias");
-  }
-
-  // If we cannot determine the dependence by previouse check, then
-  // check by using alias analysis.
-  if (!BAA)
-    return true;
-
-  const auto MemLoc1 = getMemoryLocationForAA(Src);
-  const auto MemLoc2 = getMemoryLocationForAA(Dst);
-  if (!MemLoc1.has_value() || !MemLoc2.has_value())
-    return true;
-  switch (BAA->alias(*MemLoc1, *MemLoc2)) {
-  case AliasResult::Kind::MayAlias:
-  case AliasResult::Kind::MustAlias:
-  case AliasResult::Kind::PartialAlias:
-    return true;
-  case AliasResult::Kind::NoAlias:
-    return false;
-  default:
-    llvm_unreachable("Unexpected alias");
-  }
-}
+// static std::optional<MemoryLocation>
+// getMemoryLocationForAA(const MachineInstr *MI, const Value *Val) {
+//   const MachineMemOperand *MMO = *MI->memoperands_begin();
+//   const Value *Val = MMO->getValue();
+//   if (!Val)
+//     return std::nullopt;
+//   auto MemLoc = MemoryLocation::getBeforeOrAfter(Val, MMO->getAAInfo());
+//
+//   // Peel off noalias information from `AATags` because it might be valid
+//   only
+//   // in single iteration.
+//   // FIXME: This is too conservative. Checking
+//   // `llvm.experimental.noalias.scope.decl` instrinsics in the original LLVM
+//   IR
+//   // can perform more accuurately.
+//   // MemLoc.AATags.NoAlias = nullptr;
+//   return MemLoc;
+// }
 
 /// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
 /// processes dependences for PHIs. This function adds true dependences
@@ -1523,6 +1480,38 @@ class HighRegisterPressureDetector {
   }
 };
 
+struct SUnitWithMemInfo {
+  SUnit *SU;
+  SmallVector<const Value *, 2> Objs;
+  AAMDNodes AATags;
+  bool IsAllIdentified = false;
+  bool IsUnknown = true;
+
+  SUnitWithMemInfo(SUnit *SU) : SU(SU) { init(); }
+
+  bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const {
+    if (!IsAllIdentified || !Other.IsAllIdentified)
+      return false;
+    for (const Value *Obj : Objs)
+      if (llvm::is_contained(Other.Objs, Obj))
+        return false;
+    return true;
+  }
+
+private:
+  void init() {
+    if (!getUnderlyingObjectsForInstr(SU->getInstr(), Objs, &AATags))
+      return;
+
+    IsUnknown = false;
+    for (const Value *Obj : Objs)
+      if (!isIdentifiedObject(Obj)) {
+        IsAllIdentified = false;
+        break;
+      }
+  }
+};
+
 /// Add loop-carried chain dependencies. This class handles the same type of
 /// dependencies added by `ScheduleDAGInstrs::buildSchedGraph`, but takes into
 /// account dependencies across iterations.
@@ -1555,13 +1544,12 @@ class LoopCarriedOrderDepsTracker {
 
   // Retains loads and stores classified by the underlying objects.
   struct LoadStoreChunk {
-    Value2SUs Loads, Stores;
-    SUsType UnknownLoads, UnknownStores;
+    SmallVector<SUnitWithMemInfo, 4> Loads;
+    SmallVector<SUnitWithMemInfo, 4> Stores;
   };
 
   SwingSchedulerDAG *DAG;
   std::unique_ptr<BatchAAResults> BAA;
-  const Value *UnknownValue;
   std::vector<SUnit> &SUnits;
 
   // The size of SUnits, for convenience.
@@ -1589,8 +1577,6 @@ class LoopCarriedOrderDepsTracker {
   LoopCarriedOrderDepsTracker(SwingSchedulerDAG *SSD, AAResults *AA)
       : DAG(SSD), BAA(nullptr), SUnits(DAG->SUnits), N(SUnits.size()),
         AdjMatrix(N, BitVector(N)), LoopCarried(N, BitVector(N)) {
-    UnknownValue =
-        UndefValue::get(Type::getVoidTy(DAG->MF.getFunction().getContext()));
     if (AA) {
       BAA = std::make_unique<BatchAAResults>(*AA);
       BAA->enableCrossIterationMode();
@@ -1666,69 +1652,74 @@ class LoopCarriedOrderDepsTracker {
     return std::nullopt;
   }
 
-  void addDependencesBetweenSUs(const SUsType &From, const SUsType &To) {
-    for (SUnit *SUa : From)
-      for (SUnit *SUb : To)
-        if (DAG->hasLoopCarriedMemDep(SUa->getInstr(), SUb->getInstr(),
-                                      BAA.get()))
-          LoopCarried[SUa->NodeNum].set(SUb->NodeNum);
+  bool hasLoopCarriedMemDep(const SUnitWithMemInfo &Src,
+                            const SUnitWithMemInfo &Dst) const {
+    if (!SwpPruneLoopCarried)
+      return true;
+
+    if (Src.isTriviallyDisjoint(Dst))
+      return false;
+
+    // First, check the dependence by comparing base register, offset, and
+    // step value of the loop.
+    switch (
+        DAG->mayOverlapInLaterIter(Src.SU->getInstr(), Dst.SU->getInstr())) {
+    case AliasResult::Kind::MustAlias:
+      return true;
+    case AliasResult::Kind::NoAlias:
+      return false;
+    case AliasResult::Kind::MayAlias:
+      break;
+    default:
+      llvm_unreachable("Unexpected alias");
+    }
+
+    // If we cannot determine the dependence by previouse check, then
+    // check by using alias analysis.
+    if (!BAA || Src.IsUnknown || Dst.IsUnknown)
+      return true;
+
+    for (const Value *SrcObj : Src.Objs)
+      for (const Value *DstObj : Dst.Objs) {
+        const auto SrcLoc =
+            MemoryLocation::getBeforeOrAfter(SrcObj, Src.AATags);
+        const auto DstLoc =
+            MemoryLocation::getBeforeOrAfter(DstObj, Dst.AATags);
+        if (!BAA->isNoAlias(SrcLoc, DstLoc))
+          return true;
+      }
+
+    return false;
   }
 
-  void addDependenciesOfObj(const SUsType &From, const Value *Obj,
-                            const Value2SUs &To) {
-    auto *Ite = To.find(Obj);
-    if (Ite != To.end())
-      addDependencesBetweenSUs(From, Ite->second);
+  void addDependencesBetweenSUs(const SUnitWithMemInfo &From,
+                                const SUnitWithMemInfo &To) {
+    if (From.SU == To.SU)
+      return;
+    if (hasLoopCarriedMemDep(From, To))
+      LoopCarried[From.SU->NodeNum].set(To.SU->NodeNum);
   }
 
   void addDependencesBetweenChunks(const LoadStoreChunk &From,
                                    const LoadStoreChunk &To) {
-    // Add dependencies from store with known object
-    for (auto &[Obj, Stores] : From.Stores) {
-      addDependenciesOfObj(Stores, Obj, To.Stores);
-      addDependenciesOfObj(Stores, Obj, To.Loads);
-      addDependencesBetweenSUs(Stores, To.UnknownStores);
-      addDependencesBetweenSUs(Stores, To.UnknownLoads);
-    }
+    for (const SUnitWithMemInfo &Src : From.Stores)
+      for (const SUnitWithMemInfo &Dst : To.Stores)
+        addDependencesBetweenSUs(Src, Dst);
 
-    // Add dependencies from load with known object
-    for (auto &[Obj, Loads] : From.Loads) {
-      addDependenciesOfObj(Loads, Obj, To.Stores);
-      addDependencesBetweenSUs(Loads, To.UnknownStores);
-    }
+    for (const SUnitWithMemInfo &Src : From.Stores)
+      for (const SUnitWithMemInfo &Dst : To.Loads)
+        addDependencesBetweenSUs(Src, Dst);
 
-    // Add dependencies from load/store with unknown object
-    for ([[maybe_unused]] auto &[Obj, Stores] : To.Stores) {
-      addDependencesBetweenSUs(From.UnknownStores, Stores);
-      addDependencesBetweenSUs(From.UnknownLoads, Stores);
-    }
-    for ([[maybe_unused]] auto &[Obj, Loads] : To.Loads)
-      addDependencesBetweenSUs(From.UnknownStores, Loads);
-    addDependencesBetweenSUs(From.UnknownStores, To.UnknownStores);
-    addDependencesBetweenSUs(From.UnknownStores, To.UnknownLoads);
-    addDependencesBetweenSUs(From.UnknownLoads, To.UnknownStores);
+    for (const SUnitWithMemInfo &Src : From.Loads)
+      for (const SUnitWithMemInfo &Dst : To.Stores)
+        addDependencesBetweenSUs(Src, Dst);
   }
 
   void updateLoadStoreChunk(SUnit *SU, LoadStoreChunk &Chunk) {
     const MachineInstr *MI = SU->getInstr();
     if (!MI->mayLoadOrStore())
       return;
-    SmallVector<const Value *, 4> Objs;
-    getUnderlyingObjectsForInstr(MI, Objs);
-    for (auto &Obj : Objs) {
-      if (Obj == UnknownValue) {
-        Objs.clear();
-        break;
-      }
-    }
-
-    if (Objs.empty()) {
-      (MI->mayStore() ? Chunk.UnknownStores : Chunk.UnknownLoads).push_back(SU);
-    } else {
-      auto &Map = (MI->mayStore() ? Chunk.Stores : Chunk.Loads);
-      for (const auto *Obj : Objs)
-        Map[Obj].push_back(SU);
-    }
+    (MI->mayStore() ? Chunk.Stores : Chunk.Loads).emplace_back(SU);
   }
 
   void addLoopCarriedDependencies() {
diff --git a/llvm/test/CodeGen/Hexagon/swp-carried-dep1.mir b/llvm/test/CodeGen/Hexagon/swp-carried-dep1.mir
index b42f98c44845e..84bf4597edf2a 100644
--- a/llvm/test/CodeGen/Hexagon/swp-carried-dep1.mir
+++ b/llvm/test/CodeGen/Hexagon/swp-carried-dep1.mir
@@ -6,12 +6,6 @@
 # CHECK:      Overlap check:
 # CHECK-NEXT:   BaseMI:   S2_storerh_io %12:intregs, 0, %18:intregs :: (store (s16) into %ir.lsr.iv24)
 # CHECK-NEXT:     Base + 0 + I * 4, Len: 2
-# CHECK-NEXT:   OtherMI:   S2_storerh_io %12:intregs, 0, %18:intregs :: (store (s16) into %ir.lsr.iv24)
-# CHECK-NEXT:     Base + 0 + I * 4, Len: 2
-# CHECK-NEXT:   Result: No overlap
-# CHECK-NEXT: Overlap check:
-# CHECK-NEXT:   BaseMI:   S2_storerh_io %12:intregs, 0, %18:intregs :: (store (s16) into %ir.lsr.iv24)
-# CHECK-NEXT:     Base + 0 + I * 4, Len: 2
 # CHECK-NEXT:   OtherMI:   %17:intregs = L2_loadrh_io %12:intregs, -8 :: (load (s16) from %ir.cgep10)
 # CHECK-NEXT:     Base + -8 + I * 4, Len: 2
 # CHECK-NEXT:   Result: Overlap
diff --git a/llvm/test/CodeGen/Hexagon/swp-carried-dep2.mir b/llvm/test/CodeGen/Hexagon/swp-carried-dep2.mir
index cd5fb06ec411d..413e541ed94b8 100644
--- a/llvm/test/CodeGen/Hexagon/swp-carried-dep2.mir
+++ b/llvm/test/CodeGen/Hexagon/swp-carried-dep2.mir
@@ -7,12 +7,6 @@
 # CHECK:      Overlap check:
 # CHECK-NEXT:   BaseMI:   S2_storeri_io %2:intregs, 0, %7:intregs :: (store (s32) into %ir.lsr.iv1)
 # CHECK-NEXT:     Base + 0 + I * -4, Len: 4
-# CHECK-NEXT:   OtherMI:   S2_storeri_io %2:intregs, 0, %7:intregs :: (store (s32) into %ir.lsr.iv1)
-# CHECK-NEXT:     Base + 0 + I * -4, Len: 4
-# CHECK-NEXT:   Result: No overlap
-# CHECK-NEXT: Overlap check:
-# CHECK-NEXT:   BaseMI:   S2_storeri_io %2:intregs, 0, %7:intregs :: (store (s32) into %ir.lsr.iv1)
-# CHECK-NEXT:     Base + 0 + I * -4, Len: 4
 # CHECK-NEXT:   OtherMI:   %5:intregs = L2_loadri_io %2:intregs, -8 :: (load (s32) from %ir.cgep)
 # CHECK-NEXT:     Base + -8 + I * -4, Len: 4
 # CHECK-NEXT:   Result: No overlap
diff --git a/llvm/test/CodeGen/Hexagon/swp-carried-dep3.mir b/llvm/test/CodeGen/Hexagon/swp-carried-dep3.mir
index f47cafdde4731..3976b60ffab5b 100644
--- a/llvm/test/CodeGen/Hexagon/swp-carried-dep3.mir
+++ b/llvm/test/CodeGen/Hexagon/swp-carried-dep3.mir
@@ -10,12 +10,6 @@
 # CHECK:      Overlap check:
 # CHECK-NEXT:   BaseMI:   %13:intregs = S2_storerh_pi %12:intregs(tied-def 0), 2, %20:intregs :: (store (s16))
 # CHECK-NEXT:     Base + 0 + I * 2, Len: 2
-# CHECK-NEXT:   OtherMI:   %13:intregs = S2_storerh_pi %12:intregs(tied-def 0), 2, %20:intregs :: (store (s16))
-# CHECK-NEXT:     Base + 0 + I * 2, Len: 2
-# CHECK-NEXT:   Result: No overlap
-# CHECK-NEXT: Overlap check:
-# CHECK-NEXT:   BaseMI:   %13:intregs = S2_storerh_pi %12:intregs(tied-def 0), 2, %20:intregs :: (store (s16))
-# CHECK-NEXT:     Base + 0 + I * 2, Len: 2
 # CHECK-NEXT:   OtherMI:   %19:intregs, %15:intregs = L2_loadrh_pi %14:intregs(tied-def 1), 2 :: (load (s16))
 # CHECK-NEXT:     Base + 0 + I * 2, Len: 2
 # CHECK-NEXT:   Result: No overlap
diff --git a/llvm/test/CodeGen/Hexagon/swp-epilog-phi13.ll b/llvm/test/CodeGen/Hexagon/swp-epilog-phi13.ll
index 82a1067becede..a75bdca109db9 100644
--- a/llvm/test/CodeGen/Hexagon/swp-epilog-phi13.ll
+++ b/llvm/test/CodeGen/Hexagon/swp-epilog-phi13.ll
@@ -10,7 +10,7 @@
 ; CHECK: endloop0
 
 ; Function Attrs: nounwind
-define ptr @f0(ptr nocapture readonly %a0, i32 %a1, i32 %a2, i32 %a3,  ptr %b) #0 {
+define ptr @f0(ptr nocapture readonly %a0, i32 %a1, i32 %a2, i32 %a3,  ptr %b, ptr %c2, ptr %c3) #0 {
 b0:
   br i1 undef, label %b1, label %b3
 
@@ -20,8 +20,8 @@ b1:                                               ; preds = %b0
 b2:                                               ; preds = %b2, %b1
   %v1 = phi ptr [ %a0, %b1 ], [ %v2, %b2 ]
   %v2 = phi ptr [ undef, %b1 ], [ %v15, %b2 ]
-  %v3 = phi ptr [ null, %b1 ], [ %v4, %b2 ]
-  %v4 = phi ptr [ null, %b1 ], [ %v14, %b2 ]
+  %v3 = phi ptr [ %c2, %b1 ], [ %v4, %b2 ]
+  %v4 = phi ptr [ %c3, %b1 ], [ %v14, %b2 ]
   %v5 = phi i32 [ 0, %b1 ], [ %v13, %b2 ]
   %v6 = phi ptr [ undef, %b1 ], [ %v12, %b2 ]
   %a = mul i32 %v5, %a2
diff --git a/llvm/test/CodeGen/Hexagon/swp-no-alias.mir b/llvm/test/CodeGen/Hexagon/swp-no-alias.mir
new file mode 100644
index 0000000000000..58b245911b25f
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-no-alias.mir
@@ -0,0 +1,88 @@
+# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# CHECK: Loop Carried Edges:
+# CHECK-NEXT: calculateResMII:
+
+--- |
+  define void @foo(ptr noundef readonly captures(none) %in, ptr noalias noundef writeonly captures(none) %out, i32 noundef %width) local_unnamed_addr #0 {
+  entry:
+    %cmp7 = icmp sgt i32 %width, 0
+    br i1 %cmp7, label %for.body.preheader, label %for.end
+  
+  for.body.preheader:                               ; preds = %entry
+    %0 = add i32 %width, 128
+    br label %for.body
+  
+  for.body:                                         ; preds = %for.body.preheader, %for.body
+    %lsr.iv = phi i32 [ %0, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
+    %optr.010 = phi ptr [ %cgep4, %for.body ], [ %out, %for.body.preheader ]
+    %iptr.09 = phi ptr [ %cgep5, %for.body ], [ %in, %for.body.preheader ]
+    %ald = load <128 x i8>, ptr %iptr.09, align 128, !tbaa !4
+    %cst = bitcast <128 x i8> %ald to <32 x i32>
+    store <32 x i32> %cst, ptr %optr.010, align 128, !tbaa !4
+    %cgep = getelementptr i8, ptr %iptr.09, i32 128
+    %ald1 = load <128 x i8>, ptr %cgep, align 128, !tbaa !4
+    %cst2 = bitcast <128 x i8> %ald1 to <32 x i32>
+    %cgep3 = getelementptr i8, ptr %optr.010, i32 128
+    store <32 x i32> %cst2, ptr %cgep3, align 128, !tbaa !4
+    %lsr.iv.next = add i32 %lsr.iv, -128
+    %cmp = icmp samesign ugt i32 %lsr.iv.next, 128
+    %cgep4 = getelementptr i8, ptr %optr.010, i32 256
+    %cgep5 = getelementptr i8, ptr %iptr.09, i32 256
+    br i1 %cmp, label %for.body, label %for.end, !llvm.loop !7
+  
+  for.end:                                          ; preds = %for.body, %entry
+    ret void
+  }
+  
+  attributes #0 = { "target-cpu"="hexagonv60" "target-features"="+hvx-length128b,+hvxv69,+v66,-long-calls" }
+
+  !4 = !{!5, !5, i64 0}
+  !5 = !{!"omnipotent char", !6, i64 0}
+  !6 = !{!"Simple C/C++ TBAA"}
+  !7 = distinct !{!7, !8, !9}
+  !8 = !{!"llvm.loop.mustprogress"}
+  !9 = !{!"llvm.loop.unroll.disable"}
+...
+---
+name:            foo
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.1(0x50000000), %bb.3(0x30000000)
+    liveins: $r0, $r1, $r2
+  
+    %9:intregs = COPY $r2
+    %8:intregs = COPY $r1
+    %7:intregs = COPY $r0
+    %10:predregs = C2_cmpgti %9, 0
+    J2_jumpf %10, %bb.3, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+  
+  bb.1.for.body.preheader:
+    successors: %bb.2(0x80000000)
+  
+    %0:intregs = A2_addi %9, 128
+    %15:intregs = A2_addi %0, -1
+    %16:intregs = S2_lsr_i_r %15, 7
+    %17:intregs = COPY %16
+    J2_loop0r %bb.2, %17, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+  
+  bb.2.for.body (machine-block-address-taken):
+    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
+  
+    %2:intregs = PHI %8, %bb.1, %5, %bb.2
+    %3:intregs = PHI %7, %bb.1, %6, %bb.2
+    %12:hvxvr = V6_vL32b_ai %3, 0 :: (load (s1024) from %ir.iptr.09, !tbaa !4)
+    V6_vS32b_ai %2, 0, killed %12 :: (store (s1024) into %ir.optr.010, !tbaa !4)
+    %13:hvxvr = V6_vL32b_ai %3, 128 :: (load (s1024) from %ir.cgep, !tbaa !4)
+    V6_vS32b_ai %2, 128, killed %13 :: (store (s1024) into %ir.cgep3, !tbaa !4)
+    %5:intregs = A2_addi %2, 256
+    %6:intregs = A2_addi %3, 256
+    ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.3, implicit-def dead $pc
+  
+  bb.3.for.end:
+    PS_jmpret $r31, implicit-def dead $pc
+...

>From 0b3cc4c67f7a9a3b04f8d01c1c680a77629bfd16 Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Fri, 18 Apr 2025 11:41:03 +0000
Subject: [PATCH 3/6] Fix output dependencies

---
 llvm/lib/CodeGen/MachinePipeliner.cpp | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 8f255717d0b56..7ba4925d086d9 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -1861,10 +1861,24 @@ unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
 void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
     const SwingSchedulerDDG *DDG) {
   BitVector Added(SUnits.size());
-  for (int I = 0, E = SUnits.size(); I != E; ++I) {
+  DenseMap<int, int> OutputDeps;
+  for (int i = 0, e = SUnits.size(); i != e; ++i) {
     Added.reset();
     // Add any successor to the adjacency matrix and exclude duplicates.
-    for (const auto &OE : DDG->getOutEdges(&SUnits[I])) {
+    for (const auto &OE : DDG->getOutEdges(&SUnits[i])) {
+      // Only create a back-edge on the first and last nodes of a dependence
+      // chain. This records any chains and adds them later.
+      if (OE.isOutputDep()) {
+        int N = OE.getDst()->NodeNum;
+        int BackEdge = i;
+        auto Dep = OutputDeps.find(BackEdge);
+        if (Dep != OutputDeps.end()) {
+          BackEdge = Dep->second;
+          OutputDeps.erase(Dep);
+        }
+        OutputDeps[N] = BackEdge;
+      }
+
       // Do not process a boundary node, an artificial node.
       if (OE.getDst()->isBoundaryNode() || OE.isArtificial())
         continue;
@@ -1890,11 +1904,18 @@ void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
       int N = OE.getDst()->NodeNum;
 
       if (!Added.test(N)) {
-        AdjK[I].push_back(N);
+        AdjK[i].push_back(N);
         Added.set(N);
       }
     }
   }
+
+  // Add back-edges in the adjacency matrix for the output dependences.
+  for (auto &OD : OutputDeps)
+    if (!Added.test(OD.second)) {
+      AdjK[OD.first].push_back(OD.second);
+      Added.set(OD.second);
+    }
 }
 
 /// Identify an elementary circuit in the dependence graph starting at the

>From a77280749302666c50df7722510c8c03ef1afe0f Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Fri, 18 Apr 2025 12:39:08 +0000
Subject: [PATCH 4/6] Add alias check for MemOperand

---
 llvm/lib/CodeGen/MachinePipeliner.cpp | 46 ++++++++++++++++-----------
 1 file changed, 27 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 7ba4925d086d9..8270948991bc1 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -822,23 +822,6 @@ static bool isGlobalMemoryObject(MachineInstr &MI) {
          (MI.hasOrderedMemoryRef() && !MI.isDereferenceableInvariantLoad());
 }
 
-/// Return the underlying objects for the memory references of an instruction.
-/// This function calls the code in ValueTracking, but first checks that the
-/// instruction has a memory operand.
-static bool getUnderlyingObjectsForInstr(const MachineInstr *MI,
-                                         SmallVectorImpl<const Value *> &Objs,
-                                         AAMDNodes *AA) {
-  *AA = AAMDNodes();
-  if (!MI->hasOneMemOperand())
-    return false;
-  MachineMemOperand *MM = *MI->memoperands_begin();
-  if (!MM->getValue())
-    return false;
-  getUnderlyingObjects(MM->getValue(), Objs);
-  *AA = MM->getAAInfo();
-  return true;
-}
-
 // static std::optional<MemoryLocation>
 // getMemoryLocationForAA(const MachineInstr *MI, const Value *Val) {
 //   const MachineMemOperand *MMO = *MI->memoperands_begin();
@@ -1483,6 +1466,7 @@ class HighRegisterPressureDetector {
 struct SUnitWithMemInfo {
   SUnit *SU;
   SmallVector<const Value *, 2> Objs;
+  const Value *MMOValue = nullptr;
   AAMDNodes AATags;
   bool IsAllIdentified = false;
   bool IsUnknown = true;
@@ -1500,9 +1484,8 @@ struct SUnitWithMemInfo {
 
 private:
   void init() {
-    if (!getUnderlyingObjectsForInstr(SU->getInstr(), Objs, &AATags))
+    if (!getUnderlyingObjects())
       return;
-
     IsUnknown = false;
     for (const Value *Obj : Objs)
       if (!isIdentifiedObject(Obj)) {
@@ -1510,6 +1493,21 @@ struct SUnitWithMemInfo {
         break;
       }
   }
+  /// Return the underlying objects for the memory references of an instruction.
+  /// This function calls the code in ValueTracking, but first checks that the
+  /// instruction has a memory operand.
+  bool getUnderlyingObjects() {
+    const MachineInstr *MI = SU->getInstr();
+    if (!MI->hasOneMemOperand())
+      return false;
+    MachineMemOperand *MM = *MI->memoperands_begin();
+    if (!MM->getValue())
+      return false;
+    MMOValue = MM->getValue();
+    ::getUnderlyingObjects(MMOValue, Objs);
+    AATags = MM->getAAInfo();
+    return true;
+  }
 };
 
 /// Add loop-carried chain dependencies. This class handles the same type of
@@ -1679,6 +1677,16 @@ class LoopCarriedOrderDepsTracker {
     if (!BAA || Src.IsUnknown || Dst.IsUnknown)
       return true;
 
+    // TODO: Correct?
+    if (Src.MMOValue && Dst.MMOValue) {
+      const auto SrcLoc =
+          MemoryLocation::getBeforeOrAfter(Src.MMOValue, Src.AATags);
+      const auto DstLoc =
+          MemoryLocation::getBeforeOrAfter(Dst.MMOValue, Dst.AATags);
+      if (BAA->isNoAlias(SrcLoc, DstLoc))
+        return false;
+    }
+
     for (const Value *SrcObj : Src.Objs)
       for (const Value *DstObj : Dst.Objs) {
         const auto SrcLoc =

>From d4d6d99ba393d67ee74bb42f1ca65773b0f69ba0 Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Fri, 18 Apr 2025 12:43:29 +0000
Subject: [PATCH 5/6] Revert unnecesasry chenges for tests

---
 llvm/test/CodeGen/Hexagon/vect/vect-shuffle.ll | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/test/CodeGen/Hexagon/vect/vect-shuffle.ll b/llvm/test/CodeGen/Hexagon/vect/vect-shuffle.ll
index c41d7b2a4e3ec..09746b30a4c18 100644
--- a/llvm/test/CodeGen/Hexagon/vect/vect-shuffle.ll
+++ b/llvm/test/CodeGen/Hexagon/vect/vect-shuffle.ll
@@ -1,13 +1,12 @@
 ; REQUIRES: asserts
 ; RUN: llc -mtriple=hexagon -mcpu=hexagonv5 -disable-hsdr --stats -o - 2>&1 < %s | FileCheck %s
 
-; NOTE: Node order issues are reported by pipelier, but they are false positives.
-
 ; Check that store is post-incremented.
 ; CHECK-NOT: extractu(r{{[0-9]+}},#32,
 ; CHECK-NOT: insert
-; CHECK: Number of node order issues found
+; CHECK-NOT: Number of node order issues found
 ; CHECK: Number of loops software pipelined
+; CHECK-NOT: Number of node order issues found
 target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
 target triple = "hexagon"
 

>From 0189145ccf56682fb3e57d13008619d33e810398 Mon Sep 17 00:00:00 2001
From: Ryotaro Kasuga <kasuga.ryotaro at fujitsu.com>
Date: Mon, 21 Apr 2025 10:16:38 +0000
Subject: [PATCH 6/6] Clean up the code

---
 llvm/include/llvm/CodeGen/MachinePipeliner.h |  59 +-
 llvm/lib/CodeGen/MachinePipeliner.cpp        | 665 ++++++++++---------
 2 files changed, 389 insertions(+), 335 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h
index 2182cd9a731c6..12ca9c8e62b1f 100644
--- a/llvm/include/llvm/CodeGen/MachinePipeliner.h
+++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h
@@ -193,9 +193,16 @@ class SwingSchedulerDDGEdge {
   /// recursion in the calculation of the ASAP, ALAP, etc functions.
   bool ignoreDependence(bool IgnoreAnti) const;
 
+  /// Retruns true if this edge is assumed to be used only for validation of a
+  /// schedule. That is, this edge would not be considered when computing a
+  /// schedule.
   bool isValidationOnly() const { return IsValidationOnly; }
 };
 
+/// Represents loop-carried dependencies. Because SwingSchedulerDAG doesn't
+/// assume cycle dependencies as the name suggests, such dependencies must be
+/// handled separately. After DAG construction is finished, these dependencies
+/// are added to SwingSchedulerDDG.
 struct LoopCarriedEdges {
   using OutputDep = SmallDenseMap<Register, SmallSetVector<SUnit *, 4>>;
   using OrderDep = SmallSetVector<SUnit *, 8>;
@@ -206,6 +213,7 @@ struct LoopCarriedEdges {
   OrderDepsType OrderDeps;
 
 private:
+  /// Backedges that should be used when searching a schedule.
   DenseMap<const SUnit *, SmallPtrSet<const SUnit *, 4>> BackEdges;
 
 public:
@@ -223,16 +231,15 @@ struct LoopCarriedEdges {
     return &Ite->second;
   }
 
-  bool shouldAddBackEdge(const SUnit *From, const SUnit *To) const {
-    if (From->NodeNum < To->NodeNum)
-      return false;
-    auto Ite = BackEdges.find(From);
-    if (Ite == BackEdges.end())
-      return false;
-    return Ite->second.contains(To);
-  }
+  /// Retruns true if the edge from \p From to \p To is a back-edge that should
+  /// be used when scheduling.
+  bool shouldUseWhenScheduling(const SUnit *From, const SUnit *To) const;
 
-  void modifySUnits(std::vector<SUnit> &SUnits);
+  /// Adds some edges to the original DAG that correspond to loop-carried
+  /// dependencies. Historically, loop-carried edges are represented by using
+  /// non-loop-carried edges in the original DAG. This function appends such
+  /// edges to preserve the previous behavior.
+  void modifySUnits(std::vector<SUnit> &SUnits, const TargetInstrInfo *TII);
 
   void dump(SUnit *SU, const TargetRegisterInfo *TRI,
             const MachineRegisterInfo *MRI) const;
@@ -240,23 +247,23 @@ struct LoopCarriedEdges {
 
 /// Represents dependencies between instructions. This class is a wrapper of
 /// `SUnits` and its dependencies to manipulate back-edges in a natural way.
-/// Currently it only supports back-edges via PHI, which are expressed as
-/// anti-dependencies in the original DAG.
-/// FIXME: Support any other loop-carried dependencies
 class SwingSchedulerDDG {
   class EdgesType {
-    SmallVector<SwingSchedulerDDGEdge, 4> Underlying;
+    /// The number of loop-carried edges in Underlying.
     unsigned LoopCarriedOrderDepsCount = 0;
 
+    /// Hold edges. There is a restriction on the order of the edges. Let N be
+    /// the number of edges, then
+    /// - The first #(N - LoopCarriedOrderDepsCount) edges are not
+    ///   loop-carried.
+    /// - The last #LoopCarriedOrderDepsCount edges are loop-carried.
+    SmallVector<SwingSchedulerDDGEdge, 4> Underlying;
+
   public:
-    void append(const SwingSchedulerDDGEdge &Edge) {
-      bool LoopCarriedOrderDep = Edge.isOrderDep() && Edge.getDistance() != 0;
-      assert(!(LoopCarriedOrderDepsCount != 0 && !LoopCarriedOrderDep) &&
-             "Loop-carried edges should not be added to the underlying edges");
-      Underlying.push_back(Edge);
-      if (LoopCarriedOrderDep)
-        ++LoopCarriedOrderDepsCount;
-    }
+    /// Add an \p Edge. To satisfy the order restriction on Underlying, once a
+    /// loop-carried edge is added, an edge that is not loop-carried one must
+    /// not be added.
+    void append(const SwingSchedulerDDGEdge &Edge);
 
     ArrayRef<SwingSchedulerDDGEdge> get(bool UseLoopCarriedEdges) const {
       ArrayRef<SwingSchedulerDDGEdge> Res = Underlying;
@@ -292,14 +299,21 @@ class SwingSchedulerDDG {
   SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU, SUnit *ExitSU,
                     const LoopCarriedEdges &LCE);
 
+  /// Get in-edges for \p SU.
   ArrayRef<SwingSchedulerDDGEdge> getInEdges(const SUnit *SU) const;
 
+  /// Get out-edges for \p SU.
   ArrayRef<SwingSchedulerDDGEdge> getOutEdges(const SUnit *SU) const;
 
+  /// Returns true if \p Schedule doesn't violate the validation-only
+  /// dependencies.
   bool isValidSchedule(std::vector<SUnit> &SUnits,
                        const SMSchedule &Schedule) const;
 
+  /// Include loop-carried edeges to the result of getInEdges/getOutEdges.
   void applyLoopCarriedEdges() { UseLoopCarriedEdges = true; }
+
+  /// Exclude loop-carried edeges from the result of getInEdges/getOutEdges.
   void removeLoopCarriedEdges() { UseLoopCarriedEdges = false; }
 };
 
@@ -357,7 +371,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
   std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations;
 
   /// Helper class to implement Johnson's circuit finding algorithm.
-  struct Circuits {
+  class Circuits {
     std::vector<SUnit> &SUnits;
     SetVector<SUnit *> Stack;
     BitVector Blocked;
@@ -366,6 +380,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
     // Node to Index from ScheduleDAGTopologicalSort
     std::vector<int> *Node2Idx;
     unsigned NumPaths = 0u;
+    static unsigned MaxPaths;
 
   public:
     Circuits(std::vector<SUnit> &SUs, ScheduleDAGTopologicalSort &Topo)
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 8270948991bc1..1240b0fa89b0b 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -194,10 +194,6 @@ static cl::opt<bool>
     MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(false),
                cl::desc("Use the MVE code generator for software pipelining"));
 
-static cl::opt<unsigned> MaxCircuitPaths(
-    "pipeliner-max-circuit-paths", cl::Hidden, cl::init(5),
-    cl::desc("Maximum number of circles to be detected for each vertex"));
-
 namespace llvm {
 
 // A command line option to enable the CopyToPhi DAG mutation.
@@ -225,6 +221,7 @@ static cl::opt<WindowSchedulingFlag> WindowSchedulingOption(
 
 } // end namespace llvm
 
+unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
 char MachinePipeliner::ID = 0;
 #ifndef NDEBUG
 int MachinePipeliner::NumTries = 0;
@@ -566,7 +563,7 @@ void SwingSchedulerDAG::schedule() {
   AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
   buildSchedGraph(AA);
   auto LCE = addLoopCarriedDependences(AA);
-  LCE.modifySUnits(SUnits);
+  LCE.modifySUnits(SUnits, TII);
   updatePhiDependences();
   Topo.InitDAGTopologicalSorting();
   changeDependences();
@@ -581,10 +578,14 @@ void SwingSchedulerDAG::schedule() {
 
   DDG = std::make_unique<SwingSchedulerDDG>(SUnits, &EntrySU, &ExitSU, LCE);
 
+  // Consider loop-carried edges when finding circuits.
   DDG->applyLoopCarriedEdges();
+
   NodeSetType NodeSets;
   findCircuits(NodeSets);
   NodeSetType Circuits = NodeSets;
+
+  // Ignore loop-carried edges when determinating the node order.
   DDG->removeLoopCarriedEdges();
 
   // Calculate the MII.
@@ -664,6 +665,7 @@ void SwingSchedulerDAG::schedule() {
   // check for node order issues
   checkValidNodeOrder(Circuits);
 
+  // Take into account loop-carried edges when scheduling.
   DDG->applyLoopCarriedEdges();
   SMSchedule Schedule(Pass.MF, this);
   Scheduled = schedulePipeline(Schedule);
@@ -815,31 +817,26 @@ static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
   return false;
 }
 
-/// Return true if the instruction causes a chain between memory
-/// references before and after it.
-static bool isGlobalMemoryObject(MachineInstr &MI) {
-  return MI.isCall() || MI.hasUnmodeledSideEffects() ||
-         (MI.hasOrderedMemoryRef() && !MI.isDereferenceableInvariantLoad());
-}
+/// Collect the underlying objects for the memory references of an instruction.
+/// This function calls the code in ValueTracking, but first checks that the
+/// instruction has a memory operand.
+/// Returns false if we cannot find the underlying objects.
+bool getUnderlyingObjects(const MachineInstr *MI,
+                          SmallVectorImpl<const Value *> &Objs,
+                          const Value *&MMOValue, AAMDNodes &AATags) {
+  if (!MI->hasOneMemOperand())
+    return false;
+  MachineMemOperand *MM = *MI->memoperands_begin();
+  if (!MM->getValue())
+    return false;
+  MMOValue = MM->getValue();
+  getUnderlyingObjects(MMOValue, Objs);
 
-// static std::optional<MemoryLocation>
-// getMemoryLocationForAA(const MachineInstr *MI, const Value *Val) {
-//   const MachineMemOperand *MMO = *MI->memoperands_begin();
-//   const Value *Val = MMO->getValue();
-//   if (!Val)
-//     return std::nullopt;
-//   auto MemLoc = MemoryLocation::getBeforeOrAfter(Val, MMO->getAAInfo());
-//
-//   // Peel off noalias information from `AATags` because it might be valid
-//   only
-//   // in single iteration.
-//   // FIXME: This is too conservative. Checking
-//   // `llvm.experimental.noalias.scope.decl` instrinsics in the original LLVM
-//   IR
-//   // can perform more accuurately.
-//   // MemLoc.AATags.NoAlias = nullptr;
-//   return MemLoc;
-// }
+  // TODO: A no alias scope may be valid only in a single iteration. In this
+  // case we need to peel off it like LoopAccessAnalysis does.
+  AATags = MM->getAAInfo();
+  return true;
+}
 
 /// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
 /// processes dependences for PHIs. This function adds true dependences
@@ -1465,49 +1462,16 @@ class HighRegisterPressureDetector {
 
 struct SUnitWithMemInfo {
   SUnit *SU;
-  SmallVector<const Value *, 2> Objs;
-  const Value *MMOValue = nullptr;
+  SmallVector<const Value *, 2> UnderlyingObjs;
+  const Value *MemOpValue = nullptr;
   AAMDNodes AATags;
   bool IsAllIdentified = false;
-  bool IsUnknown = true;
 
-  SUnitWithMemInfo(SUnit *SU) : SU(SU) { init(); }
+  SUnitWithMemInfo(SUnit *SU);
 
-  bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const {
-    if (!IsAllIdentified || !Other.IsAllIdentified)
-      return false;
-    for (const Value *Obj : Objs)
-      if (llvm::is_contained(Other.Objs, Obj))
-        return false;
-    return true;
-  }
+  bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const;
 
-private:
-  void init() {
-    if (!getUnderlyingObjects())
-      return;
-    IsUnknown = false;
-    for (const Value *Obj : Objs)
-      if (!isIdentifiedObject(Obj)) {
-        IsAllIdentified = false;
-        break;
-      }
-  }
-  /// Return the underlying objects for the memory references of an instruction.
-  /// This function calls the code in ValueTracking, but first checks that the
-  /// instruction has a memory operand.
-  bool getUnderlyingObjects() {
-    const MachineInstr *MI = SU->getInstr();
-    if (!MI->hasOneMemOperand())
-      return false;
-    MachineMemOperand *MM = *MI->memoperands_begin();
-    if (!MM->getValue())
-      return false;
-    MMOValue = MM->getValue();
-    ::getUnderlyingObjects(MMOValue, Objs);
-    AATags = MM->getAAInfo();
-    return true;
-  }
+  bool isUnknown() const { return MemOpValue == nullptr; }
 };
 
 /// Add loop-carried chain dependencies. This class handles the same type of
@@ -1516,18 +1480,11 @@ struct SUnitWithMemInfo {
 class LoopCarriedOrderDepsTracker {
   // Type of instruction that is relevant to order-dependencies
   enum class InstrTag {
-    // Instruction related to global memory objects. There are order
-    // dependencies between instructions that may load or store or raise
-    // floating-point exception before and after this one.
-    GlobalMemoryObject = 0,
-
-    // Instruction that may load or store memory, but does not form a global
-    // barrier.
-    LoadOrStore = 1,
-
-    // Instruction that does not match above, but may raise floatin-point
-    // exceptions.
-    FPExceptions = 2,
+    Barrier = 0,      ///< A barrier event instruction.
+    LoadOrStore = 1,  ///< An instruction that may load or store memory, but is
+                      ///< not a barrier event.
+    FPExceptions = 2, ///< An instruction that does not match above, but may
+                      ///< raise floatin-point exceptions.
   };
 
   struct TaggedSUnit : PointerIntPair<SUnit *, 2> {
@@ -1537,265 +1494,329 @@ class LoopCarriedOrderDepsTracker {
     InstrTag getTag() const { return InstrTag(getInt()); }
   };
 
-  using SUsType = SmallVector<SUnit *, 4>;
-  using Value2SUs = MapVector<const Value *, SUsType>;
-
-  // Retains loads and stores classified by the underlying objects.
+  /// Holds loads and stores with memory related information.
   struct LoadStoreChunk {
     SmallVector<SUnitWithMemInfo, 4> Loads;
     SmallVector<SUnitWithMemInfo, 4> Stores;
+
+    void append(SUnit *SU);
   };
 
   SwingSchedulerDAG *DAG;
   std::unique_ptr<BatchAAResults> BAA;
   std::vector<SUnit> &SUnits;
 
-  // The size of SUnits, for convenience.
+  /// The size of SUnits, for convenience.
   const unsigned N;
 
-  // Adjacency matrix consisiting of order dependencies of the original DAG.
+  /// Adjacency matrix consisiting of order dependencies of the original DAG.
   std::vector<BitVector> AdjMatrix;
 
-  // Loop-carried Edges.
+  /// Loop-carried Edges.
   std::vector<BitVector> LoopCarried;
 
-  // Instructions related to chain dependencies. They are one of the following.
-  //
-  //   1. Global memory object.
-  //   2. Load, but not a global memory object, not invariant, or may load trap
-  //      value.
-  //   3. Store, but not global memory object.
-  //   4. None of them, but may raise floating-point exceptions.
-  //
-  // This is used when analyzing loop-carried dependencies that access global
-  // barrier instructions.
+  /// Instructions related to chain dependencies. They are one of the
+  /// following:
+  ///
+  ///  1. Barrier event.
+  ///  2. Load, but neither a barrier event, invariant load, nor may load trap
+  ///     value.
+  ///  3. Store, but not a barrier event.
+  ///  4. None of them, but may raise floating-point exceptions.
+  ///
+  /// This is used when analyzing loop-carried dependencies that access global
+  /// barrier instructions.
   std::vector<TaggedSUnit> TaggedSUnits;
 
+  const TargetInstrInfo *TII = nullptr;
+
 public:
-  LoopCarriedOrderDepsTracker(SwingSchedulerDAG *SSD, AAResults *AA)
-      : DAG(SSD), BAA(nullptr), SUnits(DAG->SUnits), N(SUnits.size()),
-        AdjMatrix(N, BitVector(N)), LoopCarried(N, BitVector(N)) {
-    if (AA) {
-      BAA = std::make_unique<BatchAAResults>(*AA);
-      BAA->enableCrossIterationMode();
-    }
-    initAdjMatrix();
+  LoopCarriedOrderDepsTracker(SwingSchedulerDAG *SSD, AAResults *AA,
+                              const TargetInstrInfo *TII);
+
+  /// The main function to compute loop-carried order-dependencies.
+  void computeDependencies();
+
+  const BitVector &getLoopCarried(unsigned Idx) const {
+    return LoopCarried[Idx];
   }
 
-  void computeDependencies() {
-    // Traverse all instructions and extract only what we are targetting.
-    for (auto &SU : SUnits) {
-      auto Tagged = checkInstrType(&SU);
+private:
+  /// Calculate reachability induced by the original DAG.
+  void initAdjMatrix();
 
-      // This instruction has no loop-carried order-dependencies.
-      if (!Tagged)
-        continue;
+  /// Tags to \p SU if the instruction may affect the order-dependencies.
+  std::optional<TaggedSUnit> checkInstrType(SUnit *SU) const;
 
-      TaggedSUnits.push_back(*Tagged);
-    }
+  /// Retruns true if there is a loop-carried dependence between \p Src and \p
+  /// Dst.
+  bool hasLoopCarriedMemDep(const SUnitWithMemInfo &Src,
+                            const SUnitWithMemInfo &Dst) const;
 
-    addLoopCarriedDependencies();
+  /// Add a loop-carried dependency between \p From and \p To if it exists.
+  void addDependencesBetweenSUs(const SUnitWithMemInfo &From,
+                                const SUnitWithMemInfo &To);
+
+  /// Add loop-carried dependencies between nodes in \p From and \p To.
+  void addDependencesBetweenChunks(const LoadStoreChunk &From,
+                                   const LoadStoreChunk &To);
+
+  void computeDependenciesAux();
+};
 
-    // Finalize the results.
-    for (int I = 0; I != int(N); I++) {
-      // If the dependence between two instructions already exists in the
-      // original DAG, then loop-carried dependence of the same instructions is
-      // unnecessary because the original one expresses stricter
-      // constraint than loop-carried one.
-      LoopCarried[I].reset(AdjMatrix[I]);
+} // end anonymous namespace
 
-      // Self-loops are noisy.
-      LoopCarried[I].reset(I);
+SUnitWithMemInfo::SUnitWithMemInfo(SUnit *SU) : SU(SU) {
+  if (!getUnderlyingObjects(SU->getInstr(), UnderlyingObjs, MemOpValue, AATags))
+    return;
+  for (const Value *Obj : UnderlyingObjs)
+    if (!isIdentifiedObject(Obj)) {
+      IsAllIdentified = false;
+      break;
     }
-  }
+}
 
-  const BitVector &getLoopCarried(unsigned Idx) const {
-    return LoopCarried[Idx];
+bool SUnitWithMemInfo::isTriviallyDisjoint(
+    const SUnitWithMemInfo &Other) const {
+  // If all underlying objects are identified objects and there is no overlap
+  // between them, then these two instructions are disjoint.
+  if (!IsAllIdentified || !Other.IsAllIdentified)
+    return false;
+  for (const Value *Obj : UnderlyingObjs)
+    if (llvm::is_contained(Other.UnderlyingObjs, Obj))
+      return false;
+  return true;
+}
+
+void LoopCarriedOrderDepsTracker::LoadStoreChunk::append(SUnit *SU) {
+  const MachineInstr *MI = SU->getInstr();
+  if (!MI->mayLoadOrStore())
+    return;
+  (MI->mayStore() ? Stores : Loads).emplace_back(SU);
+}
+
+LoopCarriedOrderDepsTracker::LoopCarriedOrderDepsTracker(
+    SwingSchedulerDAG *SSD, AAResults *AA, const TargetInstrInfo *TII)
+    : DAG(SSD), BAA(nullptr), SUnits(DAG->SUnits), N(SUnits.size()),
+      AdjMatrix(N, BitVector(N)), LoopCarried(N, BitVector(N)), TII(TII) {
+  if (AA) {
+    BAA = std::make_unique<BatchAAResults>(*AA);
+    BAA->enableCrossIterationMode();
   }
+  initAdjMatrix();
+}
 
-private:
-  // Calculate reachability induced by the adjacency matrix. The original graph
-  // is DAG, so we can compute them from bottom to top.
-  void initAdjMatrix() {
-    for (int RI = 0; RI != int(N); RI++) {
-      int I = SUnits.size() - (RI + 1);
-      for (const auto &Succ : SUnits[I].Succs)
-        if (Succ.isNormalMemoryOrBarrier()) {
-          SUnit *SSU = Succ.getSUnit();
-          if (SSU->isBoundaryNode())
-            continue;
-          // `updatePhiDependences` may add barrier-dependencies between PHIs,
-          // which don't make sense in this case.
-          if (SSU->getInstr()->isPHI())
-            continue;
-          int J = SSU->NodeNum;
-          AdjMatrix[I].set(J);
-        }
-    }
+void LoopCarriedOrderDepsTracker::computeDependencies() {
+  // Traverse all instructions and extract only what we are targetting.
+  for (auto &SU : SUnits) {
+    auto Tagged = checkInstrType(&SU);
+
+    // This instruction has no loop-carried order-dependencies.
+    if (!Tagged)
+      continue;
+
+    TaggedSUnits.push_back(*Tagged);
   }
 
-  // Tags to \p SU if the instruction may affect the order-dependencies.
-  std::optional<TaggedSUnit> checkInstrType(SUnit *SU) const {
-    MachineInstr *MI = SU->getInstr();
-    if (isGlobalMemoryObject(*MI))
-      return TaggedSUnit(SU, InstrTag::GlobalMemoryObject);
+  computeDependenciesAux();
 
-    if (MI->mayStore() ||
-        (MI->mayLoad() && !MI->isDereferenceableInvariantLoad()))
-      return TaggedSUnit(SU, InstrTag::LoadOrStore);
+  // Finalize the results.
+  for (int I = 0; I != int(N); I++) {
+    // If the dependence between two instructions already exists in the original
+    // DAG, then a loop-carried dependence for them is unnecessary because the
+    // original one expresses stricter constraint than loop-carried one.
+    LoopCarried[I].reset(AdjMatrix[I]);
 
-    if (MI->mayRaiseFPException())
-      return TaggedSUnit(SU, InstrTag::FPExceptions);
+    // Self-loops are noisy.
+    LoopCarried[I].reset(I);
+  }
+}
 
-    return std::nullopt;
+void LoopCarriedOrderDepsTracker::initAdjMatrix() {
+  // The original graph is DAG, so we can compute them from bottom to top.
+  for (int RI = 0; RI != int(N); RI++) {
+    int I = SUnits.size() - (RI + 1);
+    for (const auto &Succ : SUnits[I].Succs)
+      if (Succ.isNormalMemoryOrBarrier()) {
+        SUnit *SSU = Succ.getSUnit();
+        if (SSU->isBoundaryNode())
+          continue;
+        // `updatePhiDependences` may add barrier-dependencies between PHIs,
+        // which don't make sense in this case.
+        if (SSU->getInstr()->isPHI())
+          continue;
+        int J = SSU->NodeNum;
+        AdjMatrix[I].set(J);
+      }
   }
+}
 
-  bool hasLoopCarriedMemDep(const SUnitWithMemInfo &Src,
-                            const SUnitWithMemInfo &Dst) const {
-    if (!SwpPruneLoopCarried)
-      return true;
+std::optional<LoopCarriedOrderDepsTracker::TaggedSUnit>
+LoopCarriedOrderDepsTracker::checkInstrType(SUnit *SU) const {
+  MachineInstr *MI = SU->getInstr();
+  if (TII->isGlobalMemoryObject(MI))
+    return TaggedSUnit(SU, InstrTag::Barrier);
 
-    if (Src.isTriviallyDisjoint(Dst))
-      return false;
+  if (MI->mayStore() ||
+      (MI->mayLoad() && !MI->isDereferenceableInvariantLoad()))
+    return TaggedSUnit(SU, InstrTag::LoadOrStore);
 
-    // First, check the dependence by comparing base register, offset, and
-    // step value of the loop.
-    switch (
-        DAG->mayOverlapInLaterIter(Src.SU->getInstr(), Dst.SU->getInstr())) {
-    case AliasResult::Kind::MustAlias:
-      return true;
-    case AliasResult::Kind::NoAlias:
-      return false;
-    case AliasResult::Kind::MayAlias:
-      break;
-    default:
-      llvm_unreachable("Unexpected alias");
-    }
+  if (MI->mayRaiseFPException())
+    return TaggedSUnit(SU, InstrTag::FPExceptions);
 
-    // If we cannot determine the dependence by previouse check, then
-    // check by using alias analysis.
-    if (!BAA || Src.IsUnknown || Dst.IsUnknown)
-      return true;
+  return std::nullopt;
+}
 
-    // TODO: Correct?
-    if (Src.MMOValue && Dst.MMOValue) {
-      const auto SrcLoc =
-          MemoryLocation::getBeforeOrAfter(Src.MMOValue, Src.AATags);
-      const auto DstLoc =
-          MemoryLocation::getBeforeOrAfter(Dst.MMOValue, Dst.AATags);
-      if (BAA->isNoAlias(SrcLoc, DstLoc))
-        return false;
-    }
+bool LoopCarriedOrderDepsTracker::hasLoopCarriedMemDep(
+    const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst) const {
+  if (!SwpPruneLoopCarried)
+    return true;
 
-    for (const Value *SrcObj : Src.Objs)
-      for (const Value *DstObj : Dst.Objs) {
-        const auto SrcLoc =
-            MemoryLocation::getBeforeOrAfter(SrcObj, Src.AATags);
-        const auto DstLoc =
-            MemoryLocation::getBeforeOrAfter(DstObj, Dst.AATags);
-        if (!BAA->isNoAlias(SrcLoc, DstLoc))
-          return true;
-      }
+  if (Src.isTriviallyDisjoint(Dst))
+    return false;
 
+  // First, check the dependence by comparing base register, offset, and
+  // step value of the loop.
+  switch (DAG->mayOverlapInLaterIter(Src.SU->getInstr(), Dst.SU->getInstr())) {
+  case AliasResult::Kind::MustAlias:
+    return true;
+  case AliasResult::Kind::NoAlias:
     return false;
+  case AliasResult::Kind::MayAlias:
+    break;
+  default:
+    llvm_unreachable("Unexpected alias");
   }
 
-  void addDependencesBetweenSUs(const SUnitWithMemInfo &From,
-                                const SUnitWithMemInfo &To) {
-    if (From.SU == To.SU)
-      return;
-    if (hasLoopCarriedMemDep(From, To))
-      LoopCarried[From.SU->NodeNum].set(To.SU->NodeNum);
-  }
+  // If we cannot determine the dependence by previouse check, then
+  // check by using alias analysis.
 
-  void addDependencesBetweenChunks(const LoadStoreChunk &From,
-                                   const LoadStoreChunk &To) {
-    for (const SUnitWithMemInfo &Src : From.Stores)
-      for (const SUnitWithMemInfo &Dst : To.Stores)
-        addDependencesBetweenSUs(Src, Dst);
-
-    for (const SUnitWithMemInfo &Src : From.Stores)
-      for (const SUnitWithMemInfo &Dst : To.Loads)
-        addDependencesBetweenSUs(Src, Dst);
-
-    for (const SUnitWithMemInfo &Src : From.Loads)
-      for (const SUnitWithMemInfo &Dst : To.Stores)
-        addDependencesBetweenSUs(Src, Dst);
-  }
+  if (!BAA || Src.isUnknown() || Dst.isUnknown())
+    return true;
 
-  void updateLoadStoreChunk(SUnit *SU, LoadStoreChunk &Chunk) {
-    const MachineInstr *MI = SU->getInstr();
-    if (!MI->mayLoadOrStore())
-      return;
-    (MI->mayStore() ? Chunk.Stores : Chunk.Loads).emplace_back(SU);
+  // Query AliasAnalysis by using the value of the memory operand.
+  if (Src.MemOpValue && Dst.MemOpValue) {
+    const auto SrcLoc =
+        MemoryLocation::getBeforeOrAfter(Src.MemOpValue, Src.AATags);
+    const auto DstLoc =
+        MemoryLocation::getBeforeOrAfter(Dst.MemOpValue, Dst.AATags);
+    if (BAA->isNoAlias(SrcLoc, DstLoc))
+      return false;
   }
 
-  void addLoopCarriedDependencies() {
-    // Collect instructions until a first instruction for global memory object
-    // is found
-    LoadStoreChunk FirstChunk;
-    std::vector<SUnit *> FirstSUs;
-    SUnit *FirstBarrier = nullptr;
-    for (const auto &TSU : TaggedSUnits) {
-      SUnit *SU = TSU.getPointer();
-      FirstSUs.push_back(SU);
-      if (TSU.getTag() == InstrTag::GlobalMemoryObject) {
-        FirstBarrier = SU;
-        break;
-      }
-      updateLoadStoreChunk(SU, FirstChunk);
+  // Try all combinations of the underlying objects.
+  for (const Value *SrcObj : Src.UnderlyingObjs)
+    for (const Value *DstObj : Dst.UnderlyingObjs) {
+      const auto SrcLoc = MemoryLocation::getBeforeOrAfter(SrcObj, Src.AATags);
+      const auto DstLoc = MemoryLocation::getBeforeOrAfter(DstObj, Dst.AATags);
+      if (!BAA->isNoAlias(SrcLoc, DstLoc))
+        return true;
     }
 
-    // If there are no instructions related to global memory object, then check
-    // loop-carried dependencies for all load/store pairs.
-    if (FirstBarrier == nullptr) {
-      addDependencesBetweenChunks(FirstChunk, FirstChunk);
-      return;
-    }
+  return false;
+}
 
-    // The instructions sequence is as follows.
-    //
-    // ```
-    // Some loads/stores/fp-exceptions (FirstSUs)
-    // Global memory object (FirstBarrier)
-    // ...
-    // Global memory object (LastBarrier)
-    // Some loads/stores/fp-exceptions (LastSUs)
-    // ```
-    //
-    // At this point, add the following loop-carried dependencies.
-    //
-    //   - From LastBarrier to FirstSUs and FirstBarrier
-    //   - From LastSUs to FirstBarrier
-    //   - From loads/stores in LastSUs to loads/stores in FirstSUs
-    //     if they can overlap
-    //
-    // Other loop-carried dependencies, such as LastSUs to load/store between
-    // FirstBarrier and LastBarrier, are implied by the above and existing
-    // dependencies, so we don't add them explicitly.
-    LoadStoreChunk LastChunk;
-    std::vector<SUnit *> LastSUs;
-    SUnit *LastBarrier = nullptr;
-    for (const auto &TSU : reverse(TaggedSUnits)) {
-      SUnit *SU = TSU.getPointer();
-      LastSUs.push_back(SU);
-      if (TSU.getTag() == InstrTag::GlobalMemoryObject) {
-        LastBarrier = SU;
-        break;
-      }
-      updateLoadStoreChunk(SU, LastChunk);
+void LoopCarriedOrderDepsTracker::addDependencesBetweenSUs(
+    const SUnitWithMemInfo &From, const SUnitWithMemInfo &To) {
+  if (From.SU == To.SU)
+    return;
+  if (hasLoopCarriedMemDep(From, To))
+    LoopCarried[From.SU->NodeNum].set(To.SU->NodeNum);
+}
+
+void LoopCarriedOrderDepsTracker::addDependencesBetweenChunks(
+    const LoadStoreChunk &From, const LoadStoreChunk &To) {
+  // Add dependencies for store-to-load (RAW).
+  for (const SUnitWithMemInfo &Src : From.Stores)
+    for (const SUnitWithMemInfo &Dst : To.Loads)
+      addDependencesBetweenSUs(Src, Dst);
+
+  // Add dependencies for load-to-store (WAR).
+  for (const SUnitWithMemInfo &Src : From.Loads)
+    for (const SUnitWithMemInfo &Dst : To.Stores)
+      addDependencesBetweenSUs(Src, Dst);
+
+  // Add dependencies for store-to-store (WAW).
+  for (const SUnitWithMemInfo &Src : From.Stores)
+    for (const SUnitWithMemInfo &Dst : To.Stores)
+      addDependencesBetweenSUs(Src, Dst);
+}
+
+void LoopCarriedOrderDepsTracker::computeDependenciesAux() {
+  // Collect instructions until a barrier event is found.
+  LoadStoreChunk FirstChunk;
+  std::vector<SUnit *> FirstSUs;
+  SUnit *FirstBarrier = nullptr;
+  for (const auto &TSU : TaggedSUnits) {
+    SUnit *SU = TSU.getPointer();
+    FirstSUs.push_back(SU);
+    if (TSU.getTag() == InstrTag::Barrier) {
+      FirstBarrier = SU;
+      break;
     }
+    FirstChunk.append(SU);
+  }
 
-    for (SUnit *SU : FirstSUs)
-      LoopCarried[LastBarrier->NodeNum].set(SU->NodeNum);
-    for (SUnit *SU : LastSUs)
-      LoopCarried[SU->NodeNum].set(FirstBarrier->NodeNum);
-    LoopCarried[FirstBarrier->NodeNum].reset(LastBarrier->NodeNum);
-    addDependencesBetweenChunks(LastChunk, FirstChunk);
+  // If there are no barrier events, then check loop-carried dependencies for
+  // all load/store pairs.
+  if (FirstBarrier == nullptr) {
+    addDependencesBetweenChunks(FirstChunk, FirstChunk);
+    return;
   }
-};
 
-} // end anonymous namespace
+  // The instructions sequence is as follows.
+  //
+  //
+  //   Some loads/stores/fp-exceptions (FirstSUs)
+  //
+  //   Barrier (FirstBarrier)
+  //
+  //   ...
+  //
+  //   Barrier (LastBarrier)
+  //
+  //   Some loads/stores/fp-exceptions (LastSUs)
+  //
+  //
+  // We will add loop-carried dependencies for the following pairs.
+  //
+  // - From LastBarrier to FirstSUs.
+  // - From LastBarrier to FirstBarrier.
+  // - From LastSUs to FirstBarrier.
+  // - From loads/stores in LastSUs to loads/stores in FirstSUs
+  //   if they can overlap
+  //
+  // Other loop-carried dependencies (e.g., from a store in LastSUs to
+  // FirstBarrier) are implied by the above and existing dependencies. So we
+  // won't add them explicitly.
+  LoadStoreChunk LastChunk;
+  std::vector<SUnit *> LastSUs;
+  SUnit *LastBarrier = nullptr;
+  for (const auto &TSU : reverse(TaggedSUnits)) {
+    SUnit *SU = TSU.getPointer();
+    LastSUs.push_back(SU);
+    if (TSU.getTag() == InstrTag::Barrier) {
+      LastBarrier = SU;
+      break;
+    }
+    LastChunk.append(SU);
+  }
+
+  // From LastBarrier to FirstSUs.
+  for (SUnit *SU : FirstSUs)
+    LoopCarried[LastBarrier->NodeNum].set(SU->NodeNum);
+
+  // From LastBarrier to FirstBarrier.
+  LoopCarried[LastBarrier->NodeNum].set(FirstBarrier->NodeNum);
+  // LoopCarried[FirstBarrier->NodeNum].reset(LastBarrier->NodeNum);
+
+  // From LastSUs to FirstBarrier.
+  for (SUnit *SU : LastSUs)
+    LoopCarried[SU->NodeNum].set(FirstBarrier->NodeNum);
+
+  // From loads/stores in LastSUs to loads/stores in FirstSUs
+  addDependencesBetweenChunks(LastChunk, FirstChunk);
+}
 
 /// Add dependencies across iterations.
 LoopCarriedEdges SwingSchedulerDAG::addLoopCarriedDependences(AAResults *AA) {
@@ -1818,7 +1839,7 @@ LoopCarriedEdges SwingSchedulerDAG::addLoopCarriedDependences(AAResults *AA) {
   }
 
   // Add loop-carried order-dependencies
-  LoopCarriedOrderDepsTracker LCODTracker(this, AA);
+  LoopCarriedOrderDepsTracker LCODTracker(this, AA, TII);
   LCODTracker.computeDependencies();
   for (int I = 0; I != int(N); I++)
     for (const int Succ : LCODTracker.getLoopCarried(I).set_bits())
@@ -1901,10 +1922,7 @@ void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
         continue;
 
       // Ignore store-store dependencies when finding circuits for historical
-      // reasons. Adding these edges causes regressions in some important cases.
-      // FIXME: This could lead to an inaccurate estimation of RecMII. By
-      // improving the heuristics after circuit detection, this may not be
-      // necessary.
+      // reasons.
       if (OE.isOrderDep() && OE.getSrc()->getInstr()->mayStore() &&
           OE.getDst()->getInstr()->mayStore())
         continue;
@@ -1937,7 +1955,7 @@ bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
   Blocked.set(V);
 
   for (auto W : AdjK[V]) {
-    if (NumPaths > MaxCircuitPaths)
+    if (NumPaths > MaxPaths)
       break;
     if (W < S)
       continue;
@@ -3039,9 +3057,6 @@ SwingSchedulerDAG::mayOverlapInLaterIter(const MachineInstr *BaseMI,
   if (!BaseMI->mayLoadOrStore() || !OtherMI->mayLoadOrStore())
     return AliasResult::Kind::NoAlias;
 
-  // The conservative assumption is that a dependence between memory operations
-  // may be loop carried. The following code checks when it can be proved that
-  // there is no loop carried dependence.
   int DeltaB, DeltaO, Delta;
   if (!computeDelta(*BaseMI, DeltaB) || !computeDelta(*OtherMI, DeltaO) ||
       DeltaB != DeltaO)
@@ -3621,10 +3636,6 @@ bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
 /// The reason is that although an invalid node order may prevent
 /// the pipeliner from finding a pipelined schedule for arbitrary II,
 /// it does not lead to the generation of incorrect code.
-/// FIXME: Currently, we don't search all circuits. There is an upper limit to
-/// the number of circuits that can be searched. Also, there may be some that
-/// are pruned by heuristics. Therefore, this function may generate false
-/// positives.
 void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
 
   // a sorted vector that maps each SUnit to its index in the NodeOrder
@@ -4222,9 +4233,9 @@ SwingSchedulerDDG::SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU,
         SDep Dep(Src, SDep::Output, Reg);
         Dep.setLatency(1);
         for (SUnit *Dst : Set) {
-          SwingSchedulerDDGEdge Edge(Dst, Dep, false, true);
+          SwingSchedulerDDGEdge Edge(Dst, Dep, /*IsSucc=*/false,
+                                     /*IsValidationOnly=*/true);
           Edge.setDistance(1);
-          // addEdge(Src, Edge);
           addEdge(Dst, Edge);
         }
       }
@@ -4233,9 +4244,13 @@ SwingSchedulerDDG::SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU,
       SDep Dep(Src, SDep::Barrier);
       Dep.setLatency(1);
       for (SUnit *Dst : *OrderDep) {
-        SwingSchedulerDDGEdge Edge(Dst, Dep, false,
-                                   !LCE.shouldAddBackEdge(Src, Dst));
+        SwingSchedulerDDGEdge Edge(
+            Dst, Dep, /*IsSucc=*/false,
+            /*IsValidationOnly=*/!LCE.shouldUseWhenScheduling(Src, Dst));
         Edge.setDistance(1);
+
+        // If this edge is used when scheduling, this should be added both Preds
+        // and Succs.
         if (!Edge.isValidationOnly())
           addEdge(Src, Edge);
         addEdge(Dst, Edge);
@@ -4244,16 +4259,6 @@ SwingSchedulerDDG::SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU,
   }
 }
 
-static bool shouldUseInScheduling(SUnit *DstSU, SDep &Pred) {
-  SUnit *SrcSU = Pred.getSUnit();
-  assert(SrcSU->NodeNum < DstSU->NodeNum && "Invalid order");
-  MachineInstr *SrcMI = SrcSU->getInstr();
-  MachineInstr *DstMI = DstSU->getInstr();
-  return SrcMI->mayLoad() && !DstMI->mayLoad() && DstMI->mayStore() &&
-         !isGlobalMemoryObject(*SrcMI) && !isGlobalMemoryObject(*DstMI) &&
-         !isSuccOrder(SrcSU, DstSU);
-}
-
 ArrayRef<SwingSchedulerDDGEdge>
 SwingSchedulerDDG::getInEdges(const SUnit *SU) const {
   return getEdges(SU).Preds.get(UseLoopCarriedEdges);
@@ -4297,7 +4302,27 @@ bool SwingSchedulerDDG::isValidSchedule(std::vector<SUnit> &SUnits,
   return true;
 }
 
-void LoopCarriedEdges::modifySUnits(std::vector<SUnit> &SUnits) {
+void SwingSchedulerDDG::EdgesType::append(const SwingSchedulerDDGEdge &Edge) {
+  bool LoopCarriedOrderDep = Edge.isOrderDep() && Edge.getDistance() != 0;
+  assert(!(LoopCarriedOrderDepsCount != 0 && !LoopCarriedOrderDep) &&
+         "Loop-carried edges should not be added to the underlying edges");
+  Underlying.push_back(Edge);
+  if (LoopCarriedOrderDep)
+    ++LoopCarriedOrderDepsCount;
+}
+
+bool LoopCarriedEdges::shouldUseWhenScheduling(const SUnit *From,
+                                               const SUnit *To) const {
+  if (From->NodeNum < To->NodeNum)
+    return false;
+  auto Ite = BackEdges.find(From);
+  if (Ite == BackEdges.end())
+    return false;
+  return Ite->second.contains(To);
+}
+
+void LoopCarriedEdges::modifySUnits(std::vector<SUnit> &SUnits,
+                                    const TargetInstrInfo *TII) {
   for (SUnit &SU : SUnits) {
     SUnit *Src = &SU;
     if (auto *OrderDep = getOrderDepOrNull(Src)) {
@@ -4311,10 +4336,24 @@ void LoopCarriedEdges::modifySUnits(std::vector<SUnit> &SUnits) {
           BackEdges[From].insert(To);
           std::swap(From, To);
         }
-        SDep Pred = Dep;
-        Pred.setSUnit(From);
-        if (shouldUseInScheduling(To, Pred))
+
+        // To keep the previouse behavior, add a foward direction edge if the
+        // following conditions are met:
+        //
+        // - From is a load and To is a store.
+        // - The load apprears before the store in the original basic block.
+        // - There is no barrier/store between the two instructions.
+        // - We cannot reach to the store from the load through current edges in
+        //   the DAG.
+        MachineInstr *FromMI = From->getInstr();
+        MachineInstr *ToMI = To->getInstr();
+        if (FromMI->mayLoad() && !ToMI->mayLoad() && ToMI->mayStore() &&
+            !TII->isGlobalMemoryObject(FromMI) &&
+            !TII->isGlobalMemoryObject(ToMI) && !isSuccOrder(From, To)) {
+          SDep Pred = Dep;
+          Pred.setSUnit(From);
           To->addPred(Pred);
+        }
       }
     }
   }