[llvm] [MemorySSA] Fix handling of cross-iteration dependencies for calls (PR #187291)
Nikita Popov via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 18 07:59:40 PDT 2026
https://github.com/nikic updated https://github.com/llvm/llvm-project/pull/187291
>From 7e9e23a8c0fca89ea7be2cec430c27fbdc892001 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Fri, 13 Mar 2026 16:03:23 +0100
Subject: [PATCH] [MemorySSA] Fix handling of cross-iteration dependencies for
calls
The clobber walker has to be careful when it comes to translating
locations across phis. If we're translating across a cycle backedge,
we'll end up working with SSA values from two different cycle
iterations -- something that alias analysis by default assumes is
not the case.
To protect against this, the upwards def walk was already discarding
the access size from the memory location if the pointer was not
loop invariant. This (mostly) avoids the issue for memory locations.
However, the same issue also exists for calls. In this case, it's
not possible to adjust the call used for AA queries in a similar way.
Instead, we can make use of the cross-iteration alias analysis mode,
which was added some time ago for these kinds of situations.
The basic change here is that the upwards def walk, when translating
across a phi, will enable the cross-iteration mode for calls.
Unfortunately, quite a few places have to be changed in order to
thread through that flag.
We should really also be using the cross-iteration mode for the
memory location case, both because it's more precise, and because
it's more correct (in particular, only changing the size does not
handle correlated select conditions). However, this comes with a
compile-time cost, so I'm only handling the call case for now.
---
llvm/include/llvm/Analysis/AliasAnalysis.h | 17 ++++
llvm/include/llvm/Analysis/MemorySSA.h | 86 +++++++++++++------
llvm/lib/Analysis/MemorySSA.cpp | 57 ++++++------
.../MemorySSA/call-loop-carried-dependency.ll | 42 +++++++++
4 files changed, 152 insertions(+), 50 deletions(-)
create mode 100644 llvm/test/Analysis/MemorySSA/call-loop-carried-dependency.ll
diff --git a/llvm/include/llvm/Analysis/AliasAnalysis.h b/llvm/include/llvm/Analysis/AliasAnalysis.h
index 878b7e7a1fb3b..d9d8ac797688d 100644
--- a/llvm/include/llvm/Analysis/AliasAnalysis.h
+++ b/llvm/include/llvm/Analysis/AliasAnalysis.h
@@ -659,6 +659,8 @@ class BatchAAResults {
AAQueryInfo AAQI;
SimpleCaptureAnalysis SimpleCA;
+ friend class BatchAACrossIterationScope;
+
public:
BatchAAResults(AAResults &AAR) : AA(AAR), AAQI(AAR, &SimpleCA) {}
BatchAAResults(AAResults &AAR, CaptureAnalysis *CA)
@@ -716,6 +718,21 @@ class BatchAAResults {
void disableDominatorTree() { AAQI.UseDominatorTree = false; }
};
+/// Temporarily set the cross-iteration mode on a BatchAA instance.
+class BatchAACrossIterationScope {
+ BatchAAResults &BAA;
+ bool OrigCrossIteration;
+
+public:
+ BatchAACrossIterationScope(BatchAAResults &BAA, bool CrossIteration)
+ : BAA(BAA), OrigCrossIteration(BAA.AAQI.MayBeCrossIteration) {
+ BAA.AAQI.MayBeCrossIteration = CrossIteration;
+ }
+ ~BatchAACrossIterationScope() {
+ BAA.AAQI.MayBeCrossIteration = OrigCrossIteration;
+ }
+};
+
/// Temporary typedef for legacy code that uses a generic \c AliasAnalysis
/// pointer or reference.
using AliasAnalysis = AAResults;
diff --git a/llvm/include/llvm/Analysis/MemorySSA.h b/llvm/include/llvm/Analysis/MemorySSA.h
index 3fbc36e311d8b..edfe68748ee0d 100644
--- a/llvm/include/llvm/Analysis/MemorySSA.h
+++ b/llvm/include/llvm/Analysis/MemorySSA.h
@@ -1107,9 +1107,6 @@ class LLVM_ABI DoNothingMemorySSAWalker final : public MemorySSAWalker {
BatchAAResults &) override;
};
-using MemoryAccessPair = std::pair<MemoryAccess *, MemoryLocation>;
-using ConstMemoryAccessPair = std::pair<const MemoryAccess *, MemoryLocation>;
-
/// Iterator base class used to implement const and non-const iterators
/// over the defining accesses of a MemoryAccess.
template <class T>
@@ -1203,6 +1200,35 @@ template <> struct GraphTraits<Inverse<MemoryAccess *>> {
static ChildIteratorType child_end(NodeRef N) { return N->user_end(); }
};
+struct UpwardDefsElem {
+ MemoryAccess *MA;
+ MemoryLocation Loc;
+ bool MayBeCrossIteration;
+};
+
+template <> struct DenseMapInfo<UpwardDefsElem> {
+ static inline UpwardDefsElem getEmptyKey() {
+ return {DenseMapInfo<MemoryAccess *>::getEmptyKey(),
+ DenseMapInfo<MemoryLocation>::getEmptyKey(), false};
+ }
+
+ static inline UpwardDefsElem getTombstoneKey() {
+ return {DenseMapInfo<MemoryAccess *>::getTombstoneKey(),
+ DenseMapInfo<MemoryLocation>::getTombstoneKey(), false};
+ }
+
+ static unsigned getHashValue(const UpwardDefsElem &Val) {
+ return hash_combine(DenseMapInfo<MemoryAccess *>::getHashValue(Val.MA),
+ DenseMapInfo<MemoryLocation>::getHashValue(Val.Loc),
+ Val.MayBeCrossIteration);
+ }
+
+ static bool isEqual(const UpwardDefsElem &LHS, const UpwardDefsElem &RHS) {
+ return LHS.MA == RHS.MA && LHS.Loc == RHS.Loc &&
+ LHS.MayBeCrossIteration == RHS.MayBeCrossIteration;
+ }
+};
+
/// Provide an iterator that walks defs, giving both the memory access,
/// and the current pointer location, updating the pointer location as it
/// changes due to phi node translation.
@@ -1214,20 +1240,21 @@ template <> struct GraphTraits<Inverse<MemoryAccess *>> {
class upward_defs_iterator
: public iterator_facade_base<upward_defs_iterator,
std::forward_iterator_tag,
- const MemoryAccessPair> {
+ const UpwardDefsElem> {
using BaseT = upward_defs_iterator::iterator_facade_base;
public:
- upward_defs_iterator(const MemoryAccessPair &Info, DominatorTree *DT)
- : DefIterator(Info.first), Location(Info.second),
- OriginalAccess(Info.first), DT(DT) {
- CurrentPair.first = nullptr;
+ upward_defs_iterator(const UpwardDefsElem &Elem, DominatorTree *DT)
+ : DefIterator(Elem.MA), Location(Elem.Loc), OriginalAccess(Elem.MA),
+ DT(DT) {
+ CurrentElem.MA = nullptr;
+ CurrentElem.MayBeCrossIteration = Elem.MayBeCrossIteration;
- WalkingPhi = Info.first && isa<MemoryPhi>(Info.first);
- fillInCurrentPair();
+ WalkingPhi = Elem.MA && isa<MemoryPhi>(Elem.MA);
+ fillInCurrentElem();
}
- upward_defs_iterator() { CurrentPair.first = nullptr; }
+ upward_defs_iterator() { CurrentElem.MA = nullptr; }
bool operator==(const upward_defs_iterator &Other) const {
return DefIterator == Other.DefIterator;
@@ -1236,7 +1263,7 @@ class upward_defs_iterator
std::iterator_traits<BaseT>::reference operator*() const {
assert(DefIterator != OriginalAccess->defs_end() &&
"Tried to access past the end of our iterator");
- return CurrentPair;
+ return CurrentElem;
}
using BaseT::operator++;
@@ -1245,7 +1272,7 @@ class upward_defs_iterator
"Tried to access past the end of the iterator");
++DefIterator;
if (DefIterator != OriginalAccess->defs_end())
- fillInCurrentPair();
+ fillInCurrentElem();
return *this;
}
@@ -1257,10 +1284,13 @@ class upward_defs_iterator
/// MemoryLocation during execution of the containing function.
LLVM_ABI bool IsGuaranteedLoopInvariant(const Value *Ptr) const;
- void fillInCurrentPair() {
- CurrentPair.first = *DefIterator;
- CurrentPair.second = Location;
- if (WalkingPhi && Location.Ptr) {
+ void fillInCurrentElem() {
+ CurrentElem.MA = *DefIterator;
+ CurrentElem.Loc = Location;
+ if (!WalkingPhi)
+ return;
+
+ if (Location.Ptr) {
PHITransAddr Translator(
const_cast<Value *>(Location.Ptr),
OriginalAccess->getBlock()->getDataLayout(), nullptr);
@@ -1268,21 +1298,27 @@ class upward_defs_iterator
if (Value *Addr =
Translator.translateValue(OriginalAccess->getBlock(),
DefIterator.getPhiArgBlock(), DT, true))
- if (Addr != CurrentPair.second.Ptr)
- CurrentPair.second = CurrentPair.second.getWithNewPtr(Addr);
+ if (Addr != CurrentElem.Loc.Ptr)
+ CurrentElem.Loc = CurrentElem.Loc.getWithNewPtr(Addr);
// Mark size as unknown, if the location is not guaranteed to be
// loop-invariant for any possible loop in the function. Setting the size
// to unknown guarantees that any memory accesses that access locations
// after the pointer are considered as clobbers, which is important to
// catch loop carried dependences.
- if (!IsGuaranteedLoopInvariant(CurrentPair.second.Ptr))
- CurrentPair.second = CurrentPair.second.getWithNewSize(
+ if (!IsGuaranteedLoopInvariant(CurrentElem.Loc.Ptr))
+ // TODO: We should be using MayBeCrossIteration here as well.
+ CurrentElem.Loc = CurrentElem.Loc.getWithNewSize(
LocationSize::beforeOrAfterPointer());
+ } else {
+ // We can't easily analyze invariance for calls, so conservatively assume
+ // they may be introducing cross-iteration dependences for any phi
+ // translation.
+ CurrentElem.MayBeCrossIteration = true;
}
}
- MemoryAccessPair CurrentPair;
+ UpwardDefsElem CurrentElem;
memoryaccess_def_iterator DefIterator;
MemoryLocation Location;
MemoryAccess *OriginalAccess = nullptr;
@@ -1290,15 +1326,15 @@ class upward_defs_iterator
bool WalkingPhi = false;
};
-inline upward_defs_iterator
-upward_defs_begin(const MemoryAccessPair &Pair, DominatorTree &DT) {
+inline upward_defs_iterator upward_defs_begin(const UpwardDefsElem &Pair,
+ DominatorTree &DT) {
return upward_defs_iterator(Pair, &DT);
}
inline upward_defs_iterator upward_defs_end() { return upward_defs_iterator(); }
inline iterator_range<upward_defs_iterator>
-upward_defs(const MemoryAccessPair &Pair, DominatorTree &DT) {
+upward_defs(const UpwardDefsElem &Pair, DominatorTree &DT) {
return make_range(upward_defs_begin(Pair, DT), upward_defs_end());
}
diff --git a/llvm/lib/Analysis/MemorySSA.cpp b/llvm/lib/Analysis/MemorySSA.cpp
index 2e922e80179cf..0f324aa098e75 100644
--- a/llvm/lib/Analysis/MemorySSA.cpp
+++ b/llvm/lib/Analysis/MemorySSA.cpp
@@ -394,7 +394,7 @@ static bool isUseTriviallyOptimizableToLiveOnEntry(AliasAnalysisType &AA,
/// \param AllowImpreciseClobber Always false, unless we do relaxed verify.
[[maybe_unused]] static void
-checkClobberSanity(const MemoryAccess *Start, MemoryAccess *ClobberAt,
+checkClobberSanity(MemoryAccess *Start, MemoryAccess *ClobberAt,
const MemoryLocation &StartLoc, const MemorySSA &MSSA,
const UpwardsMemoryQuery &Query, BatchAAResults &AA,
bool AllowImpreciseClobber = false) {
@@ -407,9 +407,9 @@ checkClobberSanity(const MemoryAccess *Start, MemoryAccess *ClobberAt,
}
bool FoundClobber = false;
- DenseSet<ConstMemoryAccessPair> VisitedPhis;
- SmallVector<ConstMemoryAccessPair, 8> Worklist;
- Worklist.emplace_back(Start, StartLoc);
+ DenseSet<UpwardDefsElem> VisitedPhis;
+ SmallVector<UpwardDefsElem, 8> Worklist;
+ Worklist.push_back({Start, StartLoc, false});
// Walk all paths from Start to ClobberAt, while looking for clobbers. If one
// is found, complain.
while (!Worklist.empty()) {
@@ -419,7 +419,7 @@ checkClobberSanity(const MemoryAccess *Start, MemoryAccess *ClobberAt,
if (!VisitedPhis.insert(MAP).second)
continue;
- for (const auto *MA : def_chain(MAP.first)) {
+ for (const auto *MA : def_chain(MAP.MA)) {
if (MA == ClobberAt) {
if (const auto *MD = dyn_cast<MemoryDef>(MA)) {
// instructionClobbersQuery isn't essentially free, so don't use `|=`,
@@ -429,7 +429,8 @@ checkClobberSanity(const MemoryAccess *Start, MemoryAccess *ClobberAt,
// since MD may only act as a clobber for 1 of N MemoryLocations.
FoundClobber = FoundClobber || MSSA.isLiveOnEntryDef(MD);
if (!FoundClobber) {
- if (instructionClobbersQuery(MD, MAP.second, Query.Inst, AA))
+ BatchAACrossIterationScope _(AA, MAP.MayBeCrossIteration);
+ if (instructionClobbersQuery(MD, MAP.Loc, Query.Inst, AA))
FoundClobber = true;
}
}
@@ -444,7 +445,8 @@ checkClobberSanity(const MemoryAccess *Start, MemoryAccess *ClobberAt,
if (MD == Start)
continue;
- assert(!instructionClobbersQuery(MD, MAP.second, Query.Inst, AA) &&
+ BatchAACrossIterationScope _(AA, MAP.MayBeCrossIteration);
+ assert(!instructionClobbersQuery(MD, MAP.Loc, Query.Inst, AA) &&
"Found clobber before reaching ClobberAt!");
continue;
}
@@ -459,9 +461,7 @@ checkClobberSanity(const MemoryAccess *Start, MemoryAccess *ClobberAt,
assert(isa<MemoryPhi>(MA));
// Add reachable phi predecessors
- for (auto ItB = upward_defs_begin(
- {const_cast<MemoryAccess *>(MA), MAP.second},
- MSSA.getDomTree()),
+ for (auto ItB = upward_defs_begin(MAP, MSSA.getDomTree()),
ItE = upward_defs_end();
ItB != ItE; ++ItB)
if (MSSA.getDomTree().isReachableFromEntry(ItB.getPhiArgBlock()))
@@ -500,14 +500,16 @@ class ClobberWalker {
MemoryAccess *First;
MemoryAccess *Last;
std::optional<ListIndex> Previous;
+ bool MayBeCrossIteration;
DefPath(const MemoryLocation &Loc, MemoryAccess *First, MemoryAccess *Last,
- std::optional<ListIndex> Previous)
- : Loc(Loc), First(First), Last(Last), Previous(Previous) {}
+ bool MayBeCrossIteration, std::optional<ListIndex> Previous)
+ : Loc(Loc), First(First), Last(Last), Previous(Previous),
+ MayBeCrossIteration(MayBeCrossIteration) {}
DefPath(const MemoryLocation &Loc, MemoryAccess *Init,
- std::optional<ListIndex> Previous)
- : DefPath(Loc, Init, Init, Previous) {}
+ bool MayBeCrossIteration, std::optional<ListIndex> Previous)
+ : DefPath(Loc, Init, Init, MayBeCrossIteration, Previous) {}
};
const MemorySSA &MSSA;
@@ -521,7 +523,7 @@ class ClobberWalker {
SmallVector<DefPath, 32> Paths;
// List of visited <Access, Location> pairs; we can skip paths already
// visited with the same memory location.
- DenseSet<ConstMemoryAccessPair> VisitedPhis;
+ DenseSet<UpwardDefsElem> VisitedPhis;
/// Find the nearest def or phi that `From` can legally be optimized to.
const MemoryAccess *getWalkTarget(const MemoryPhi *From) const {
@@ -578,6 +580,7 @@ class ClobberWalker {
if (!--*UpwardWalkLimit)
return {Current, true};
+ BatchAACrossIterationScope _(*AA, Desc.MayBeCrossIteration);
if (instructionClobbersQuery(MD, Desc.Loc, Query->Inst, *AA))
return {MD, true};
}
@@ -592,12 +595,13 @@ class ClobberWalker {
}
void addSearches(MemoryPhi *Phi, SmallVectorImpl<ListIndex> &PausedSearches,
- ListIndex PriorNode) {
- auto UpwardDefsBegin = upward_defs_begin({Phi, Paths[PriorNode].Loc}, DT);
+ ListIndex PriorNode, bool MayBeCrossIteration) {
+ auto UpwardDefsBegin =
+ upward_defs_begin({Phi, Paths[PriorNode].Loc, MayBeCrossIteration}, DT);
auto UpwardDefs = make_range(UpwardDefsBegin, upward_defs_end());
- for (const MemoryAccessPair &P : UpwardDefs) {
+ for (const UpwardDefsElem &E : UpwardDefs) {
PausedSearches.push_back(Paths.size());
- Paths.emplace_back(P.second, P.first, PriorNode);
+ Paths.emplace_back(E.Loc, E.MA, E.MayBeCrossIteration, PriorNode);
}
}
@@ -649,7 +653,8 @@ class ClobberWalker {
// - We still cache things for A, so C only needs to walk up a bit.
// If this behavior becomes problematic, we can fix without a ton of extra
// work.
- if (!VisitedPhis.insert({Node.Last, Node.Loc}).second)
+ if (!VisitedPhis.insert({Node.Last, Node.Loc, Node.MayBeCrossIteration})
+ .second)
continue;
const MemoryAccess *SkipStopWhere = nullptr;
@@ -686,7 +691,8 @@ class ClobberWalker {
}
assert(!MSSA.isLiveOnEntryDef(Res.Result) && "liveOnEntry is a clobber");
- addSearches(cast<MemoryPhi>(Res.Result), PausedSearches, PathIndex);
+ addSearches(cast<MemoryPhi>(Res.Result), PausedSearches, PathIndex,
+ Node.MayBeCrossIteration);
}
return std::nullopt;
@@ -766,7 +772,7 @@ class ClobberWalker {
assert(Paths.empty() && VisitedPhis.empty() &&
"Reset the optimization state.");
- Paths.emplace_back(Loc, Start, Phi, std::nullopt);
+ Paths.emplace_back(Loc, Start, Phi, false, std::nullopt);
// Stores how many "valid" optimization nodes we had prior to calling
// addSearches/getBlockingAccess. Necessary for caching if we had a blocker.
auto PriorPathsSize = Paths.size();
@@ -775,7 +781,7 @@ class ClobberWalker {
SmallVector<ListIndex, 8> NewPaused;
SmallVector<TerminatedPath, 4> TerminatedPaths;
- addSearches(Phi, PausedSearches, 0);
+ addSearches(Phi, PausedSearches, 0, false);
// Moves the TerminatedPath with the "most dominated" Clobber to the end of
// Paths.
@@ -903,7 +909,8 @@ class ClobberWalker {
PriorPathsSize = Paths.size();
PausedSearches.clear();
for (ListIndex I : NewPaused)
- addSearches(DefChainPhi, PausedSearches, I);
+ addSearches(DefChainPhi, PausedSearches, I,
+ Paths[I].MayBeCrossIteration);
NewPaused.clear();
Current = DefChainPhi;
@@ -942,7 +949,7 @@ class ClobberWalker {
if (auto *MU = dyn_cast<MemoryUse>(Start))
Current = MU->getDefiningAccess();
- DefPath FirstDesc(Q.StartingLoc, Current, Current, std::nullopt);
+ DefPath FirstDesc(Q.StartingLoc, Current, Current, false, std::nullopt);
// Fast path for the overly-common case (no crazy phi optimization
// necessary)
UpwardsWalkResult WalkResult = walkToPhiOrClobber(FirstDesc);
diff --git a/llvm/test/Analysis/MemorySSA/call-loop-carried-dependency.ll b/llvm/test/Analysis/MemorySSA/call-loop-carried-dependency.ll
new file mode 100644
index 0000000000000..c27de9f798149
--- /dev/null
+++ b/llvm/test/Analysis/MemorySSA/call-loop-carried-dependency.ll
@@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-inst-comments --version 6
+; RUN: opt -passes='print<memoryssa-walker>' -disable-output < %s 2>&1 | FileCheck %s
+
+; Make sure that the memcpy is clobbered by the memoryphi, not by liveOnEntry.
+define void @test(i1 %c, ptr %path, ptr %name) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: i1 [[C:%.*]], ptr [[PATH:%.*]], ptr [[NAME:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP:%.*]] = alloca [260 x i8], align 16
+; CHECK-NEXT: br label %[[WHILE_BODY:.*]]
+; CHECK: [[WHILE_BODY]]:
+; CHECK-NEXT: ; 3 = MemoryPhi({entry,liveOnEntry},{while.body,1})
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[WHILE_BODY]] ], [ 259, %[[ENTRY]] ]
+; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; CHECK-NEXT: [[TMP_IV:%.*]] = getelementptr inbounds [260 x i8], ptr [[TMP]], i64 0, i64 [[IV]]
+; CHECK-NEXT: ; 1 = MemoryDef(3)->3 - clobbered by 3 = MemoryPhi({entry,liveOnEntry},{while.body,1})
+; CHECK-NEXT: store i8 42, ptr [[TMP_IV]], align 1
+; CHECK-NEXT: br i1 [[C]], label %[[WHILE_BODY]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[TMP_IV_1:%.*]] = getelementptr inbounds i8, ptr [[TMP_IV]], i64 1
+; CHECK-NEXT: [[SUB7:%.*]] = sub nsw i64 259, [[IV]]
+; CHECK-NEXT: ; 2 = MemoryDef(1)->3 - clobbered by 3 = MemoryPhi({entry,liveOnEntry},{while.body,1})
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[NAME]], ptr nonnull align 1 [[TMP_IV_1]], i64 [[SUB7]], i1 false)
+; CHECK-NEXT: ret void
+;
+entry:
+ %tmp = alloca [260 x i8], align 16
+ br label %while.body
+
+while.body:
+ %iv = phi i64 [ %iv.next, %while.body ], [ 259, %entry ]
+ %iv.next = add nsw i64 %iv, -1
+ %tmp.iv = getelementptr inbounds [260 x i8], ptr %tmp, i64 0, i64 %iv
+ store i8 42, ptr %tmp.iv, align 1
+ br i1 %c, label %while.body, label %exit
+
+exit:
+ %tmp.iv.1 = getelementptr inbounds i8, ptr %tmp.iv, i64 1
+ %sub7 = sub nsw i64 259, %iv
+ call void @llvm.memcpy(ptr align 1 %name, ptr nonnull align 1 %tmp.iv.1, i64 %sub7, i1 false)
+ ret void
+}
More information about the llvm-commits
mailing list