[llvm] [LoopVectorize][LAA] Hoist load in memory IV to allow vectorization (PR #168312)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Nov 16 19:20:28 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-llvm-analysis
Author: Felipe Magno de Almeida (felipealmeida)
Adds a VPScalarIVPromotionRecipe to promote a memory induction variable
to a scalar IV.
The step SCEV between the load and the store is multiplied by VFxUF (or
by EVL) to advance the promoted IV once per vector iteration.
The kind of code this patch allows to be vectorized looks like this:
```
while.body:
%theFirst.addr.0112 = phi ptr [ %incdec.ptr9, %while.body ], [ %theFirst, %while.body.preheader ]
%thePointer.0111 = phi ptr [ %incdec.ptr, %while.body ], [ %add.ptr.i, %while.body.preheader ]
%1 = load i16, ptr %theFirst.addr.0112, align 2
store i16 %1, ptr %thePointer.0111, align 2
%incdec.ptr = getelementptr inbounds nuw i8, ptr %thePointer.0111, i64 2
%2 = load i64, ptr %m_size_ptr, align 8
%inc = add i64 %2, 1
store i64 %inc, ptr %m_size_ptr, align 8
%incdec.ptr9 = getelementptr inbounds nuw i8, ptr %theFirst.addr.0112, i64 2
%cmp7.not = icmp eq ptr %incdec.ptr9, %theLast
br i1 %cmp7.not, label %cleanup.loopexit, label %while.body
```
As you can see, %m_size_ptr is a loop-invariant pointer, so the
load/store pair through it can be promoted to a scalar IV, after which
the loop can be vectorized.
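For context, the IR above is roughly what a copy loop that bumps a size counter held behind a loop-invariant pointer compiles to. A hypothetical C++ reconstruction (function and parameter names invented for illustration):

```c++
// Hypothetical source shape for the IR above; names are invented. The
// counter behind MSizePtr is a "memory IV": it is loaded, incremented,
// and stored back on every iteration.
void copyAndCount(const short *TheFirst, const short *TheLast,
                  short *ThePointer, long *MSizePtr) {
  while (TheFirst != TheLast) {
    *ThePointer++ = *TheFirst++; // the i16 element copy
    ++*MSizePtr;                 // load, add 1, store through an invariant pointer
  }
}
```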
---
Patch is 110.19 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/168312.diff
44 Files Affected:
- (modified) llvm/include/llvm/Analysis/LoopAccessAnalysis.h (+22-6)
- (modified) llvm/lib/Analysis/LoopAccessAnalysis.cpp (+259-9)
- (modified) llvm/lib/Transforms/Scalar/LoopDistribute.cpp (+4-2)
- (modified) llvm/lib/Transforms/Scalar/LoopFlatten.cpp (+1-1)
- (modified) llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp (+2-1)
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h (+7-3)
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+42-11)
- (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+55-3)
- (modified) llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp (+5-3)
- (modified) llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp (+106-9)
- (modified) llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h (+15)
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+54-1)
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+41-4)
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.h (+1-1)
- (modified) llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp (+2-1)
- (modified) llvm/lib/Transforms/Vectorize/VPlanUtils.cpp (+4-1)
- (modified) llvm/lib/Transforms/Vectorize/VPlanValue.h (+1)
- (modified) llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp (+4)
- (modified) llvm/test/Analysis/LoopAccessAnalysis/invalidation.ll (+1)
- (modified) llvm/test/Analysis/LoopAccessAnalysis/invariant-dependence-before.ll (+28-25)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll (+1-1)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll (+1-1)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll (+2-2)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll (+2-2)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll (+2-2)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll (+4-4)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll (+12-12)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll (+2-2)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll (+4-4)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll (+1-1)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll (+1-1)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll (+1-1)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll (+1-1)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll (+2-2)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll (+2-2)
- (modified) llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll (+16-1)
- (modified) llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll (+5-50)
- (added) llvm/test/Transforms/LoopVectorize/memory-iv-promotion.ll (+60)
- (modified) llvm/test/Transforms/LoopVectorize/pointer-induction.ll (+3-3)
- (modified) llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll (+6-6)
- (modified) llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll (+12-12)
- (modified) llvm/test/Transforms/LoopVectorize/vplan-printing.ll (+2-2)
- (modified) llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp (+6-2)
- (modified) llvm/unittests/Transforms/Vectorize/VPlanTestBase.h (+19-1)
``````````diff
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index c85ef3e131068..edec066083abd 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -28,6 +28,7 @@ class DataLayout;
class Loop;
class raw_ostream;
class TargetTransformInfo;
+class MemorySSA;
/// Collection of parameters shared between the Loop Vectorizer and the
/// Loop Access Analysis.
@@ -181,11 +182,12 @@ class MemoryDepChecker {
};
MemoryDepChecker(PredicatedScalarEvolution &PSE, AssumptionCache *AC,
- DominatorTree *DT, const Loop *L,
+ MemorySSA *MSSA, DominatorTree *DT, AAResults *AA,
+ const Loop *L,
const DenseMap<Value *, const SCEV *> &SymbolicStrides,
unsigned MaxTargetVectorWidthInBits,
std::optional<ScalarEvolution::LoopGuards> &LoopGuards)
- : PSE(PSE), AC(AC), DT(DT), InnermostLoop(L),
+ : PSE(PSE), AC(AC), DT(DT), MSSA(MSSA), AA(AA), InnermostLoop(L),
SymbolicStrides(SymbolicStrides),
MaxTargetVectorWidthInBits(MaxTargetVectorWidthInBits),
LoopGuards(LoopGuards) {}
@@ -292,6 +294,14 @@ class MemoryDepChecker {
return PointerBounds;
}
+  /// Return true if \p L can be hoisted out of this loop as part of a
+  /// memory induction variable pattern. This assumes an alias runtime
+  /// check will be emitted before hoisting.
+ bool
+ isInvariantLoadHoistable(LoadInst *L, ScalarEvolution &SE, StoreInst **S,
+ const SCEV **Step,
+ SmallVectorImpl<Instruction *> *Instructions) const;
+
DominatorTree *getDT() const {
assert(DT && "requested DT, but it is not available");
return DT;
@@ -312,6 +322,8 @@ class MemoryDepChecker {
AssumptionCache *AC;
DominatorTree *DT;
+ MemorySSA *MSSA;
+ AAResults *AA;
const Loop *InnermostLoop;
@@ -692,7 +704,7 @@ class LoopAccessInfo {
const TargetTransformInfo *TTI,
const TargetLibraryInfo *TLI, AAResults *AA,
DominatorTree *DT, LoopInfo *LI, AssumptionCache *AC,
- bool AllowPartial = false);
+ MemorySSA *MSSA, bool AllowPartial = false);
/// Return true if we can analyze the memory accesses in the loop and there are
/// no memory dependence cycles. Note that for dependences between loads &
@@ -786,7 +798,8 @@ class LoopAccessInfo {
/// Analyze the loop. Returns true if all memory accesses in the loop can be
/// vectorized.
bool analyzeLoop(AAResults *AA, const LoopInfo *LI,
- const TargetLibraryInfo *TLI, DominatorTree *DT);
+ const TargetLibraryInfo *TLI, DominatorTree *DT,
+ MemorySSA *MSSA);
/// Check if the structure of the loop allows it to be analyzed by this
/// pass.
@@ -963,12 +976,15 @@ class LoopAccessInfoManager {
TargetTransformInfo *TTI;
const TargetLibraryInfo *TLI = nullptr;
AssumptionCache *AC;
+ MemorySSA *MSSA;
public:
LoopAccessInfoManager(ScalarEvolution &SE, AAResults &AA, DominatorTree &DT,
LoopInfo &LI, TargetTransformInfo *TTI,
- const TargetLibraryInfo *TLI, AssumptionCache *AC)
- : SE(SE), AA(AA), DT(DT), LI(LI), TTI(TTI), TLI(TLI), AC(AC) {}
+ const TargetLibraryInfo *TLI, AssumptionCache *AC,
+ MemorySSA *MSSA)
+ : SE(SE), AA(AA), DT(DT), LI(LI), TTI(TTI), TLI(TLI), AC(AC), MSSA(MSSA) {
+ }
LLVM_ABI const LoopAccessInfo &getInfo(Loop &L, bool AllowPartial = false);
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 5d88e5f54e3d6..4a8871ddbb7eb 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -29,6 +29,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -1777,6 +1778,232 @@ bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
return Diff == 1;
}
+/// Collects all subexpressions that appear within a given SCEV tree.
+struct SCEVSubexprCollector : public SCEVVisitor<SCEVSubexprCollector, void> {
+ SmallPtrSet<const SCEV *, 4> &Subs;
+ SCEVSubexprCollector(SmallPtrSet<const SCEV *, 4> &S) : Subs(S) {}
+
+ template <typename Operands> void visitOperands(Operands operands) {
+ for (auto *Op : operands)
+ visit(Op);
+ }
+ void visitConstant(const SCEVConstant *C) { Subs.insert(C); }
+ void visitUnknown(const SCEVUnknown *U) { Subs.insert(U); }
+ void visitAddExpr(const SCEVAddExpr *E) {
+ Subs.insert(E);
+ for (auto *Op : E->operands())
+ visit(Op);
+ }
+ void visitMulExpr(const SCEVMulExpr *E) {
+ Subs.insert(E);
+ for (auto *Op : E->operands())
+ visit(Op);
+ }
+ void visitAddRecExpr(const SCEVAddRecExpr *E) {
+ Subs.insert(E);
+ for (auto *Op : E->operands())
+ visit(Op);
+ }
+ void visitSMaxExpr(const SCEVSMaxExpr *E) {
+ Subs.insert(E);
+ for (auto *Op : E->operands())
+ visit(Op);
+ }
+ void visitSMinExpr(const SCEVSMinExpr *E) {
+ Subs.insert(E);
+ for (auto *Op : E->operands())
+ visit(Op);
+ }
+ void visitUMinExpr(const SCEVUMinExpr *E) {
+ Subs.insert(E);
+ for (auto *Op : E->operands())
+ visit(Op);
+ }
+ void visitUMaxExpr(const SCEVUMaxExpr *E) {
+ Subs.insert(E);
+ for (auto *Op : E->operands())
+ visit(Op);
+ }
+ void visitMinMaxExpr(const SCEVMinMaxExpr *E) {
+ Subs.insert(E);
+ for (auto *Op : E->operands())
+ visit(Op);
+ }
+ void visitUDivExpr(const SCEVUDivExpr *E) {
+ Subs.insert(E);
+ visit(E->getLHS());
+ visit(E->getRHS());
+ }
+ void visitZeroExtendExpr(const SCEVZeroExtendExpr *E) {
+ Subs.insert(E);
+ visit(E->getOperand());
+ }
+ void visitSignExtendExpr(const SCEVSignExtendExpr *E) {
+ Subs.insert(E);
+ visit(E->getOperand());
+ }
+ void visitTruncateExpr(const SCEVTruncateExpr *E) {
+ Subs.insert(E);
+ visit(E->getOperand());
+ }
+ void visitCouldNotCompute(const SCEVCouldNotCompute *E) { Subs.insert(E); }
+ void visitVScale(const SCEVVScale *E) {
+ Subs.insert(E);
+ visitOperands(E->operands());
+ }
+ void visitPtrToIntExpr(const SCEVPtrToIntExpr *E) {
+ Subs.insert(E);
+ visitOperands(E->operands());
+ }
+ void visitSequentialUMinExpr(const SCEVSequentialUMinExpr *E) {
+ Subs.insert(E);
+ visitOperands(E->operands());
+ }
+};
+
+bool MemoryDepChecker::isInvariantLoadHoistable(
+ LoadInst *L, ScalarEvolution &SE, StoreInst **S, const SCEV **StepSCEV,
+ SmallVectorImpl<Instruction *> *Instructions) const {
+ assert(L != nullptr);
+ assert(InnermostLoop->isLoopInvariant(L->getPointerOperand()));
+
+ if (!MSSA)
+ return false;
+
+ MemoryAccess *MA = MSSA->getMemoryAccess(L);
+ auto QLoc = MemoryLocation::get(L);
+
+ SmallVector<StoreInst *> Stores;
+ SmallVector<LoadInst *> Loads;
+
+ for (auto &&I : *InnermostLoop->getHeader()) {
+ if (auto *Store = dyn_cast<StoreInst>(&I)) {
+ AliasResult AR = AA->alias(MemoryLocation::get(Store), QLoc);
+ if (AR == AliasResult::MustAlias)
+ Stores.push_back(Store);
+ }
+ if (auto *Load = dyn_cast<LoadInst>(&I)) {
+ AliasResult AR = AA->alias(MemoryLocation::get(Load), QLoc);
+ if (AR == AliasResult::MustAlias)
+ Loads.push_back(Load);
+ }
+ }
+
+ if (Loads.size() != 1 || Loads[0]->isVolatile() || Stores.size() != 1 ||
+ Stores[0]->isVolatile())
+ return false;
+
+  // We have the memory PHI, so we know where the backedge is. Find all
+  // memory accesses to the cell we care about: there should be a single
+  // MemoryUse and a single MemoryDef. The MemoryUse should have the
+  // MemoryPhi as its transitive clobber, and the backedge should have
+  // the MemoryDef as a transitive clobber (must-alias).
+ MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(MA);
+ while (auto *MD = dyn_cast<MemoryUseOrDef>(Clobber)) {
+ Instruction *DefI = MD->getMemoryInst();
+
+ if (!DefI)
+ return false;
+
+ AliasResult AR = AA->alias(MemoryLocation::get(DefI), QLoc);
+
+ Clobber = MD->getDefiningAccess();
+
+    // We assume a runtime aliasing check will be used.
+ if (AR == AliasResult::MustAlias)
+ return false;
+ }
+
+ MemoryAccess *MS = MSSA->getMemoryAccess(Stores[0]);
+ MemoryAccess *StoreClobber = MSSA->getWalker()->getClobberingMemoryAccess(MS);
+ while (true) {
+ if (isa<MemoryPhi>(StoreClobber))
+ break;
+ if (auto *MD = dyn_cast<MemoryUseOrDef>(StoreClobber)) {
+ Instruction *DefI = MD->getMemoryInst();
+
+ if (!DefI)
+ return false;
+
+ AliasResult AR = AA->alias(MemoryLocation::get(DefI), QLoc);
+
+ StoreClobber = MD->getDefiningAccess();
+
+ if (AR == AliasResult::MustAlias)
+ return false;
+ }
+ }
+
+ if (!SE.isSCEVable(Stores[0]->getValueOperand()->getType()))
+ return false;
+
+ const SCEV *LoadSCEV = SE.getUnknown(L);
+ const SCEV *StoreSCEV = SE.getSCEV(Stores[0]->getValueOperand());
+
+ auto Step = SE.getMinusSCEV(StoreSCEV, LoadSCEV);
+
+ if (isa<SCEVCouldNotCompute>(Step) ||
+ !SE.isLoopInvariant(Step, InnermostLoop))
+ return false;
+
+ SmallVector<Instruction *, 4> WL;
+
+ SmallPtrSet<Instruction *, 4> Slice;
+ SmallPtrSet<const SCEV *, 4> Subs;
+ SCEVSubexprCollector Collector(Subs);
+ Collector.visit(StoreSCEV);
+
+  // Register all instructions that match the SCEV so they can be
+  // removed when hoisting and re-expanding the SCEV.
+ auto enqueueIfMatches = [&](Value *X) {
+ if (auto *XI = dyn_cast<Instruction>(X)) {
+ const SCEV *SX = SE.getSCEV(XI);
+ if (Subs.contains(SX) && Slice.insert(XI).second)
+ WL.push_back(XI);
+ }
+ };
+
+ enqueueIfMatches(Stores[0]->getValueOperand());
+
+ while (!WL.empty()) {
+ Instruction *I = WL.pop_back_val();
+
+ for (Value *Op : I->operands()) {
+ if (isa<Constant>(Op) || isa<Argument>(Op))
+ continue;
+ enqueueIfMatches(Op);
+ }
+ }
+
+ auto hasExternalUsers =
+ [&Stores](const SmallPtrSetImpl<Instruction *> &Slice) {
+ for (Instruction *I : Slice)
+ for (Use &U : I->uses())
+ if (auto *UserI = dyn_cast<Instruction>(U.getUser())) {
+ if (isa<DbgInfoIntrinsic>(UserI))
+ continue;
+ if (!Slice.count(UserI) &&
+ !std::count(Stores.begin(), Stores.end(), UserI))
+ return true;
+ }
+ return false;
+ };
+
+ if (hasExternalUsers(Slice))
+ return false;
+
+ if (S)
+ *S = Stores[0];
+ if (StepSCEV)
+ *StepSCEV = Step;
+
+ if (Instructions)
+ Instructions->insert(Instructions->end(), Slice.begin(), Slice.end());
+
+ return true;
+}
+
void MemoryDepChecker::addAccess(StoreInst *SI) {
visitPointers(SI->getPointerOperand(), *InnermostLoop,
[this, SI](Value *Ptr) {
@@ -2102,6 +2329,19 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
int64_t StrideBPtrInt = *StrideBPtr;
LLVM_DEBUG(dbgs() << "LAA: Src induction step: " << StrideAPtrInt
<< " Sink induction step: " << StrideBPtrInt << "\n");
+
+ if (!StrideAPtrInt && !StrideBPtrInt && !(AIsWrite && BIsWrite) &&
+ (AIsWrite || BIsWrite) && !isa<UndefValue>(APtr) &&
+ InnermostLoop->isLoopInvariant(APtr) &&
+ InnermostLoop->isLoopInvariant(BPtr)) {
+    LoadInst *L = dyn_cast<LoadInst>(AIsWrite ? BInst : AInst);
+    if (L && InnermostLoop->isLoopInvariant(L->getPointerOperand()) &&
+        isInvariantLoadHoistable(L, SE, nullptr, nullptr, nullptr))
+      ShouldRetryWithRuntimeChecks = true;
+
+ return MemoryDepChecker::Dependence::Unknown;
+ }
+
// At least Src or Sink are loop invariant and the other is strided or
// invariant. We can generate a runtime check to disambiguate the accesses.
if (!StrideAPtrInt || !StrideBPtrInt)
@@ -2505,7 +2745,7 @@ bool LoopAccessInfo::canAnalyzeLoop() {
bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
const TargetLibraryInfo *TLI,
- DominatorTree *DT) {
+ DominatorTree *DT, MemorySSA *MSSA) {
// Holds the Load and Store instructions.
SmallVector<LoadInst *, 16> Loads;
SmallVector<StoreInst *, 16> Stores;
@@ -2715,9 +2955,15 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
// See if there is an unsafe dependency between a load to a uniform address and
// store to the same uniform address.
if (UniformStores.contains(Ptr)) {
- LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform "
- "load and uniform store to the same address!\n");
- HasLoadStoreDependenceInvolvingLoopInvariantAddress = true;
+ auto &SE = *PSE->getSE();
+ if (TheLoop->isLoopInvariant(LD->getPointerOperand()) &&
+ !getDepChecker().isInvariantLoadHoistable(LD, SE, nullptr, nullptr,
+ nullptr)) {
+ LLVM_DEBUG(
+ dbgs() << "LAA: Found an unsafe dependency between a uniform "
+ "load and uniform store to the same address!\n");
+ HasLoadStoreDependenceInvolvingLoopInvariantAddress = true;
+ }
}
MemoryLocation Loc = MemoryLocation::get(LD);
@@ -3064,7 +3310,8 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
const TargetTransformInfo *TTI,
const TargetLibraryInfo *TLI, AAResults *AA,
DominatorTree *DT, LoopInfo *LI,
- AssumptionCache *AC, bool AllowPartial)
+ AssumptionCache *AC, MemorySSA *MSSA,
+ bool AllowPartial)
: PSE(std::make_unique<PredicatedScalarEvolution>(*SE, *L)),
PtrRtChecking(nullptr), TheLoop(L), AllowPartial(AllowPartial) {
unsigned MaxTargetVectorWidthInBits = std::numeric_limits<unsigned>::max();
@@ -3075,11 +3322,12 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) * 2;
DepChecker = std::make_unique<MemoryDepChecker>(
- *PSE, AC, DT, L, SymbolicStrides, MaxTargetVectorWidthInBits, LoopGuards);
+ *PSE, AC, MSSA, DT, AA, L, SymbolicStrides, MaxTargetVectorWidthInBits,
+ LoopGuards);
PtrRtChecking =
std::make_unique<RuntimePointerChecking>(*DepChecker, SE, LoopGuards);
if (canAnalyzeLoop())
- CanVecMem = analyzeLoop(AA, LI, TLI, DT);
+ CanVecMem = analyzeLoop(AA, LI, TLI, DT, MSSA);
}
void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
@@ -3145,7 +3393,7 @@ const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L,
// or if it was created with a different value of AllowPartial.
if (Inserted || It->second->hasAllowPartial() != AllowPartial)
It->second = std::make_unique<LoopAccessInfo>(&L, &SE, TTI, TLI, &AA, &DT,
- &LI, AC, AllowPartial);
+ &LI, AC, MSSA, AllowPartial);
return *It->second;
}
@@ -3189,7 +3437,9 @@ LoopAccessInfoManager LoopAccessAnalysis::run(Function &F,
auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
auto &AC = FAM.getResult<AssumptionAnalysis>(F);
- return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI, &AC);
+ auto &MSSA = FAM.getResult<MemorySSAAnalysis>(F);
+ return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI, &AC,
+ &MSSA.getMSSA());
}
AnalysisKey LoopAccessAnalysis::Key;
diff --git a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
index 0c8b9043fcbbb..ebda2c96b75e6 100644
--- a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -680,12 +680,14 @@ class LoopDistributeForLoop {
// Currently, we only distribute to isolate the part of the loop with
// dependence cycles to enable partial vectorization.
- if (LAI->canVectorizeMemory())
+ if (!LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress() &&
+ LAI->canVectorizeMemory())
return fail("MemOpsCanBeVectorized",
"memory operations are safe for vectorization");
auto *Dependences = LAI->getDepChecker().getDependences();
- if (!Dependences || Dependences->empty())
+ if (!LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress() &&
+ (!Dependences || Dependences->empty()))
return fail("NoUnsafeDeps", "no unsafe dependences to isolate");
LLVM_DEBUG(dbgs() << "LDist: Found a candidate loop: "
diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
index 04039b885f3c5..72a2dcb294a57 100644
--- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
@@ -1010,7 +1010,7 @@ PreservedAnalyses LoopFlattenPass::run(LoopNest &LN, LoopAnalysisManager &LAM,
// this pass will simplify all loops that contain inner loops,
// regardless of whether anything ends up being flattened.
LoopAccessInfoManager LAIM(AR.SE, AR.AA, AR.DT, AR.LI, &AR.TTI, nullptr,
- &AR.AC);
+ &AR.AC, AR.MSSA);
for (Loop *InnerLoop : LN.getLoops()) {
auto *OuterLoop = InnerLoop->getParentLoop();
if (!OuterLoop)
diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
index 3aed643ee8065..e8d5fc870137f 100644
--- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
@@ -549,7 +549,8 @@ PreservedAnalyses LoopVersioningLICMPass::run(Loop &L, LoopAnalysisManager &AM,
const Function *F = L.getHeader()->getParent();
OptimizationRemarkEmitter ORE(F);
- LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, nullptr, nullptr, &LAR.AC);
+ LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, nullptr, nullptr, &LAR.AC,
+ LAR.MSSA);
if (!LoopVersioningLICM(AA, SE, &ORE, LAIs, LAR.LI, &L).run(DT))
return PreservedAnalyses::all();
return getLoopPassPreservedAnalyses();
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 04b05627fa769..0ffabdda21bdf 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -465,6 +465,8 @@ class LoopVectorizationPlanner {
PredicatedScalarEvolution &PSE;
+ LoopAccessInfoManager *LAIs;
+
const LoopVectorizeHints &Hints;
OptimizationRemarkEmitter *ORE;
@@ -498,10 +500,10 @@ class LoopVectorizationPlanner {
Loop *L, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI,
const TargetTransformInfo &TTI, LoopVectorizationLegality *Legal,
LoopVectorizationCostModel &CM, InterleavedAccessInfo &IAI,
- PredicatedScalarEvolution &PSE, const LoopVectorizeHints &Hints,
- OptimizationRemarkEmitter *ORE)
+ PredicatedScalarEvolution &PSE, LoopAccessInfoManager *LAIs,
+ const LoopVectorizeHints &Hints, OptimizationRemarkEmitter *ORE)
: OrigLoop(L), LI(LI), DT(DT), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM),
- IAI(IAI), PSE(PSE), Hints(Hints), ORE(ORE) {}
+ IAI(IAI), PSE(PSE), LAIs(LAIs), Hints(Hints), ORE(ORE) {}
/// Build VPlans for the specified \p UserVF and \p UserIC if they are
/// non-zero or all applicable candidate VFs otherwise. If vectorization and
@@ -628,6 +630,8 @@ class LoopVectorizationPlanner {
VPRecipeBuilder &RecipeBuilder,
ElementCount MinVF);
+ void adjustScalarIVPromotions(VPlanPtr &Plan);
+
//...
[truncated]
``````````
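To illustrate what the promotion achieves, here is a minimal C++ sketch (not code from the patch; it assumes the runtime alias checks proved the counter does not overlap the copied range): the memory IV becomes a scalar IV with the load hoisted above the loop and a single store sunk after it, which is the shape the vectorizer can handle. In the vector loop the promoted IV advances by the step SCEV times VFxUF (or by EVL); e.g. with step 1, VF=8, and UF=2, the counter is bumped by 16 per vector iteration.

```c++
// Minimal sketch of the promoted form, assuming runtime checks proved
// MSizePtr does not alias [TheFirst, TheLast) or the destination; names
// are invented for illustration.
void copyAndCountPromoted(const short *TheFirst, const short *TheLast,
                          short *ThePointer, long *MSizePtr) {
  long Size = *MSizePtr;           // hoisted load
  while (TheFirst != TheLast) {
    *ThePointer++ = *TheFirst++;
    Size += 1;                     // scalar IV; step SCEV is 1 here
  }
  *MSizePtr = Size;                // single store after the loop
}
```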
https://github.com/llvm/llvm-project/pull/168312