[llvm] [LoopVectorize][LAA] Hoist load in memory IV to allow vectorization (PR #168312)
Felipe Magno de Almeida via llvm-commits
llvm-commits at lists.llvm.org
Sun Nov 16 19:19:56 PST 2025
https://github.com/felipealmeida created https://github.com/llvm/llvm-project/pull/168312
Adds a VPScalarIVPromotionRecipe recipe that promotes a memory induction
variable (IV) to a scalar IV.
The SCEV step between the load and the store is multiplied by VFxUF (or
EVL) so that the promoted scalar IV advances once per vector iteration.
The kind of code this patch allows to be vectorized looks like this:
```
while.body:
%theFirst.addr.0112 = phi ptr [ %incdec.ptr9, %while.body ], [ %theFirst, %while.body.preheader ]
%thePointer.0111 = phi ptr [ %incdec.ptr, %while.body ], [ %add.ptr.i, %while.body.preheader ]
%1 = load i16, ptr %theFirst.addr.0112, align 2
store i16 %1, ptr %thePointer.0111, align 2
%incdec.ptr = getelementptr inbounds nuw i8, ptr %thePointer.0111, i64 2
%2 = load i64, ptr %m_size_ptr, align 8
%inc = add i64 %2, 1
store i64 %inc, ptr %m_size_ptr, align 8
%incdec.ptr9 = getelementptr inbounds nuw i8, ptr %theFirst.addr.0112, i64 2
%cmp7.not = icmp eq ptr %incdec.ptr9, %theLast
br i1 %cmp7.not, label %cleanup.loopexit, label %while.body
```
As you can see, %m_size_ptr is a loop-invariant pointer, so the memory IV
stored through it can be promoted to a scalar IV, after which the loop can
be vectorized.
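For context, here is a minimal C++ sketch of the kind of source loop that
produces this pattern. It is a reconstruction inferred from the IR (the test
was reduced from ElemAttribute.cpp); the Container type and all names below
are illustrative assumptions, not the original code.
```
#include <cstdint>

// Illustrative only: a container whose size lives in memory, so the size
// update in the copy loop becomes a load/add/store through a loop-invariant
// pointer -- the "memory induction variable" described above.
struct Container {
  std::uint16_t *Data;
  std::uint64_t Size; // &Size plays the role of %m_size_ptr
};

void append(Container &C, const std::uint16_t *First,
            const std::uint16_t *Last) {
  std::uint16_t *Dst = C.Data + C.Size;
  while (First != Last) {
    *Dst++ = *First++; // the i16 copy we want to vectorize
    ++C.Size;          // memory IV: load, add 1, store to the invariant address
  }
}
```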
>From 74f06abb3e4361b1e777a295889a11e5e851854c Mon Sep 17 00:00:00 2001
From: Felipe Magno de Almeida <felipe at expertise.dev>
Date: Mon, 10 Nov 2025 18:04:59 -0300
Subject: [PATCH 1/3] Test for memory IV promotion
---
.../LoopVectorize/memory-iv-promotion.ll | 94 +++++++++++++++++++
1 file changed, 94 insertions(+)
create mode 100644 llvm/test/Transforms/LoopVectorize/memory-iv-promotion.ll
diff --git a/llvm/test/Transforms/LoopVectorize/memory-iv-promotion.ll b/llvm/test/Transforms/LoopVectorize/memory-iv-promotion.ll
new file mode 100644
index 0000000000000..192f2a55959e9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/memory-iv-promotion.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=loop-vectorize -mtriple=aarch64-unknown-linux-gnu -S | FileCheck %s --check-prefix=AARCH64
+; RUN: opt < %s -passes=loop-vectorize -mtriple=x86_64-unknown-linux-gnu -S | FileCheck %s --check-prefix=X86_64
+
+; Test case extracted from ElemAttribute.cpp
+; Focuses on the while.body loop that copies i16 elements
+
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+
+; This test introduces a case where a memory induction variable
+; prevents vectorization. The test will be updated once loads for this
+; memory induction variable can be hoisted and vectorization becomes possible.
+define void @test_copy_loop(ptr %theFirst, ptr %theLast, ptr %dest_base, ptr %m_size_ptr) {
+; AARCH64-LABEL: define void @test_copy_loop(
+; AARCH64-SAME: ptr [[THEFIRST:%.*]], ptr [[THELAST:%.*]], ptr [[DEST_BASE:%.*]], ptr [[M_SIZE_PTR:%.*]]) {
+; AARCH64-NEXT: [[ENTRY:.*:]]
+; AARCH64-NEXT: [[TMP0:%.*]] = load i64, ptr [[M_SIZE_PTR]], align 8
+; AARCH64-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds nuw i16, ptr [[DEST_BASE]], i64 [[TMP0]]
+; AARCH64-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[THEFIRST]], [[THELAST]]
+; AARCH64-NEXT: br i1 [[CMP_NOT]], label %[[CLEANUP:.*]], label %[[WHILE_BODY_PREHEADER:.*]]
+; AARCH64: [[WHILE_BODY_PREHEADER]]:
+; AARCH64-NEXT: br label %[[WHILE_BODY:.*]]
+; AARCH64: [[WHILE_BODY]]:
+; AARCH64-NEXT: [[THEFIRST_ADDR_0112:%.*]] = phi ptr [ [[INCDEC_PTR9:%.*]], %[[WHILE_BODY]] ], [ [[THEFIRST]], %[[WHILE_BODY_PREHEADER]] ]
+; AARCH64-NEXT: [[THEPOINTER_0111:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], %[[WHILE_BODY]] ], [ [[ADD_PTR_I]], %[[WHILE_BODY_PREHEADER]] ]
+; AARCH64-NEXT: [[TMP1:%.*]] = load i16, ptr [[THEFIRST_ADDR_0112]], align 2
+; AARCH64-NEXT: store i16 [[TMP1]], ptr [[THEPOINTER_0111]], align 2
+; AARCH64-NEXT: [[INCDEC_PTR]] = getelementptr inbounds nuw i8, ptr [[THEPOINTER_0111]], i64 2
+; AARCH64-NEXT: [[TMP2:%.*]] = load i64, ptr [[M_SIZE_PTR]], align 8
+; AARCH64-NEXT: [[INC:%.*]] = add i64 [[TMP2]], 1
+; AARCH64-NEXT: store i64 [[INC]], ptr [[M_SIZE_PTR]], align 8
+; AARCH64-NEXT: [[INCDEC_PTR9]] = getelementptr inbounds nuw i8, ptr [[THEFIRST_ADDR_0112]], i64 2
+; AARCH64-NEXT: [[CMP7_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR9]], [[THELAST]]
+; AARCH64-NEXT: br i1 [[CMP7_NOT]], label %[[CLEANUP_LOOPEXIT:.*]], label %[[WHILE_BODY]]
+; AARCH64: [[CLEANUP_LOOPEXIT]]:
+; AARCH64-NEXT: br label %[[CLEANUP]]
+; AARCH64: [[CLEANUP]]:
+; AARCH64-NEXT: ret void
+;
+; X86_64-LABEL: define void @test_copy_loop(
+; X86_64-SAME: ptr [[THEFIRST:%.*]], ptr [[THELAST:%.*]], ptr [[DEST_BASE:%.*]], ptr [[M_SIZE_PTR:%.*]]) {
+; X86_64-NEXT: [[ENTRY:.*:]]
+; X86_64-NEXT: [[TMP0:%.*]] = load i64, ptr [[M_SIZE_PTR]], align 8
+; X86_64-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds nuw i16, ptr [[DEST_BASE]], i64 [[TMP0]]
+; X86_64-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[THEFIRST]], [[THELAST]]
+; X86_64-NEXT: br i1 [[CMP_NOT]], label %[[CLEANUP:.*]], label %[[WHILE_BODY_PREHEADER:.*]]
+; X86_64: [[WHILE_BODY_PREHEADER]]:
+; X86_64-NEXT: br label %[[WHILE_BODY:.*]]
+; X86_64: [[WHILE_BODY]]:
+; X86_64-NEXT: [[THEFIRST_ADDR_0112:%.*]] = phi ptr [ [[INCDEC_PTR9:%.*]], %[[WHILE_BODY]] ], [ [[THEFIRST]], %[[WHILE_BODY_PREHEADER]] ]
+; X86_64-NEXT: [[THEPOINTER_0111:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], %[[WHILE_BODY]] ], [ [[ADD_PTR_I]], %[[WHILE_BODY_PREHEADER]] ]
+; X86_64-NEXT: [[TMP1:%.*]] = load i16, ptr [[THEFIRST_ADDR_0112]], align 2
+; X86_64-NEXT: store i16 [[TMP1]], ptr [[THEPOINTER_0111]], align 2
+; X86_64-NEXT: [[INCDEC_PTR]] = getelementptr inbounds nuw i8, ptr [[THEPOINTER_0111]], i64 2
+; X86_64-NEXT: [[TMP2:%.*]] = load i64, ptr [[M_SIZE_PTR]], align 8
+; X86_64-NEXT: [[INC:%.*]] = add i64 [[TMP2]], 1
+; X86_64-NEXT: store i64 [[INC]], ptr [[M_SIZE_PTR]], align 8
+; X86_64-NEXT: [[INCDEC_PTR9]] = getelementptr inbounds nuw i8, ptr [[THEFIRST_ADDR_0112]], i64 2
+; X86_64-NEXT: [[CMP7_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR9]], [[THELAST]]
+; X86_64-NEXT: br i1 [[CMP7_NOT]], label %[[CLEANUP_LOOPEXIT:.*]], label %[[WHILE_BODY]]
+; X86_64: [[CLEANUP_LOOPEXIT]]:
+; X86_64-NEXT: br label %[[CLEANUP]]
+; X86_64: [[CLEANUP]]:
+; X86_64-NEXT: ret void
+;
+
+entry:
+ %0 = load i64, ptr %m_size_ptr, align 8
+ %add.ptr.i = getelementptr inbounds nuw i16, ptr %dest_base, i64 %0
+ %cmp.not = icmp eq ptr %theFirst, %theLast
+ br i1 %cmp.not, label %cleanup, label %while.body.preheader
+
+while.body.preheader:
+ br label %while.body
+
+while.body:
+ %theFirst.addr.0112 = phi ptr [ %incdec.ptr9, %while.body ], [ %theFirst, %while.body.preheader ]
+ %thePointer.0111 = phi ptr [ %incdec.ptr, %while.body ], [ %add.ptr.i, %while.body.preheader ]
+ %1 = load i16, ptr %theFirst.addr.0112, align 2
+ store i16 %1, ptr %thePointer.0111, align 2
+ %incdec.ptr = getelementptr inbounds nuw i8, ptr %thePointer.0111, i64 2
+ %2 = load i64, ptr %m_size_ptr, align 8
+ %inc = add i64 %2, 1
+ store i64 %inc, ptr %m_size_ptr, align 8
+ %incdec.ptr9 = getelementptr inbounds nuw i8, ptr %theFirst.addr.0112, i64 2
+ %cmp7.not = icmp eq ptr %incdec.ptr9, %theLast
+ br i1 %cmp7.not, label %cleanup.loopexit, label %while.body
+
+cleanup.loopexit:
+ br label %cleanup
+
+cleanup:
+ ret void
+}
>From e86d5628e58cb0f525184e8dc9c8457c2d18bcd2 Mon Sep 17 00:00:00 2001
From: Felipe Magno de Almeida <felipe at expertise.dev>
Date: Tue, 11 Nov 2025 17:07:02 -0300
Subject: [PATCH 2/3] [LAA] Detect hoistable uniform load/store IV pattern
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Introduces isInvariantLoadHoistable, which identifies when a
loop-invariant load can be safely hoisted above the loop using
MemorySSA and ScalarEvolution. It assumes either no aliasing or that
a runtime alias check guards the hoisted load. The analysis:
* Confirms that the load address is loop-invariant.
* Searches the loop header for exactly one must-alias load and one
  must-alias store to the same memory location.
* Ensures that both are non-volatile and that the MemorySSA clobber
  chain between them does not contain conflicting must-alias
  definitions.
* Verifies that the store's value has a loop-invariant SCEV step
  relative to the load.
* Ensures that the instructions making up that SCEV have no users
  outside this slice.
This will be used to enable selective invariant-load hoisting in
LoopVectorize, in cases where LAA can precisely prove safety.
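For illustration, here is a hedged sketch of how a client might query the new
hook, mirroring the signature added to MemoryDepChecker in this patch; the
helper name canPromoteMemoryIV and the surrounding setup (LoopAccessInfo,
Loop, LoadInst, ScalarEvolution) are assumptions about the caller's context.
```
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;

// Hedged usage sketch; not part of the patch itself.
static bool canPromoteMemoryIV(const LoopAccessInfo &LAI, const Loop *TheLoop,
                               LoadInst *Load, ScalarEvolution &SE) {
  if (!TheLoop->isLoopInvariant(Load->getPointerOperand()))
    return false;
  StoreInst *Store = nullptr;
  const SCEV *Step = nullptr;
  SmallVector<Instruction *, 4> Slice;
  if (!LAI.getDepChecker().isInvariantLoadHoistable(Load, SE, &Store, &Step,
                                                    &Slice))
    return false;
  // At this point the load can be hoisted above the loop (guarded by a
  // runtime alias check), the store becomes a scalar IV update with the
  // loop-invariant Step, and Slice holds the instructions that only feed
  // the stored value and can be dropped after re-expanding the SCEV.
  return true;
}
```
The VPlan CFG builder added later in this series follows essentially this
shape when deciding whether to emit a VPScalarIVPromotionRecipe.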
---
.../llvm/Analysis/LoopAccessAnalysis.h | 28 +-
llvm/lib/Analysis/LoopAccessAnalysis.cpp | 243 +++++++++++++++++-
llvm/lib/Transforms/Scalar/LoopFlatten.cpp | 2 +-
.../Transforms/Scalar/LoopVersioningLICM.cpp | 3 +-
.../Vectorize/LoopVectorizationPlanner.h | 8 +-
.../Transforms/Vectorize/LoopVectorize.cpp | 17 +-
.../Vectorize/VPlanConstruction.cpp | 15 +-
.../Transforms/Vectorize/VPlanTransforms.h | 2 +-
.../LoopAccessAnalysis/invalidation.ll | 1 +
.../Transforms/Vectorize/VPlanSlpTest.cpp | 8 +-
.../Transforms/Vectorize/VPlanTestBase.h | 20 +-
11 files changed, 314 insertions(+), 33 deletions(-)
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index c85ef3e131068..edec066083abd 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -28,6 +28,7 @@ class DataLayout;
class Loop;
class raw_ostream;
class TargetTransformInfo;
+class MemorySSA;
/// Collection of parameters shared beetween the Loop Vectorizer and the
/// Loop Access Analysis.
@@ -181,11 +182,12 @@ class MemoryDepChecker {
};
MemoryDepChecker(PredicatedScalarEvolution &PSE, AssumptionCache *AC,
- DominatorTree *DT, const Loop *L,
+ MemorySSA *MSSA, DominatorTree *DT, AAResults *AA,
+ const Loop *L,
const DenseMap<Value *, const SCEV *> &SymbolicStrides,
unsigned MaxTargetVectorWidthInBits,
std::optional<ScalarEvolution::LoopGuards> &LoopGuards)
- : PSE(PSE), AC(AC), DT(DT), InnermostLoop(L),
+ : PSE(PSE), AC(AC), DT(DT), MSSA(MSSA), AA(AA), InnermostLoop(L),
SymbolicStrides(SymbolicStrides),
MaxTargetVectorWidthInBits(MaxTargetVectorWidthInBits),
LoopGuards(LoopGuards) {}
@@ -292,6 +294,14 @@ class MemoryDepChecker {
return PointerBounds;
}
+ /// Return true if a load can be hoisted out of this loop as part of a
+ /// memory-induction-variable pattern. This assumes a runtime alias check
+ /// will be emitted before hoisting.
+ bool
+ isInvariantLoadHoistable(LoadInst *L, ScalarEvolution &SE, StoreInst **S,
+ const SCEV **Step,
+ SmallVectorImpl<Instruction *> *Instructions) const;
+
DominatorTree *getDT() const {
assert(DT && "requested DT, but it is not available");
return DT;
@@ -312,6 +322,8 @@ class MemoryDepChecker {
AssumptionCache *AC;
DominatorTree *DT;
+ MemorySSA *MSSA;
+ AAResults *AA;
const Loop *InnermostLoop;
@@ -692,7 +704,7 @@ class LoopAccessInfo {
const TargetTransformInfo *TTI,
const TargetLibraryInfo *TLI, AAResults *AA,
DominatorTree *DT, LoopInfo *LI, AssumptionCache *AC,
- bool AllowPartial = false);
+ MemorySSA *MSSA, bool AllowPartial = false);
/// Return true we can analyze the memory accesses in the loop and there are
/// no memory dependence cycles. Note that for dependences between loads &
@@ -786,7 +798,8 @@ class LoopAccessInfo {
/// Analyze the loop. Returns true if all memory access in the loop can be
/// vectorized.
bool analyzeLoop(AAResults *AA, const LoopInfo *LI,
- const TargetLibraryInfo *TLI, DominatorTree *DT);
+ const TargetLibraryInfo *TLI, DominatorTree *DT,
+ MemorySSA *MSSA);
/// Check if the structure of the loop allows it to be analyzed by this
/// pass.
@@ -963,12 +976,15 @@ class LoopAccessInfoManager {
TargetTransformInfo *TTI;
const TargetLibraryInfo *TLI = nullptr;
AssumptionCache *AC;
+ MemorySSA *MSSA;
public:
LoopAccessInfoManager(ScalarEvolution &SE, AAResults &AA, DominatorTree &DT,
LoopInfo &LI, TargetTransformInfo *TTI,
- const TargetLibraryInfo *TLI, AssumptionCache *AC)
- : SE(SE), AA(AA), DT(DT), LI(LI), TTI(TTI), TLI(TLI), AC(AC) {}
+ const TargetLibraryInfo *TLI, AssumptionCache *AC,
+ MemorySSA *MSSA)
+ : SE(SE), AA(AA), DT(DT), LI(LI), TTI(TTI), TLI(TLI), AC(AC), MSSA(MSSA) {
+ }
LLVM_ABI const LoopAccessInfo &getInfo(Loop &L, bool AllowPartial = false);
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 5d88e5f54e3d6..34586ed9cd5c7 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -29,6 +29,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -1777,6 +1778,232 @@ bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
return Diff == 1;
}
+/// Collects all subexpressions that appear within a given SCEV tree.
+struct SCEVSubexprCollector : public SCEVVisitor<SCEVSubexprCollector, void> {
+ SmallPtrSet<const SCEV *, 4> &Subs;
+ SCEVSubexprCollector(SmallPtrSet<const SCEV *, 4> &S) : Subs(S) {}
+
+ template <typename Operands> void visitOperands(Operands operands) {
+ for (auto *Op : operands)
+ visit(Op);
+ }
+ void visitConstant(const SCEVConstant *C) { Subs.insert(C); }
+ void visitUnknown(const SCEVUnknown *U) { Subs.insert(U); }
+ void visitAddExpr(const SCEVAddExpr *E) {
+ Subs.insert(E);
+ for (auto *Op : E->operands())
+ visit(Op);
+ }
+ void visitMulExpr(const SCEVMulExpr *E) {
+ Subs.insert(E);
+ for (auto *Op : E->operands())
+ visit(Op);
+ }
+ void visitAddRecExpr(const SCEVAddRecExpr *E) {
+ Subs.insert(E);
+ for (auto *Op : E->operands())
+ visit(Op);
+ }
+ void visitSMaxExpr(const SCEVSMaxExpr *E) {
+ Subs.insert(E);
+ for (auto *Op : E->operands())
+ visit(Op);
+ }
+ void visitSMinExpr(const SCEVSMinExpr *E) {
+ Subs.insert(E);
+ for (auto *Op : E->operands())
+ visit(Op);
+ }
+ void visitUMinExpr(const SCEVUMinExpr *E) {
+ Subs.insert(E);
+ for (auto *Op : E->operands())
+ visit(Op);
+ }
+ void visitUMaxExpr(const SCEVUMaxExpr *E) {
+ Subs.insert(E);
+ for (auto *Op : E->operands())
+ visit(Op);
+ }
+ void visitMinMaxExpr(const SCEVMinMaxExpr *E) {
+ Subs.insert(E);
+ for (auto *Op : E->operands())
+ visit(Op);
+ }
+ void visitUDivExpr(const SCEVUDivExpr *E) {
+ Subs.insert(E);
+ visit(E->getLHS());
+ visit(E->getRHS());
+ }
+ void visitZeroExtendExpr(const SCEVZeroExtendExpr *E) {
+ Subs.insert(E);
+ visit(E->getOperand());
+ }
+ void visitSignExtendExpr(const SCEVSignExtendExpr *E) {
+ Subs.insert(E);
+ visit(E->getOperand());
+ }
+ void visitTruncateExpr(const SCEVTruncateExpr *E) {
+ Subs.insert(E);
+ visit(E->getOperand());
+ }
+ void visitCouldNotCompute(const SCEVCouldNotCompute *E) { Subs.insert(E); }
+ void visitVScale(const SCEVVScale *E) {
+ Subs.insert(E);
+ visitOperands(E->operands());
+ }
+ void visitPtrToIntExpr(const SCEVPtrToIntExpr *E) {
+ Subs.insert(E);
+ visitOperands(E->operands());
+ }
+ void visitSequentialUMinExpr(const SCEVSequentialUMinExpr *E) {
+ Subs.insert(E);
+ visitOperands(E->operands());
+ }
+};
+
+bool MemoryDepChecker::isInvariantLoadHoistable(
+ LoadInst *L, ScalarEvolution &SE, StoreInst **S, const SCEV **StepSCEV,
+ SmallVectorImpl<Instruction *> *Instructions) const {
+ assert(L != nullptr);
+ assert(InnermostLoop->isLoopInvariant(L->getPointerOperand()));
+
+ if (!MSSA)
+ return false;
+
+ MemoryAccess *MA = MSSA->getMemoryAccess(L);
+ auto QLoc = MemoryLocation::get(L);
+
+ SmallVector<StoreInst *> Stores;
+ SmallVector<LoadInst *> Loads;
+
+ for (auto &&I : *InnermostLoop->getHeader()) {
+ if (auto *Store = dyn_cast<StoreInst>(&I)) {
+ AliasResult AR = AA->alias(MemoryLocation::get(Store), QLoc);
+ if (AR == AliasResult::MustAlias)
+ Stores.push_back(Store);
+ }
+ if (auto *Load = dyn_cast<LoadInst>(&I)) {
+ AliasResult AR = AA->alias(MemoryLocation::get(Load), QLoc);
+ if (AR == AliasResult::MustAlias)
+ Loads.push_back(Load);
+ }
+ }
+
+ if (Loads.size() != 1 || Loads[0]->isVolatile() || Stores.size() != 1 ||
+ Stores[0]->isVolatile())
+ return false;
+
+ // We have the memory PHI, so we know where the backedge is. Find all memory
+ // accesses to the location we care about: there should be a single
+ // MemoryUse and a single MemoryDef. The MemoryUse should have the MemoryPhi
+ // as its transitive clobber, and the backedge should have the MemoryDef as
+ // its transitive clobber (must-alias).
+ MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(MA);
+ while (auto *MD = dyn_cast<MemoryUseOrDef>(Clobber)) {
+ Instruction *DefI = MD->getMemoryInst();
+
+ if (!DefI)
+ return false;
+
+ AliasResult AR = AA->alias(MemoryLocation::get(DefI), QLoc);
+
+ Clobber = MD->getDefiningAccess();
+
+ // We assume a runtime alias check will be used.
+ if (AR == AliasResult::MustAlias)
+ return false;
+ }
+
+ MemoryAccess *MS = MSSA->getMemoryAccess(Stores[0]);
+ MemoryAccess *StoreClobber = MSSA->getWalker()->getClobberingMemoryAccess(MS);
+ while (true) {
+ if (isa<MemoryPhi>(StoreClobber))
+ break;
+ if (auto *MD = dyn_cast<MemoryUseOrDef>(StoreClobber)) {
+ Instruction *DefI = MD->getMemoryInst();
+
+ if (!DefI)
+ return false;
+
+ AliasResult AR = AA->alias(MemoryLocation::get(DefI), QLoc);
+
+ StoreClobber = MD->getDefiningAccess();
+
+ if (AR == AliasResult::MustAlias)
+ return false;
+ }
+ }
+
+ if (!SE.isSCEVable(Stores[0]->getValueOperand()->getType()))
+ return false;
+
+ const SCEV *LoadSCEV = SE.getUnknown(L);
+ const SCEV *StoreSCEV = SE.getSCEV(Stores[0]->getValueOperand());
+
+ auto Step = SE.getMinusSCEV(StoreSCEV, LoadSCEV);
+
+ if (isa<SCEVCouldNotCompute>(Step) ||
+ !SE.isLoopInvariant(Step, InnermostLoop))
+ return false;
+
+ SmallVector<Instruction *, 4> WL;
+
+ SmallPtrSet<Instruction *, 4> Slice;
+ SmallPtrSet<const SCEV *, 4> Subs;
+ SCEVSubexprCollector Collector(Subs);
+ Collector.visit(StoreSCEV);
+
+ // Register all instructions that match the SCEV so that they can be
+ // removed when the load is hoisted and the SCEV is re-expanded
+ // outside the loop.
+ auto enqueueIfMatches = [&](Value *X) {
+ if (auto *XI = dyn_cast<Instruction>(X)) {
+ const SCEV *SX = SE.getSCEV(XI);
+ if (Subs.contains(SX) && Slice.insert(XI).second)
+ WL.push_back(XI);
+ }
+ };
+
+ enqueueIfMatches(Stores[0]->getValueOperand());
+
+ while (!WL.empty()) {
+ Instruction *I = WL.pop_back_val();
+
+ for (Value *Op : I->operands()) {
+ if (isa<Constant>(Op) || isa<Argument>(Op))
+ continue;
+ enqueueIfMatches(Op);
+ }
+ }
+
+ auto hasExternalUsers =
+ [&Stores](const SmallPtrSetImpl<Instruction *> &Slice) {
+ for (Instruction *I : Slice)
+ for (Use &U : I->uses())
+ if (auto *UserI = dyn_cast<Instruction>(U.getUser())) {
+ if (isa<DbgInfoIntrinsic>(UserI))
+ continue;
+ if (!Slice.count(UserI) &&
+ !std::count(Stores.begin(), Stores.end(), UserI))
+ return true;
+ }
+ return false;
+ };
+
+ if (hasExternalUsers(Slice))
+ return false;
+
+ if (S)
+ *S = Stores[0];
+ if (StepSCEV)
+ *StepSCEV = Step;
+
+ if (Instructions)
+ Instructions->insert(Instructions->end(), Slice.begin(), Slice.end());
+
+ return true;
+}
+
void MemoryDepChecker::addAccess(StoreInst *SI) {
visitPointers(SI->getPointerOperand(), *InnermostLoop,
[this, SI](Value *Ptr) {
@@ -2505,7 +2732,7 @@ bool LoopAccessInfo::canAnalyzeLoop() {
bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
const TargetLibraryInfo *TLI,
- DominatorTree *DT) {
+ DominatorTree *DT, MemorySSA *MSSA) {
// Holds the Load and Store instructions.
SmallVector<LoadInst *, 16> Loads;
SmallVector<StoreInst *, 16> Stores;
@@ -3064,7 +3291,8 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
const TargetTransformInfo *TTI,
const TargetLibraryInfo *TLI, AAResults *AA,
DominatorTree *DT, LoopInfo *LI,
- AssumptionCache *AC, bool AllowPartial)
+ AssumptionCache *AC, MemorySSA *MSSA,
+ bool AllowPartial)
: PSE(std::make_unique<PredicatedScalarEvolution>(*SE, *L)),
PtrRtChecking(nullptr), TheLoop(L), AllowPartial(AllowPartial) {
unsigned MaxTargetVectorWidthInBits = std::numeric_limits<unsigned>::max();
@@ -3075,11 +3303,12 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) * 2;
DepChecker = std::make_unique<MemoryDepChecker>(
- *PSE, AC, DT, L, SymbolicStrides, MaxTargetVectorWidthInBits, LoopGuards);
+ *PSE, AC, MSSA, DT, AA, L, SymbolicStrides, MaxTargetVectorWidthInBits,
+ LoopGuards);
PtrRtChecking =
std::make_unique<RuntimePointerChecking>(*DepChecker, SE, LoopGuards);
if (canAnalyzeLoop())
- CanVecMem = analyzeLoop(AA, LI, TLI, DT);
+ CanVecMem = analyzeLoop(AA, LI, TLI, DT, MSSA);
}
void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
@@ -3145,7 +3374,7 @@ const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L,
// or if it was created with a different value of AllowPartial.
if (Inserted || It->second->hasAllowPartial() != AllowPartial)
It->second = std::make_unique<LoopAccessInfo>(&L, &SE, TTI, TLI, &AA, &DT,
- &LI, AC, AllowPartial);
+ &LI, AC, MSSA, AllowPartial);
return *It->second;
}
@@ -3189,7 +3418,9 @@ LoopAccessInfoManager LoopAccessAnalysis::run(Function &F,
auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
auto &AC = FAM.getResult<AssumptionAnalysis>(F);
- return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI, &AC);
+ auto &MSSA = FAM.getResult<MemorySSAAnalysis>(F);
+ return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI, &AC,
+ &MSSA.getMSSA());
}
AnalysisKey LoopAccessAnalysis::Key;
diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
index 04039b885f3c5..72a2dcb294a57 100644
--- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
@@ -1010,7 +1010,7 @@ PreservedAnalyses LoopFlattenPass::run(LoopNest &LN, LoopAnalysisManager &LAM,
// this pass will simplify all loops that contain inner loops,
// regardless of whether anything ends up being flattened.
LoopAccessInfoManager LAIM(AR.SE, AR.AA, AR.DT, AR.LI, &AR.TTI, nullptr,
- &AR.AC);
+ &AR.AC, AR.MSSA);
for (Loop *InnerLoop : LN.getLoops()) {
auto *OuterLoop = InnerLoop->getParentLoop();
if (!OuterLoop)
diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
index 3aed643ee8065..e8d5fc870137f 100644
--- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
@@ -549,7 +549,8 @@ PreservedAnalyses LoopVersioningLICMPass::run(Loop &L, LoopAnalysisManager &AM,
const Function *F = L.getHeader()->getParent();
OptimizationRemarkEmitter ORE(F);
- LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, nullptr, nullptr, &LAR.AC);
+ LoopAccessInfoManager LAIs(*SE, *AA, *DT, LAR.LI, nullptr, nullptr, &LAR.AC,
+ LAR.MSSA);
if (!LoopVersioningLICM(AA, SE, &ORE, LAIs, LAR.LI, &L).run(DT))
return PreservedAnalyses::all();
return getLoopPassPreservedAnalyses();
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 04b05627fa769..99f23afc34694 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -465,6 +465,8 @@ class LoopVectorizationPlanner {
PredicatedScalarEvolution &PSE;
+ LoopAccessInfoManager *LAIs;
+
const LoopVectorizeHints &Hints;
OptimizationRemarkEmitter *ORE;
@@ -498,10 +500,10 @@ class LoopVectorizationPlanner {
Loop *L, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI,
const TargetTransformInfo &TTI, LoopVectorizationLegality *Legal,
LoopVectorizationCostModel &CM, InterleavedAccessInfo &IAI,
- PredicatedScalarEvolution &PSE, const LoopVectorizeHints &Hints,
- OptimizationRemarkEmitter *ORE)
+ PredicatedScalarEvolution &PSE, LoopAccessInfoManager *LAIs,
+ const LoopVectorizeHints &Hints, OptimizationRemarkEmitter *ORE)
: OrigLoop(L), LI(LI), DT(DT), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM),
- IAI(IAI), PSE(PSE), Hints(Hints), ORE(ORE) {}
+ IAI(IAI), PSE(PSE), LAIs(LAIs), Hints(Hints), ORE(ORE) {}
/// Build VPlans for the specified \p UserVF and \p UserIC if they are
/// non-zero or all applicable candidate VFs otherwise. If vectorization and
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 89893af5c1140..c8e1bed3e68ee 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8302,7 +8302,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
// candidates built later for specific VF ranges.
auto VPlan0 = VPlanTransforms::buildVPlan0(
OrigLoop, *LI, Legal->getWidestInductionType(),
- getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE);
+ getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE, LAIs);
auto MaxVFTimes2 = MaxVF * 2;
for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
@@ -8583,7 +8583,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
auto Plan = VPlanTransforms::buildVPlan0(
OrigLoop, *LI, Legal->getWidestInductionType(),
- getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE);
+ getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE, LAIs);
VPlanTransforms::handleEarlyExits(*Plan,
/*HasUncountableExit*/ false);
VPlanTransforms::addMiddleCheck(*Plan, /*RequiresScalarEpilogue*/ true,
@@ -9114,7 +9114,7 @@ static bool processLoopInVPlanNativePath(
TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
- LoopVectorizationRequirements &Requirements) {
+ LoopAccessInfoManager *LAIs, LoopVectorizationRequirements &Requirements) {
if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
@@ -9132,8 +9132,8 @@ static bool processLoopInVPlanNativePath(
// Use the planner for outer loop vectorization.
// TODO: CM is not used at this point inside the planner. Turn CM into an
// optional argument if we don't need it in the future.
- LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
- ORE);
+ LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, LAIs,
+ Hints, ORE);
// Get user vectorization factor.
ElementCount UserVF = Hints.getWidth();
@@ -9846,7 +9846,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// pipeline.
if (!L->isInnermost())
return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
- ORE, BFI, PSI, Hints, Requirements);
+ ORE, BFI, PSI, Hints, LAIs,
+ Requirements);
assert(L->isInnermost() && "Inner loop expected.");
@@ -9951,8 +9952,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
F, &Hints, IAI, PSI, BFI);
// Use the planner for vectorization.
- LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
- ORE);
+ LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, LAIs,
+ Hints, ORE);
// Get user vectorization factor and interleave count.
ElementCount UserVF = Hints.getWidth();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 4ffd5577d31a4..94015a81955ec 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -40,6 +40,10 @@ class PlainCFGBuilder {
// Vectorization plan that we are working on.
std::unique_ptr<VPlan> Plan;
+ PredicatedScalarEvolution *PSE;
+
+ LoopAccessInfoManager *LAIs;
+
// Builder of the VPlan instruction-level representation.
VPBuilder VPIRBuilder;
@@ -65,8 +69,10 @@ class PlainCFGBuilder {
void createVPInstructionsForVPBB(VPBasicBlock *VPBB, BasicBlock *BB);
public:
- PlainCFGBuilder(Loop *Lp, LoopInfo *LI)
- : TheLoop(Lp), LI(LI), Plan(std::make_unique<VPlan>(Lp)) {}
+ PlainCFGBuilder(Loop *Lp, LoopInfo *LI, PredicatedScalarEvolution *PSE,
+ LoopAccessInfoManager *LAIs)
+ : TheLoop(Lp), LI(LI), Plan(std::make_unique<VPlan>(Lp)), PSE(PSE),
+ LAIs(LAIs) {}
/// Build plain CFG for TheLoop and connect it to Plan's entry.
std::unique_ptr<VPlan> buildPlainCFG();
@@ -537,8 +543,9 @@ static void addInitialSkeleton(VPlan &Plan, Type *InductionTy, DebugLoc IVDL,
std::unique_ptr<VPlan>
VPlanTransforms::buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy,
- DebugLoc IVDL, PredicatedScalarEvolution &PSE) {
- PlainCFGBuilder Builder(TheLoop, &LI);
+ DebugLoc IVDL, PredicatedScalarEvolution &PSE,
+ LoopAccessInfoManager *LAIs) {
+ PlainCFGBuilder Builder(TheLoop, &LI, &PSE, LAIs);
std::unique_ptr<VPlan> VPlan0 = Builder.buildPlainCFG();
addInitialSkeleton(*VPlan0, InductionTy, IVDL, PSE, TheLoop);
return VPlan0;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index e3bde8a47dcbc..f3824e9cd2d5f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -99,7 +99,7 @@ struct VPlanTransforms {
/// >[ ] <-- original loop exit block(s), wrapped in VPIRBasicBlocks.
LLVM_ABI_FOR_TEST static std::unique_ptr<VPlan>
buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL,
- PredicatedScalarEvolution &PSE);
+ PredicatedScalarEvolution &PSE, LoopAccessInfoManager *LAIs);
/// Update \p Plan to account for all early exits.
LLVM_ABI_FOR_TEST static void handleEarlyExits(VPlan &Plan,
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/invalidation.ll b/llvm/test/Analysis/LoopAccessAnalysis/invalidation.ll
index fb3af609dd2c6..d40e238df3a42 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/invalidation.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/invalidation.ll
@@ -11,6 +11,7 @@
; CHECK-AA-NEXT: Running analysis: LoopAccessAnalysis on foo
; CHECK-AA: Running pass: InvalidateAnalysisPass
; CHECK-AA-NEXT: Invalidating analysis: AAManager on foo
+; CHECK-AA-NEXT: Invalidating analysis: MemorySSAAnalysis on foo
; CHECK-AA-NEXT: Invalidating analysis: LoopAccessAnalysis on foo
; CHECK-AA-NEXT: Running pass: LoopAccessInfoPrinterPass on foo
; CHECK-AA-NEXT: Running analysis: LoopAccessAnalysis on foo
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
index 7471355603640..cdbfbee59743a 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
@@ -27,6 +27,8 @@ class VPlanSlpTest : public VPlanTestIRBase {
std::unique_ptr<LoopAccessInfo> LAI;
std::unique_ptr<PredicatedScalarEvolution> PSE;
std::unique_ptr<InterleavedAccessInfo> IAI;
+ std::unique_ptr<TargetTransformInfo> TTI;
+ std::unique_ptr<MemorySSA> MSSA;
VPlanSlpTest()
: DL("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-"
@@ -41,8 +43,10 @@ class VPlanSlpTest : public VPlanTestIRBase {
AARes.reset(new AAResults(*TLI));
AARes->addAAResult(*BasicAA);
PSE.reset(new PredicatedScalarEvolution(*SE, *L));
- LAI.reset(
- new LoopAccessInfo(L, &*SE, nullptr, &*TLI, &*AARes, &*DT, &*LI, &*AC));
+ TTI = std::make_unique<TargetTransformInfo>(DL);
+ MSSA.reset(new MemorySSA(F, &*AARes, &*DT));
+ LAI.reset(new LoopAccessInfo(L, &*SE, nullptr, &*TLI, &*AARes, &*DT, &*LI,
+ &*AC, &*MSSA));
IAI.reset(new InterleavedAccessInfo(*PSE, L, &*DT, &*LI, &*LAI));
IAI->analyzeInterleaving(false);
return {Plan, *IAI};
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
index ed6e13b4add3d..7c6c2b4cdc54b 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
@@ -18,7 +18,10 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/TargetTransformInfoImpl.h"
#include "llvm/AsmParser/Parser.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Verifier.h"
@@ -27,6 +30,11 @@
namespace llvm {
+struct TargetTransformInfoImpl : TargetTransformInfoImplBase {
+ TargetTransformInfoImpl(const DataLayout &DL)
+ : TargetTransformInfoImplBase(DL) {}
+};
+
/// Helper class to create a module from an assembly string and VPlans for a
/// given loop entry block.
class VPlanTestIRBase : public testing::Test {
@@ -41,6 +49,11 @@ class VPlanTestIRBase : public testing::Test {
std::unique_ptr<ScalarEvolution> SE;
std::unique_ptr<TargetLibraryInfoImpl> TLII;
std::unique_ptr<TargetLibraryInfo> TLI;
+ std::unique_ptr<TargetTransformInfoImplBase> TTII;
+ std::unique_ptr<TargetTransformInfo> TTI;
+ std::unique_ptr<AAResults> AA;
+ std::unique_ptr<MemorySSA> MSSA;
+ std::unique_ptr<LoopAccessInfoManager> LAIs;
VPlanTestIRBase()
: DL("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-"
@@ -54,6 +67,7 @@ class VPlanTestIRBase : public testing::Test {
EXPECT_TRUE(M);
TLII = std::make_unique<TargetLibraryInfoImpl>(M->getTargetTriple());
TLI = std::make_unique<TargetLibraryInfo>(*TLII);
+ TTI = std::make_unique<TargetTransformInfo>(DL);
return *M;
}
@@ -62,6 +76,10 @@ class VPlanTestIRBase : public testing::Test {
LI.reset(new LoopInfo(*DT));
AC.reset(new AssumptionCache(F));
SE.reset(new ScalarEvolution(F, *TLI, *AC, *DT, *LI));
+ AA.reset(new AAResults(*TLI));
+ MSSA.reset(new MemorySSA(F, &*AA, &*DT));
+ LAIs.reset(new LoopAccessInfoManager(*SE, *AA, *DT, *LI, &*TTI, &*TLI, &*AC,
+ &*MSSA));
}
/// Build the VPlan for the loop starting from \p LoopHeader.
@@ -73,7 +91,7 @@ class VPlanTestIRBase : public testing::Test {
Loop *L = LI->getLoopFor(LoopHeader);
PredicatedScalarEvolution PSE(*SE, *L);
auto Plan = VPlanTransforms::buildVPlan0(L, *LI, IntegerType::get(*Ctx, 64),
- {}, PSE);
+ {}, PSE, LAIs.get());
VPlanTransforms::handleEarlyExits(*Plan, HasUncountableExit);
VPlanTransforms::addMiddleCheck(*Plan, true, false);
>From cf0df88c44f4a71a734c0a88da8bb53b3d00f858 Mon Sep 17 00:00:00 2001
From: Felipe Magno de Almeida <felipe at expertise.dev>
Date: Wed, 12 Nov 2025 14:22:03 -0300
Subject: [PATCH 3/3] [LoopVectorize][LAA] Hoist load in memory IV to allow
vectorization
Adds a VPScalarIVPromotionRecipe recipe that promotes a memory induction
variable (IV) to a scalar IV.
The SCEV step between the load and the store is multiplied by VFxUF (or
EVL) so that the promoted scalar IV advances once per vector iteration.
The kind of code this patch allows to be vectorized looks like this:
while.body:
%theFirst.addr.0112 = phi ptr [ %incdec.ptr9, %while.body ], [ %theFirst, %while.body.preheader ]
%thePointer.0111 = phi ptr [ %incdec.ptr, %while.body ], [ %add.ptr.i, %while.body.preheader ]
%1 = load i16, ptr %theFirst.addr.0112, align 2
store i16 %1, ptr %thePointer.0111, align 2
%incdec.ptr = getelementptr inbounds nuw i8, ptr %thePointer.0111, i64 2
%2 = load i64, ptr %m_size_ptr, align 8
%inc = add i64 %2, 1
store i64 %inc, ptr %m_size_ptr, align 8
%incdec.ptr9 = getelementptr inbounds nuw i8, ptr %theFirst.addr.0112, i64 2
%cmp7.not = icmp eq ptr %incdec.ptr9, %theLast
br i1 %cmp7.not, label %cleanup.loopexit, label %while.body
As you can see, %m_size_ptr is a loop-invariant pointer, so the memory IV
stored through it can be promoted to a scalar IV, after which the loop can
be vectorized.
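To make the "multiply the SCEV step by VFxUF or EVL" part concrete, here is a
hedged scalar sketch of the shape the promoted IV takes after vectorization,
modeled on VPScalarIVPromotionRecipe::execute() in this patch; it is
illustrative C++ rather than the emitted IR, and all parameter names are
assumptions.
```
#include <cstdint>

// Illustrative only: per vector iteration, the promoted scalar IV is advanced
// by Step * VFxUF (or Step * EVL) and stored back once, replacing the
// per-element load/add/store of the memory IV.
void copy_vectorized(std::uint16_t *Dst, const std::uint16_t *Src,
                     std::uint64_t NumVecIters, std::uint64_t VFxUF,
                     std::uint64_t Step, std::uint64_t *SizePtr) {
  std::uint64_t Size = *SizePtr; // hoisted load, now in the vector preheader
  for (std::uint64_t I = 0; I < NumVecIters; ++I) {
    // ... a VFxUF-wide i16 copy from Src to Dst would go here ...
    Size += Step * VFxUF; // the promoted scalar IV advances by Step * VFxUF
    *SizePtr = Size;      // one scalar store per vector iteration
    Src += VFxUF;
    Dst += VFxUF;
  }
}
```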
---
llvm/lib/Analysis/LoopAccessAnalysis.cpp | 25 ++++-
llvm/lib/Transforms/Scalar/LoopDistribute.cpp | 6 +-
.../Vectorize/LoopVectorizationPlanner.h | 2 +
.../Transforms/Vectorize/LoopVectorize.cpp | 36 ++++++-
llvm/lib/Transforms/Vectorize/VPlan.h | 58 +++++++++-
.../Transforms/Vectorize/VPlanAnalysis.cpp | 8 +-
.../Vectorize/VPlanConstruction.cpp | 100 +++++++++++++++++-
.../Transforms/Vectorize/VPlanPatternMatch.h | 15 +++
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 55 +++++++++-
.../Transforms/Vectorize/VPlanTransforms.cpp | 45 +++++++-
llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 3 +-
llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 5 +-
llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 +
.../Transforms/Vectorize/VPlanVerifier.cpp | 4 +
.../invariant-dependence-before.ll | 53 +++++-----
.../AArch64/conditional-branches-cost.ll | 2 +-
.../AArch64/partial-reduce-dot-product.ll | 2 +-
.../LoopVectorize/AArch64/partial-reduce.ll | 4 +-
.../AArch64/reduction-recurrence-costs-sve.ll | 4 +-
.../AArch64/scalable-avoid-scalarization.ll | 4 +-
.../AArch64/scalable-strict-fadd.ll | 8 +-
.../AArch64/sve-interleaved-accesses.ll | 24 ++---
.../AArch64/sve-live-out-pointer-induction.ll | 4 +-
.../LoopVectorize/AArch64/sve-multiexit.ll | 8 +-
.../sve-runtime-check-size-based-threshold.ll | 2 +-
.../LoopVectorize/AArch64/sve-widen-gep.ll | 2 +-
.../LoopVectorize/AArch64/sve-widen-phi.ll | 2 +-
.../AArch64/tail-folding-styles.ll | 2 +-
...row-interleave-to-widen-memory-scalable.ll | 4 +-
.../RISCV/tail-folding-interleave.ll | 4 +-
.../LoopVectorize/X86/uniform_mem_op.ll | 17 ++-
.../X86/vectorize-interleaved-accesses-gap.ll | 55 +---------
.../LoopVectorize/memory-iv-promotion.ll | 66 +++---------
.../LoopVectorize/pointer-induction.ll | 6 +-
.../reuse-lcssa-phi-scev-expansion.ll | 12 +--
.../scalable-first-order-recurrence.ll | 24 ++---
.../LoopVectorize/vplan-printing.ll | 4 +-
37 files changed, 463 insertions(+), 213 deletions(-)
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 34586ed9cd5c7..4a8871ddbb7eb 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -2329,6 +2329,19 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
int64_t StrideBPtrInt = *StrideBPtr;
LLVM_DEBUG(dbgs() << "LAA: Src induction step: " << StrideAPtrInt
<< " Sink induction step: " << StrideBPtrInt << "\n");
+
+ if (!StrideAPtrInt && !StrideBPtrInt && !(AIsWrite && BIsWrite) &&
+ (AIsWrite || BIsWrite) && !isa<UndefValue>(APtr) &&
+ InnermostLoop->isLoopInvariant(APtr) &&
+ InnermostLoop->isLoopInvariant(BPtr)) {
+ LoadInst *L = dyn_cast<LoadInst>(AIsWrite ? BInst : AInst);
+ if (L && InnermostLoop->isLoopInvariant(L->getPointerOperand()) &&
+ isInvariantLoadHoistable(L, SE, nullptr, nullptr, nullptr))
+ ShouldRetryWithRuntimeChecks = true;
+
+ return MemoryDepChecker::Dependence::Unknown;
+ }
+
// At least Src or Sink are loop invariant and the other is strided or
// invariant. We can generate a runtime check to disambiguate the accesses.
if (!StrideAPtrInt || !StrideBPtrInt)
@@ -2942,9 +2955,15 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
// See if there is an unsafe dependency between a load to a uniform address and
// store to the same uniform address.
if (UniformStores.contains(Ptr)) {
- LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform "
- "load and uniform store to the same address!\n");
- HasLoadStoreDependenceInvolvingLoopInvariantAddress = true;
+ auto &SE = *PSE->getSE();
+ if (TheLoop->isLoopInvariant(LD->getPointerOperand()) &&
+ !getDepChecker().isInvariantLoadHoistable(LD, SE, nullptr, nullptr,
+ nullptr)) {
+ LLVM_DEBUG(
+ dbgs() << "LAA: Found an unsafe dependency between a uniform "
+ "load and uniform store to the same address!\n");
+ HasLoadStoreDependenceInvolvingLoopInvariantAddress = true;
+ }
}
MemoryLocation Loc = MemoryLocation::get(LD);
diff --git a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
index 0c8b9043fcbbb..ebda2c96b75e6 100644
--- a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -680,12 +680,14 @@ class LoopDistributeForLoop {
// Currently, we only distribute to isolate the part of the loop with
// dependence cycles to enable partial vectorization.
- if (LAI->canVectorizeMemory())
+ if (!LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress() &&
+ LAI->canVectorizeMemory())
return fail("MemOpsCanBeVectorized",
"memory operations are safe for vectorization");
auto *Dependences = LAI->getDepChecker().getDependences();
- if (!Dependences || Dependences->empty())
+ if (!LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress() &&
+ (!Dependences || Dependences->empty()))
return fail("NoUnsafeDeps", "no unsafe dependences to isolate");
LLVM_DEBUG(dbgs() << "LDist: Found a candidate loop: "
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 99f23afc34694..0ffabdda21bdf 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -630,6 +630,8 @@ class LoopVectorizationPlanner {
VPRecipeBuilder &RecipeBuilder,
ElementCount MinVF);
+ void adjustScalarIVPromotions(VPlanPtr &Plan);
+
/// Attach the runtime checks of \p RTChecks to \p Plan.
void attachRuntimeChecks(VPlan &Plan, GeneratedRTChecks &RTChecks,
bool HasBranchWeights) const;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index c8e1bed3e68ee..c683e1df8664b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4092,6 +4092,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPEVLBasedIVPHISC:
case VPDef::VPPredInstPHISC:
case VPDef::VPBranchOnMaskSC:
+ case VPDef::VPScalarIVPromotionRecipeSC:
continue;
case VPDef::VPReductionSC:
case VPDef::VPActiveLaneMaskPHISC:
@@ -7523,6 +7524,14 @@ BasicBlock *EpilogueVectorizerEpilogueLoop::createVectorizedLoopSkeleton() {
OriginalScalarPH->setName("vec.epilog.iter.check");
VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(OriginalScalarPH);
VPBasicBlock *OldEntry = Plan.getEntry();
+
+ for (VPRecipeBase &R : make_early_inc_range(*OldEntry))
+ // Move hoisted loads to split PreHeader
+ if (auto RepR = dyn_cast<VPReplicateRecipe>(&R)) {
+ RepR->removeFromParent();
+ VectorPHVPBB->appendRecipe(RepR);
+ }
+
for (auto &R : make_early_inc_range(*OldEntry)) {
// Skip moving VPIRInstructions (including VPIRPhis), which are unmovable by
// defining.
@@ -7532,6 +7541,7 @@ BasicBlock *EpilogueVectorizerEpilogueLoop::createVectorizedLoopSkeleton() {
}
VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
+
Plan.setEntry(NewEntry);
// OldEntry is now dead and will be cleaned up when the plan gets destroyed.
@@ -8324,6 +8334,23 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
}
}
+void LoopVectorizationPlanner::adjustScalarIVPromotions(VPlanPtr &Plan) {
+ VPScalarIVPromotionRecipe *Recipe = nullptr;
+
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_deep(Plan->getVectorLoopRegion())))
+ for (VPRecipeBase &R : *VPBB)
+ if (auto *ScalarIV = dyn_cast<VPScalarIVPromotionRecipe>(&R)) {
+ assert(!Recipe && "Only one scalar IV promotion is supported");
+ Recipe = ScalarIV;
+ }
+
+ if (!Recipe)
+ return;
+
+ Recipe->setVFxUF(&Plan->getVFxUF());
+}
+
VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) {
@@ -8434,11 +8461,12 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// latter are added above for masking.
// FIXME: Migrate code relying on the underlying instruction from VPlan0
// to construct recipes below to not use the underlying instruction.
- if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe, VPBlendRecipe>(
- &R) ||
+ if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe, VPBlendRecipe,
+ VPScalarIVPromotionRecipe>(&R) ||
(isa<VPInstruction>(&R) && !UnderlyingValue))
continue;
- assert(isa<VPInstruction>(&R) && UnderlyingValue && "unsupported recipe");
+ assert((isa<VPInstruction, VPReplicateRecipe>(&R) && UnderlyingValue &&
+ "unsupported recipe"));
// TODO: Gradually replace uses of underlying instruction by analyses on
// VPlan.
@@ -8514,6 +8542,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// Adjust the recipes for any inloop reductions.
adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
+ adjustScalarIVPromotions(Plan);
+
// Apply mandatory transformation to handle FP maxnum/minnum reduction with
// NaNs if possible, bail out otherwise.
if (!VPlanTransforms::runPass(VPlanTransforms::handleMaxMinNumReductions,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 092114db95e9c..e650dd9fdf65e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -554,6 +554,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPWidenPointerInductionSC:
case VPRecipeBase::VPReductionPHISC:
case VPRecipeBase::VPPartialReductionSC:
+ case VPRecipeBase::VPScalarIVPromotionRecipeSC:
return true;
case VPRecipeBase::VPBranchOnMaskSC:
case VPRecipeBase::VPInterleaveEVLSC:
@@ -580,10 +581,12 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
/// Returns the underlying instruction.
Instruction *getUnderlyingInstr() {
- return cast<Instruction>(getUnderlyingValue());
+ return getUnderlyingValue() ? dyn_cast<Instruction>(getUnderlyingValue())
+ : nullptr;
}
const Instruction *getUnderlyingInstr() const {
- return cast<Instruction>(getUnderlyingValue());
+ return getUnderlyingValue() ? dyn_cast<Instruction>(getUnderlyingValue())
+ : nullptr;
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -2312,7 +2315,8 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe {
VPFirstOrderRecurrencePHIRecipe *clone() override {
return new VPFirstOrderRecurrencePHIRecipe(
- cast<PHINode>(getUnderlyingInstr()), *getOperand(0));
+ getUnderlyingInstr() ? cast<PHINode>(getUnderlyingInstr()) : nullptr,
+ *getOperand(0));
}
void execute(VPTransformState &State) override;
@@ -3475,6 +3479,54 @@ class VPExpandSCEVRecipe : public VPSingleDefRecipe {
const SCEV *getSCEV() const { return Expr; }
};
+struct LLVM_ABI_FOR_TEST VPScalarIVPromotionRecipe : public VPSingleDefRecipe {
+ VPScalarIVPromotionRecipe(std::initializer_list<VPValue *> Operands,
+ DebugLoc DL = DebugLoc::getUnknown())
+ : VPSingleDefRecipe(VPDef::VPScalarIVPromotionRecipeSC, Operands, DL) {}
+
+ VP_CLASSOF_IMPL(VPDef::VPScalarIVPromotionRecipeSC)
+
+ bool isSingleScalar() const { return true; }
+
+ VPScalarIVPromotionRecipe *clone() override {
+ assert(getNumOperands() == 3 || getNumOperands() == 4);
+ if (getNumOperands() == 3)
+ return new VPScalarIVPromotionRecipe(
+ {getOperand(0), getOperand(1), getOperand(2)}, getDebugLoc());
+ return new VPScalarIVPromotionRecipe(
+ {getOperand(0), getOperand(1), getOperand(2), getOperand(3)},
+ getDebugLoc());
+ }
+
+ VPValue *getVFxUF() { return getOperand(3); }
+ void setVFxUF(VPValue *V) {
+ if (getNumOperands() == 3) {
+ addOperand(V);
+ } else {
+ setOperand(3, V);
+ }
+ }
+
+ void execute(VPTransformState &State) override;
+
+ InstructionCost computeCost(ElementCount VF,
+ VPCostContext &Ctx) const override {
+ return 0;
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ bool usesScalars(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return true;
+ }
+};
+
/// Canonical scalar induction phi of the vector loop. Starting at the specified
/// start value (either 0 or the resume value when vectorizing the epilogue
/// loop). VPWidenCanonicalIVRecipe represents the vector version of the
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 80a2e4bc3f754..8ec552b9145c5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -136,6 +136,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
return inferScalarType(R->getOperand(0));
case VPInstruction::BranchOnCond:
case VPInstruction::BranchOnCount:
+ case Instruction::Store:
return Type::getVoidTy(Ctx);
default:
break;
@@ -289,9 +290,10 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
.Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
VPScalarIVStepsRecipe, VPWidenGEPRecipe, VPVectorPointerRecipe,
VPVectorEndPointerRecipe, VPWidenCanonicalIVRecipe,
- VPPartialReductionRecipe>([this](const VPRecipeBase *R) {
- return inferScalarType(R->getOperand(0));
- })
+ VPPartialReductionRecipe, VPScalarIVPromotionRecipe>(
+ [this](const VPRecipeBase *R) {
+ return inferScalarType(R->getOperand(0));
+ })
// VPInstructionWithType must be handled before VPInstruction.
.Case<VPInstructionWithType, VPWidenIntrinsicRecipe,
VPWidenCastRecipe>(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 94015a81955ec..cad73154646f1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -29,6 +29,15 @@ using namespace llvm;
using namespace VPlanPatternMatch;
namespace {
+
+struct ScalarPromotionInfo {
+ LoadInst *Load;
+ StoreInst *Store;
+ const SCEV *Step = nullptr;
+
+ SmallVector<Instruction *, 1> Instructions;
+};
+
// Class that is used to build the plain CFG for the incoming IR.
class PlainCFGBuilder {
// The outermost loop of the input loop nest considered for vectorization.
@@ -58,6 +67,8 @@ class PlainCFGBuilder {
// Hold phi node's that need to be fixed once the plain CFG has been built.
SmallVector<PHINode *, 8> PhisToFix;
+ SmallVector<ScalarPromotionInfo, 2> ScalarPromotions;
+
// Utility functions.
void setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB);
void fixHeaderPhis();
@@ -68,6 +79,8 @@ class PlainCFGBuilder {
VPValue *getOrCreateVPOperand(Value *IRVal);
void createVPInstructionsForVPBB(VPBasicBlock *VPBB, BasicBlock *BB);
+ void analyzeScalarPromotion(VPBasicBlock *VPBB, BasicBlock *BB);
+
public:
PlainCFGBuilder(Loop *Lp, LoopInfo *LI, PredicatedScalarEvolution *PSE,
LoopAccessInfoManager *LAIs)
@@ -172,11 +185,36 @@ VPValue *PlainCFGBuilder::getOrCreateVPOperand(Value *IRVal) {
return NewVPVal;
}
+void PlainCFGBuilder::analyzeScalarPromotion(VPBasicBlock *VPBB,
+ BasicBlock *BB) {
+ for (Instruction &InstRef : BB->instructionsWithoutDebug(false)) {
+ Instruction *Inst = &InstRef;
+
+ if (auto *Load = dyn_cast<LoadInst>(Inst)) {
+ auto Loop = LI->getLoopFor(Inst->getParent());
+ auto &LAI = LAIs->getInfo(*Loop);
+ StoreInst *Store = nullptr;
+ const SCEV *Step = nullptr;
+
+ if (Loop->isLoopInvariant(Load->getPointerOperand())) {
+ SmallVector<Instruction *, 4> Is;
+ if (LAI.getDepChecker().isInvariantLoadHoistable(Load, *PSE->getSE(),
+ &Store, &Step, &Is)) {
+ ScalarPromotions.push_back(ScalarPromotionInfo{Load, Store, Step});
+ ScalarPromotions.back().Instructions.insert(
+ ScalarPromotions.back().Instructions.end(), Is.begin(), Is.end());
+ }
+ }
+ }
+ }
+}
+
// Create new VPInstructions in a VPBasicBlock, given its BasicBlock
// counterpart. This function must be invoked in RPO so that the operands of a
// VPInstruction in \p BB have been visited before (except for Phi nodes).
void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
BasicBlock *BB) {
+ DenseSet<Value *> SkipInsts;
VPIRBuilder.setInsertPoint(VPBB);
// TODO: Model and preserve debug intrinsics in VPlan.
for (Instruction &InstRef : BB->instructionsWithoutDebug(false)) {
@@ -234,6 +272,46 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
VPPredToIncomingValue.lookup(Pred->getExitingBasicBlock()));
}
} else {
+ auto skip = false;
+ for (auto &SP : ScalarPromotions) {
+ if (Inst == SP.Load) {
+ VPBasicBlock *PreheaderVPBB =
+ Plan->getVectorPreheader(); // vector preheader, not the IR loop
+ // preheader
+ if (!PreheaderVPBB)
+ PreheaderVPBB = Plan->getEntry();
+
+ SmallVector<VPValue *, 4> VPOperands;
+ for (Value *Op : SP.Load->operands()) {
+ VPOperands.push_back(getOrCreateVPOperand(Op));
+ }
+ auto *Load = new VPReplicateRecipe(SP.Load, VPOperands,
+ /*IsSingleScalar=*/true);
+ auto SCEVRecipe = new VPExpandSCEVRecipe(SP.Step);
+ PreheaderVPBB->appendRecipe(SCEVRecipe);
+ PreheaderVPBB->appendRecipe(Load);
+
+ auto StepValue = SCEVRecipe->getVPSingleValue();
+
+ NewR = new VPScalarIVPromotionRecipe(
+ {Load, StepValue,
+ getOrCreateVPOperand(SP.Store->getPointerOperand())},
+ SP.Load->getDebugLoc());
+ VPBB->appendRecipe(NewR);
+ skip = true;
+ break;
+ } else if (Inst == SP.Store) {
+ skip = true;
+ break;
+ } else if (Inst == SP.Instructions[0]) {
+ skip = true;
+ break;
+ }
+ }
+
+ if (skip)
+ continue;
+
// Translate LLVM-IR operands into VPValue operands and set them in the
// new VPInstruction.
SmallVector<VPValue *, 4> VPOperands;
@@ -280,6 +358,8 @@ std::unique_ptr<VPlan> PlainCFGBuilder::buildPlainCFG() {
IRDef2VPValue[&I] = Plan->getOrAddLiveIn(&I);
}
+ // dbgs() << "ECHO 9.1 "; Plan->dump();
+
LoopBlocksRPO RPO(TheLoop);
RPO.perform(LI);
@@ -289,6 +369,8 @@ std::unique_ptr<VPlan> PlainCFGBuilder::buildPlainCFG() {
// Set VPBB predecessors in the same order as they are in the incoming BB.
setVPBBPredsFromBB(VPBB, BB);
+ analyzeScalarPromotion(VPBB, BB);
+
// Create VPInstructions for BB.
createVPInstructionsForVPBB(VPBB, BB);
@@ -733,8 +815,11 @@ void VPlanTransforms::addMinimumIterationCheck(
// Don't execute the vector loop if (UMax - n) < (VF * UF).
// FIXME: Should only check VF * UF, but currently checks Step=max(VF*UF,
// minProfitableTripCount).
- TripCountCheck = Builder.createICmp(ICmpInst::ICMP_ULT, DistanceToMax,
- Builder.createExpandSCEV(Step), DL);
+ TripCountCheck =
+ Builder.createICmp(ICmpInst::ICMP_ULT, DistanceToMax,
+ VPBuilder(EntryVPBB, EntryVPBB->getFirstNonPhi())
+ .createExpandSCEV(Step),
+ DL);
} else {
// TripCountCheck = false, folding tail implies positive vector trip
// count.
@@ -752,7 +837,9 @@ void VPlanTransforms::addMinimumIterationCheck(
TripCount, Step)) {
// Generate the minimum iteration check only if we cannot prove the
// check is known to be true, or known to be false.
- VPValue *MinTripCountVPV = Builder.createExpandSCEV(Step);
+ VPValue *MinTripCountVPV =
+ VPBuilder(EntryVPBB, EntryVPBB->getFirstNonPhi())
+ .createExpandSCEV(Step);
TripCountCheck = Builder.createICmp(
CmpPred, TripCountVPV, MinTripCountVPV, DL, "min.iters.check");
} // else step known to be < trip count, use TripCountCheck preset to false.
@@ -774,8 +861,11 @@ void VPlanTransforms::addMinimumVectorEpilogueIterationCheck(
// Add the minimum iteration check for the epilogue vector loop.
VPValue *TC = Plan.getOrAddLiveIn(TripCount);
VPBuilder Builder(cast<VPBasicBlock>(Plan.getEntry()));
- VPValue *VFxUF = Builder.createExpandSCEV(SE.getElementCount(
- TripCount->getType(), (EpilogueVF * EpilogueUF), SCEV::FlagNUW));
+ VPValue *VFxUF =
+ VPBuilder(cast<VPBasicBlock>(Plan.getEntry()),
+ cast<VPBasicBlock>(Plan.getEntry())->getFirstNonPhi())
+ .createExpandSCEV(SE.getElementCount(
+ TripCount->getType(), (EpilogueVF * EpilogueUF), SCEV::FlagNUW));
VPValue *Count = Builder.createNaryOp(
Instruction::Sub, {TC, Plan.getOrAddLiveIn(VectorTripCount)},
DebugLoc::getUnknown(), "n.vec.remaining");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 47d8cc260511e..6f1029d545307 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -92,6 +92,17 @@ struct deferredval_ty {
/// whichever value m_VPValue(X) populated.
inline deferredval_ty m_Deferred(VPValue *const &V) { return V; }
+template <typename SubT> struct RecipeBindValue {
+ SubT Sub;
+ VPValue *&Out;
+ bool match(const VPValue *V) const {
+ if (!Sub.match(V))
+ return false;
+ Out = const_cast<VPValue *>(V);
+ return true;
+ }
+};
+
/// Match an integer constant or vector of constants if Pred::isValue returns
/// true for the APInt. \p BitWidth optionally specifies the bitwidth the
/// matched constant must have. If it is 0, the matched constant can have any
@@ -306,6 +317,10 @@ struct Recipe_match {
});
}
+ auto bind(VPValue *&Out) const & {
+ return RecipeBindValue<decltype(*this)>{*this, Out};
+ }
+
private:
template <typename RecipeTy>
static bool matchRecipeAndOpcode(const VPRecipeBase *R) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index f5528ab7b2bbe..2abfe8843c470 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -37,6 +37,7 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>
using namespace llvm;
@@ -672,6 +673,12 @@ Value *VPInstruction::generate(VPTransformState &State) {
case Instruction::PHI: {
llvm_unreachable("should be handled by VPPhi::execute");
}
+ case Instruction::Store: {
+ assert(vputils::onlyFirstLaneUsed(this) && "Should be scalar store");
+ Value *V = State.get(getOperand(0), true);
+ Value *P = State.get(getOperand(1), true);
+ return Builder.CreateStore(V, P);
+ }
case Instruction::Select: {
bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
Value *Cond =
@@ -1293,7 +1300,8 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
bool VPInstruction::usesFirstLaneOnly(const VPValue *Op) const {
assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
- if (Instruction::isBinaryOp(getOpcode()) || Instruction::isCast(getOpcode()))
+ if (Instruction::isBinaryOp(getOpcode()) ||
+ Instruction::isCast(getOpcode()) || getOpcode() == Instruction::Store)
return vputils::onlyFirstLaneUsed(this);
switch (getOpcode()) {
@@ -1546,7 +1554,52 @@ void VPPhi::execute(VPTransformState &State) {
State.set(this, NewPhi, VPLane(0));
}
+void VPScalarIVPromotionRecipe::execute(VPTransformState &State) {
+ auto &Builder = State.Builder;
+ State.setDebugLocFrom(getDebugLoc());
+
+ Value *VL = State.get(getVFxUF(), VPLane(0));
+ Type *Ty = State.get(getOperand(0), VPLane(0))->getType();
+ VL = Builder.CreateZExtOrTrunc(VL, Ty);
+
+ auto PhiInsertPoint =
+ State.CFG.VPBB2IRBB[getParent()->getExitingBasicBlock()]
+ ->getFirstNonPHIIt();
+ auto DefaultInsertPoint = State.Builder.GetInsertPoint();
+
+ State.Builder.SetInsertPoint(PhiInsertPoint);
+ auto Phi = Builder.CreatePHI(State.TypeAnalysis.inferScalarType(this), 2, "");
+ State.Builder.SetInsertPoint(DefaultInsertPoint);
+ auto EntryValue = State.get(getOperand(0), VPLane(0));
+ VPBlockBase *Pred = getParent()->getPredecessors()[0];
+ auto *PredVPBB = Pred->getExitingBasicBlock();
+ Phi->addIncoming(EntryValue, State.CFG.VPBB2IRBB[PredVPBB]);
+
+ auto SCEVStep = State.get(getOperand(1), VPLane(0));
+ SCEVStep = Builder.CreateZExtOrTrunc(SCEVStep, Ty);
+
+ auto Mul = Builder.CreateNAryOp(Instruction::Mul, {SCEVStep, VL});
+ auto Add = Builder.CreateNAryOp(Instruction::Add, {Phi, Mul});
+
+ auto Pointer = State.get(getOperand(2), VPLane(0));
+ Builder.CreateStore(Add, Pointer);
+
+ Phi->addIncoming(Add, dyn_cast<Instruction>(Add)->getParent());
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPScalarIVPromotionRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
+ printAsOperand(O, SlotTracker);
+ O << " = Scalar Promotion IV ";
+ printOperands(O, SlotTracker);
+
+ if (auto DL = getDebugLoc()) {
+ O << ", !dbg ";
+ DL.print(O);
+ }
+}
void VPPhi::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "EMIT" << (isSingleScalar() ? "-SCALAR" : "") << " ";
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 382521d090f4e..bb4481ba9ec71 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2714,14 +2714,16 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
return match(U,
m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
m_Specific(&Plan.getVFxUF()))) ||
- isa<VPWidenPointerInductionRecipe>(U);
+ isa<VPWidenPointerInductionRecipe>(U) ||
+ isa<VPScalarIVPromotionRecipe>(U);
}) &&
"Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
"increment of the canonical induction.");
Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
// Only replace uses in VPWidenPointerInductionRecipe; The increment of the
// canonical induction must not be updated.
- return isa<VPWidenPointerInductionRecipe>(U);
+ return isa<VPWidenPointerInductionRecipe>(U) ||
+ isa<VPScalarIVPromotionRecipe>(U);
});
// Defer erasing recipes till the end so that we don't invalidate the
@@ -2946,9 +2948,10 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
VPValue *AVL;
+ VPValue *EVL;
[[maybe_unused]] bool FoundAVL =
- match(EVLIncrement,
- m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));
+ match(EVLIncrement, m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL)).bind(EVL)),
+ m_Specific(EVLPhi)));
assert(FoundAVL && "Didn't find AVL?");
// The AVL may be capped to a safe distance.
@@ -3004,6 +3007,40 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
Plan.getConstantInt(AVLTy, 0));
Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp);
LatchExitingBr->eraseFromParent();
+
+ SmallVector<VPRecipeBase *> RecipesToErase;
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_shallow(Plan.getEntry()))) {
+ for (VPRecipeBase &R : *VPBB)
+ if (auto *ScalarIV = dyn_cast<VPScalarIVPromotionRecipe>(&R)) {
+ auto ScalarTy =
+ VPTypeAnalysis(Plan).inferScalarType(ScalarIV->getOperand(1));
+ auto EVLTy = VPTypeAnalysis(Plan).inferScalarType(EVL);
+ auto CompEVL = VPBuilder(ScalarIV).createScalarZExtOrTrunc(
+ EVL, ScalarTy, EVLTy, ScalarIV->getDebugLoc());
+
+ auto Phi = VPBuilder(VPBB, VPBB->getFirstNonPhi())
+ .createScalarPhi({ScalarIV->getOperand(0)},
+ ScalarIV->getDebugLoc());
+
+ auto Mul = VPBuilder(ScalarIV).createNaryOp(
+ Instruction::Mul, {ScalarIV->getOperand(1), CompEVL});
+ auto Add =
+ VPBuilder(ScalarIV).createNaryOp(Instruction::Add, {Phi, Mul});
+
+ VPBuilder(ScalarIV).createNaryOp(Instruction::Store,
+ {Add, ScalarIV->getOperand(2)});
+
+ Phi->addOperand(Add);
+
+ ScalarIV->replaceAllUsesWith(Add);
+ RecipesToErase.push_back(ScalarIV);
+ }
+ }
+
+ for (auto &Recipe : RecipesToErase) {
+ Recipe->eraseFromParent();
+ }
}
void VPlanTransforms::replaceSymbolicStrides(
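
With EVL tail folding, canonicalizeEVLLoops lowers the recipe into plain recipes (scalar phi, zext/trunc of the EVL, mul, add, store), so the per-iteration increment becomes step * EVL instead of a fixed step * VF * UF. A hedged sketch of the scalar IR this corresponds to; the vector factor, the value names, and folding away the multiply by a step of 1 are assumptions made only for the example:

```
; Illustrative only: the increment is scaled by the explicit vector length
; of each iteration rather than by a fixed VF*UF.
define void @scalar_iv_promotion_evl_sketch(ptr %size_ptr, i64 %n) {
entry:
  %start = load i64, ptr %size_ptr, align 8
  br label %vector.body

vector.body:
  %avl = phi i64 [ %n, %entry ], [ %avl.next, %vector.body ]
  %size = phi i64 [ %start, %entry ], [ %size.next, %vector.body ]
  ; EVL for this iteration (VF = 4, scalable; chosen only for the example).
  %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %avl, i32 4, i1 true)
  %evl.zext = zext i32 %evl to i64
  ; step (1) * EVL added per iteration, then stored back to the pointer.
  %size.next = add i64 %size, %evl.zext
  store i64 %size.next, ptr %size_ptr, align 8
  %avl.next = sub i64 %avl, %evl.zext
  %ec = icmp eq i64 %avl.next, 0
  br i1 %ec, label %exit, label %vector.body

exit:
  ret void
}

declare i32 @llvm.experimental.get.vector.length.i64(i64, i32, i1)
```
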
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index d4b8b72beb942..c632afde5fc03 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -344,7 +344,8 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) {
auto *VPBB = cast<VPBasicBlock>(VPB);
auto InsertPtForPhi = VPBB->getFirstNonPhi();
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
- if (ToSkip.contains(&R) || isa<VPIRInstruction>(&R))
+ if (ToSkip.contains(&R) || isa<VPIRInstruction>(&R) ||
+ isa<VPScalarIVPromotionRecipe>(&R))
continue;
// Add all VPValues for all parts to AnyOf, FirstActiveLaneMask and
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index e22c5dfdb9f38..8228a7cbd434f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -42,7 +42,10 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) {
if (U && !isa<Instruction>(U->getValue()))
return Plan.getOrAddLiveIn(U->getValue());
auto *Expanded = new VPExpandSCEVRecipe(Expr);
- Plan.getEntry()->appendRecipe(Expanded);
+ auto Iterator = Plan.getEntry()->begin();
+ while (Iterator != Plan.getEntry()->end() && Iterator->isPhi())
+ ++Iterator;
+ Plan.getEntry()->insert(Expanded->getDefiningRecipe(), Iterator);
return Expanded;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 63eacd3d75721..9396d46860ccf 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -367,6 +367,7 @@ class VPDef {
VPWidenSelectSC,
VPBlendSC,
VPHistogramSC,
+ VPScalarIVPromotionRecipeSC,
// START: Phi-like recipes. Need to be kept together.
VPWidenPHISC,
VPPredInstPHISC,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 34754a1ea3992..6082e5f305898 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -157,6 +157,10 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
.Case<VPWidenIntrinsicRecipe>([&](const VPWidenIntrinsicRecipe *S) {
return VerifyEVLUse(*S, S->getNumOperands() - 1);
})
+ .Case<VPScalarIVPromotionRecipe>(
+ [&](const VPScalarIVPromotionRecipe *S) {
+ return VerifyEVLUse(*S, S->getNumOperands() - 1);
+ })
.Case<VPWidenStoreEVLRecipe, VPReductionEVLRecipe,
VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe>(
[&](const VPRecipeBase *S) { return VerifyEVLUse(*S, 2); })
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/invariant-dependence-before.ll b/llvm/test/Analysis/LoopAccessAnalysis/invariant-dependence-before.ll
index c0b044aef0d62..3d6aac7da5c0f 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/invariant-dependence-before.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/invariant-dependence-before.ll
@@ -25,7 +25,7 @@ loop:
store i32 %l, ptr %gep
%iv.next = add i32 %iv, 1
%ec = icmp eq i32 %iv.next, 100
- br i1 %ec, label %exit, label %loop
+ br i1 %ec, label %exit, label %loop, !llvm.loop !0
exit:
ret void
@@ -55,7 +55,7 @@ loop:
store i32 %l, ptr %a
%iv.next = add i32 %iv, 1
%ec = icmp eq i32 %iv.next, 100
- br i1 %ec, label %exit, label %loop
+ br i1 %ec, label %exit, label %loop, !llvm.loop !0
exit:
ret void
@@ -90,7 +90,7 @@ loop:
store i32 %l, ptr %gep
%iv.next = add i32 %iv, 1
%ec = icmp eq i32 %iv.next, 100
- br i1 %ec, label %exit, label %loop
+ br i1 %ec, label %exit, label %loop, !llvm.loop !0
exit:
ret void
@@ -125,7 +125,7 @@ loop:
store i32 %l, ptr %a
%iv.next = add i32 %iv, 1
%ec = icmp eq i32 %iv.next, 100
- br i1 %ec, label %exit, label %loop
+ br i1 %ec, label %exit, label %loop, !llvm.loop !0
exit:
ret void
@@ -156,7 +156,7 @@ loop:
store i8 %t, ptr %gep
%iv.next = add i32 %iv, 1
%ec = icmp eq i32 %iv.next, 100
- br i1 %ec, label %exit, label %loop
+ br i1 %ec, label %exit, label %loop, !llvm.loop !0
exit:
ret void
@@ -192,7 +192,7 @@ loop:
store i32 %t, ptr %gep
%iv.next = add i32 %iv, 1
%ec = icmp eq i32 %iv.next, 100
- br i1 %ec, label %exit, label %loop
+ br i1 %ec, label %exit, label %loop, !llvm.loop !0
exit:
ret void
@@ -222,7 +222,7 @@ loop:
store i32 %l, ptr %gep
%iv.next = sub i32 %iv, 1
%ec = icmp eq i32 %iv.next, -100
- br i1 %ec, label %exit, label %loop
+ br i1 %ec, label %exit, label %loop, !llvm.loop !0
exit:
ret void
@@ -252,7 +252,7 @@ loop:
store i32 %l, ptr %a
%iv.next = sub i32 %iv, 1
%ec = icmp eq i32 %iv.next, -100
- br i1 %ec, label %exit, label %loop
+ br i1 %ec, label %exit, label %loop, !llvm.loop !0
exit:
ret void
@@ -288,7 +288,7 @@ loop:
store i32 %l, ptr %gep
%iv.next = sub i32 %iv, 1
%ec = icmp eq i32 %iv.next, -100
- br i1 %ec, label %exit, label %loop
+ br i1 %ec, label %exit, label %loop, !llvm.loop !0
exit:
ret void
@@ -323,7 +323,7 @@ loop:
store i32 %l, ptr %a
%iv.next = sub i32 %iv, 1
%ec = icmp eq i32 %iv.next, -100
- br i1 %ec, label %exit, label %loop
+ br i1 %ec, label %exit, label %loop, !llvm.loop !0
exit:
ret void
@@ -352,7 +352,7 @@ loop:
store i32 %l, ptr %gep.off
%iv.next = add i32 %iv, 1
%ec = icmp eq i32 %iv.next, 100
- br i1 %ec, label %exit, label %loop
+ br i1 %ec, label %exit, label %loop, !llvm.loop !0
exit:
ret void
@@ -381,7 +381,7 @@ loop:
store i32 %l, ptr %a
%iv.next = add i32 %iv, 1
%ec = icmp eq i32 %iv.next, 100
- br i1 %ec, label %exit, label %loop
+ br i1 %ec, label %exit, label %loop, !llvm.loop !0
exit:
ret void
@@ -415,7 +415,7 @@ loop:
store i32 %l, ptr %gep.off
%iv.next = add i32 %iv, 1
%ec = icmp eq i32 %iv.next, 100
- br i1 %ec, label %exit, label %loop
+ br i1 %ec, label %exit, label %loop, !llvm.loop !0
exit:
ret void
@@ -449,7 +449,7 @@ loop:
store i32 %l, ptr %a
%iv.next = add i32 %iv, 1
%ec = icmp eq i32 %iv.next, 100
- br i1 %ec, label %exit, label %loop
+ br i1 %ec, label %exit, label %loop, !llvm.loop !0
exit:
ret void
@@ -485,7 +485,7 @@ loop:
store i32 %l, ptr %gep
%iv.next = add i32 %iv, 1
%ec = icmp eq i32 %iv.next, 100
- br i1 %ec, label %exit, label %loop
+ br i1 %ec, label %exit, label %loop, !llvm.loop !0
exit:
ret void
@@ -521,7 +521,7 @@ loop:
store i32 %l, ptr %a
%iv.next = add i32 %iv, 1
%ec = icmp eq i32 %iv.next, 100
- br i1 %ec, label %exit, label %loop
+ br i1 %ec, label %exit, label %loop, !llvm.loop !0
exit:
ret void
@@ -556,7 +556,7 @@ loop:
store i32 %l, ptr %gep
%iv.next = add i32 %iv, 1
%ec = icmp eq i32 %iv.next, 100
- br i1 %ec, label %exit, label %loop
+ br i1 %ec, label %exit, label %loop, !llvm.loop !0
exit:
ret void
@@ -592,7 +592,7 @@ loop:
store i32 %l, ptr %a
%iv.next = add i32 %iv, 1
%ec = icmp eq i32 %iv.next, 100
- br i1 %ec, label %exit, label %loop
+ br i1 %ec, label %exit, label %loop, !llvm.loop !0
exit:
ret void
@@ -623,7 +623,7 @@ loop:
store i32 %l, ptr %a
%iv.next = add i32 %iv, %off
%ec = icmp eq i32 %iv.next, 100
- br i1 %ec, label %exit, label %loop
+ br i1 %ec, label %exit, label %loop, !llvm.loop !0
exit:
ret void
@@ -654,7 +654,7 @@ loop:
store i32 %l, ptr %gep
%iv.next = add i32 %iv, %off
%ec = icmp eq i32 %iv.next, 100
- br i1 %ec, label %exit, label %loop
+ br i1 %ec, label %exit, label %loop, !llvm.loop !0
exit:
ret void
@@ -684,7 +684,7 @@ loop:
store i32 %l, ptr %gep
%iv.next = add i32 %iv, %off
%ec = icmp eq i32 %iv.next, 100
- br i1 %ec, label %exit, label %loop
+ br i1 %ec, label %exit, label %loop, !llvm.loop !0
exit:
ret void
@@ -714,7 +714,7 @@ loop:
store i32 %l, ptr %a
%iv.next = add i32 %iv, %off
%ec = icmp eq i32 %iv.next, 100
- br i1 %ec, label %exit, label %loop
+ br i1 %ec, label %exit, label %loop, !llvm.loop !0
exit:
ret void
@@ -744,7 +744,7 @@ loop:
store i32 0, ptr %gep
%iv.next = add i32 %iv, 1
%ec = icmp eq i32 %iv.next, 100
- br i1 %ec, label %exit, label %loop
+ br i1 %ec, label %exit, label %loop, !llvm.loop !0
exit:
ret void
@@ -779,7 +779,7 @@ loop:
store i32 0, ptr %gep
%iv.next = add i32 %iv, 1
%ec = icmp eq i32 %iv.next, 100
- br i1 %ec, label %exit, label %loop
+ br i1 %ec, label %exit, label %loop, !llvm.loop !0
exit:
ret void
@@ -814,8 +814,11 @@ loop:
%iv.2.next = add i32 %iv.2, 1
%iv.3.next = add i32 %iv.3, 1
%ec = icmp eq i32 %iv.3, 200
- br i1 %ec, label %exit, label %loop
+ br i1 %ec, label %exit, label %loop, !llvm.loop !0
exit:
ret void
}
+
+!0 = distinct !{!0, !4}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index cb4bd793013b1..fc72d16099fcb 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -1183,10 +1183,10 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 {
; DEFAULT-NEXT: [[B3:%.*]] = ptrtoint ptr [[B]] to i64
; DEFAULT-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64
; DEFAULT-NEXT: [[C1:%.*]] = ptrtoint ptr [[C]] to i64
-; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
; DEFAULT-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2
; DEFAULT-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP2]], i64 8)
+; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]]
; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; DEFAULT: [[VECTOR_MEMCHECK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index ab593f6f8bb6b..3b1f8101ff0c5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -2176,9 +2176,9 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0
; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; CHECK-MAXBW: for.body.preheader:
-; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
index 46ec858d7455c..3aa2ef89e7791 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
@@ -940,9 +940,9 @@ define i32 @add_of_zext_outside_loop(i32 %a, ptr noalias %b, i8 %c, i32 %d) #0 {
; CHECK-MAXBW-SAME: i32 [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR0]] {
; CHECK-MAXBW-NEXT: entry:
; CHECK-MAXBW-NEXT: [[CONV1:%.*]] = zext i8 [[C]] to i32
-; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = sub i32 1024, [[D]]
; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = shl nuw nsw i32 [[TMP1]], 4
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = sub i32 1024, [[D]]
; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], [[TMP2]]
; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
@@ -1061,9 +1061,9 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 {
; CHECK-MAXBW-LABEL: define i32 @add_of_loop_invariant_zext(
; CHECK-MAXBW-SAME: i32 [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR0]] {
; CHECK-MAXBW-NEXT: entry:
-; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = sub i32 1024, [[D]]
; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = shl nuw nsw i32 [[TMP1]], 4
+; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = sub i32 1024, [[D]]
; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], [[TMP2]]
; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-MAXBW: vector.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
index 44ae1757ce6e6..1a83f97a6aaa3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll
@@ -49,9 +49,9 @@ define i32 @chained_recurrences(i32 %x, i64 %y, ptr %src.1, i32 %z, ptr %src.2)
; VSCALEFORTUNING2-LABEL: define i32 @chained_recurrences(
; VSCALEFORTUNING2-SAME: i32 [[X:%.*]], i64 [[Y:%.*]], ptr [[SRC_1:%.*]], i32 [[Z:%.*]], ptr [[SRC_2:%.*]]) #[[ATTR0:[0-9]+]] {
; VSCALEFORTUNING2-NEXT: [[ENTRY:.*]]:
-; VSCALEFORTUNING2-NEXT: [[TMP0:%.*]] = add i64 [[Y]], 1
; VSCALEFORTUNING2-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; VSCALEFORTUNING2-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 3
+; VSCALEFORTUNING2-NEXT: [[TMP0:%.*]] = add i64 [[Y]], 1
; VSCALEFORTUNING2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; VSCALEFORTUNING2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; VSCALEFORTUNING2: [[VECTOR_PH]]:
@@ -287,9 +287,9 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 {
; DEFAULT-LABEL: define i16 @reduce_udiv(
; DEFAULT-SAME: ptr [[SRC:%.*]], i16 [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; DEFAULT-NEXT: [[ENTRY:.*]]:
-; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
; DEFAULT-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 3
+; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; DEFAULT: [[VECTOR_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
index e338b828d2520..151c0ad058849 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll
@@ -11,11 +11,11 @@ target triple = "aarch64-unknown-linux-gnu"
define void @test_no_scalarization(ptr %a, ptr noalias %b, i32 %idx, i32 %n) #0 {
; CHECK-LABEL: @test_no_scalarization(
; CHECK-NEXT: L.entry:
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 1
; CHECK-NEXT: [[TMP0:%.*]] = add nsw i32 [[IDX:%.*]], 1
; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N:%.*]], i32 [[TMP0]])
; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[SMAX]], [[IDX]]
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], [[TMP3]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index d84463430179d..e0eca8abdd650 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -440,11 +440,11 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
; CHECK-UNORDERED-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
; CHECK-UNORDERED-NEXT: [[A1:%.*]] = load float, ptr [[A]], align 4
; CHECK-UNORDERED-NEXT: [[A2:%.*]] = load float, ptr [[ARRAYIDXA]], align 4
+; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[N]], -2
; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1
-; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-UNORDERED: vector.ph:
@@ -508,11 +508,11 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
; CHECK-ORDERED-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
; CHECK-ORDERED-NEXT: [[A1:%.*]] = load float, ptr [[A]], align 4
; CHECK-ORDERED-NEXT: [[A2:%.*]] = load float, ptr [[ARRAYIDXA]], align 4
+; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[N]], -2
; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1
-; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-ORDERED: vector.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index e90f8d09fc7ab..bbf07dcb62269 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -494,11 +494,11 @@ for.body: ; preds = %for.body, %entry
define void @even_load_dynamic_tc(ptr noalias nocapture readonly %A, ptr noalias nocapture %B, i64 %N) #1 {
; CHECK-LABEL: @even_load_dynamic_tc(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 2)
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1
; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2
; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT_NOT:%.*]] = icmp samesign ult i64 [[TMP1]], [[TMP3]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT_NOT]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
@@ -789,9 +789,9 @@ for.body: ; preds = %for.body, %entry
define void @PR27626_0(ptr %p, i32 %z, i64 %n) #1 {
; CHECK-LABEL: @PR27626_0(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp samesign ugt i64 [[SMAX]], [[TMP1]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]]
; CHECK: vector.ph:
@@ -860,9 +860,9 @@ for.end:
define i32 @PR27626_1(ptr %p, i64 %n) #1 {
; CHECK-LABEL: @PR27626_1(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp samesign ugt i64 [[SMAX]], [[TMP1]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]]
; CHECK: vector.ph:
@@ -936,9 +936,9 @@ for.end:
define void @PR27626_2(ptr %p, i64 %n, i32 %z) #1 {
; CHECK-LABEL: @PR27626_2(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp samesign ugt i64 [[SMAX]], [[TMP1]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]]
; CHECK: vector.ph:
@@ -1008,9 +1008,9 @@ for.end:
define i32 @PR27626_3(ptr %p, i64 %n, i32 %z) #1 {
; CHECK-LABEL: @PR27626_3(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp samesign ugt i64 [[SMAX]], [[TMP1]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]]
; CHECK: vector.ph:
@@ -1090,12 +1090,12 @@ for.end:
define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
; CHECK-LABEL: @PR27626_4(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 2)
; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1
; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ult i64 [[TMP2]], [[TMP4]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
@@ -1166,12 +1166,12 @@ for.end:
define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
; CHECK-LABEL: @PR27626_5(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 5)
; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -4
; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ult i64 [[TMP2]], [[TMP4]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
@@ -1248,10 +1248,10 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) #1 {
; CHECK-LABEL: @PR34743(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[DOTPRE:%.*]] = load i16, ptr [[A:%.*]], align 2
-; CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[N:%.*]], 1
-; CHECK-NEXT: [[TMP1:%.*]] = add nuw i64 [[TMP0]], 1
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2
+; CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[N:%.*]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw i64 [[TMP0]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
index 9312306ce519a..438c1bcaa9e18 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-live-out-pointer-induction.ll
@@ -6,12 +6,12 @@ define ptr @test(ptr %start.1, ptr %start.2, ptr %end) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[START_22:%.*]] = ptrtoint ptr [[START_2:%.*]] to i64
; CHECK-NEXT: [[END1:%.*]] = ptrtoint ptr [[END:%.*]] to i64
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 2
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8
; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START_22]]
; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3
; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
-; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 2
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP5]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll
index ed49dc5a7573f..e2795ba1413cf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll
@@ -13,10 +13,10 @@ define void @multiple_exits_unique_exit_block(ptr %A, ptr %B, i32 %N) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[A2:%.*]] = ptrtoint ptr [[A:%.*]] to i64
; CHECK-NEXT: [[B1:%.*]] = ptrtoint ptr [[B:%.*]] to i64
-; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[N:%.*]], i32 999)
-; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i32 [[UMIN]], 1
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i32 [[TMP1]], 3
+; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[N:%.*]], i32 999)
+; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i32 [[UMIN]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP0]], [[TMP2]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
@@ -83,10 +83,10 @@ define i32 @multiple_exits_multiple_exit_blocks(ptr %A, ptr %B, i32 %N) #0 {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[A2:%.*]] = ptrtoint ptr [[A:%.*]] to i64
; CHECK-NEXT: [[B1:%.*]] = ptrtoint ptr [[B:%.*]] to i64
-; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[N:%.*]], i32 999)
-; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i32 [[UMIN]], 1
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i32 [[TMP1]], 3
+; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[N:%.*]], i32 999)
+; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i32 [[UMIN]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i32 [[TMP0]], [[TMP2]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll
index 4c7f70ad4d15e..bb65505e132ef 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-runtime-check-size-based-threshold.ll
@@ -12,10 +12,10 @@ define void @min_trip_count_due_to_runtime_checks_1(ptr %dst.1, ptr %dst.2, ptr
; CHECK-NEXT: [[SRC_13:%.*]] = ptrtoint ptr [[SRC_1:%.*]] to i64
; CHECK-NEXT: [[DST_12:%.*]] = ptrtoint ptr [[DST_1:%.*]] to i64
; CHECK-NEXT: [[DST_21:%.*]] = ptrtoint ptr [[DST_2:%.*]] to i64
-; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 20)
+; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], [[TMP2]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
index f223786a07cdf..19453f2985e00 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
@@ -98,9 +98,9 @@ exit: ; preds = %loop.body
define void @pointer_induction(ptr noalias %start, i64 %N) {
; CHECK-LABEL: @pointer_induction(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 1
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
index 3b2b0b5c33aa9..b619710cbab96 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
@@ -217,9 +217,9 @@ for.cond.cleanup: ; preds = %for.body
define i32 @pointer_iv_mixed(ptr noalias %a, ptr noalias %b, i64 %n) #0 {
; CHECK-LABEL: @pointer_iv_mixed(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ult i64 [[SMAX]], [[TMP1]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
index de70da6d2558b..e3f5daa029ce3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll
@@ -12,9 +12,9 @@ target triple = "aarch64-unknown-linux-gnu"
define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features" = "+sve" {
; NONE-LABEL: @simple_memset_tailfold(
; NONE-NEXT: entry:
-; NONE-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; NONE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; NONE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2
+; NONE-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; NONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], [[TMP1]]
; NONE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; NONE: vector.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll
index b63e03dccdc18..9f0cb02ab6b26 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll
@@ -173,11 +173,11 @@ define void @test_masked_interleave_group(i32 %N, ptr %mask, ptr %src, ptr %dst)
; CHECK-LABEL: define void @test_masked_interleave_group(
; CHECK-SAME: i32 [[N:%.*]], ptr [[MASK:%.*]], ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
-; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP3]], i64 8)
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[UMAX]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; CHECK: [[VECTOR_MEMCHECK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
index b5662b0bd8d3b..2705d972052c0 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll
@@ -630,11 +630,11 @@ define i32 @load_factor_4_reverse(i64 %n, ptr noalias %a) {
;
; NO-VP-LABEL: @load_factor_4_reverse(
; NO-VP-NEXT: entry:
+; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
; NO-VP-NEXT: [[TMP0:%.*]] = add nsw i64 [[N:%.*]], -1
; NO-VP-NEXT: [[SMIN:%.*]] = call i64 @llvm.smin.i64(i64 [[TMP0]], i64 0)
; NO-VP-NEXT: [[TMP1:%.*]] = sub i64 [[N]], [[SMIN]]
-; NO-VP-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; NO-VP-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
; NO-VP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]]
; NO-VP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; NO-VP: vector.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
index 63f9a1310d15a..7ef5cf0bf3bb0 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll
@@ -238,9 +238,24 @@ loopexit:
define void @uniform_rw(ptr align(4) %addr) {
; CHECK-LABEL: @uniform_rw(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ADDR:%.*]], align 4
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[TMP0]], [[FOR_BODY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2]] = add i32 [[TMP1]], 16
+; CHECK-NEXT: store i32 [[TMP2]], ptr [[ADDR]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
+; CHECK: middle.block:
+; CHECK-NEXT: br label [[SCALAR_PH:%.*]]
+; CHECK: scalar.ph:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 4096, [[SCALAR_PH]] ]
; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[ADDR:%.*]], align 4
; CHECK-NEXT: [[INC:%.*]] = add i32 [[LOAD]], 1
; CHECK-NEXT: store i32 [[INC]], ptr [[ADDR]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll
index 4068498dc68db..f377aa9db667e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorize-interleaved-accesses-gap.ll
@@ -7,65 +7,20 @@ target triple = "x86_64-apple-macosx13.0.0"
define void @test_pr59090(ptr %l_out, ptr noalias %b) #0 {
; CHECK-LABEL: @test_pr59090(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[B:%.*]], align 1, !llvm.access.group [[ACC_GRP0:![0-9]+]]
; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE14:%.*]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = phi i8 [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], splat (i64 10000)
; CHECK-NEXT: [[TMP2:%.*]] = mul nuw i64 [[INDEX]], 6
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[B:%.*]], align 1, !llvm.access.group [[ACC_GRP0:![0-9]+]]
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
-; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; CHECK: pred.store.if:
-; CHECK-NEXT: store i8 [[TMP3]], ptr [[B]], align 1, !llvm.access.group [[ACC_GRP0]]
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
-; CHECK: pred.store.continue:
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i1> [[TMP1]], i32 1
-; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
-; CHECK: pred.store.if1:
-; CHECK-NEXT: store i8 [[TMP3]], ptr [[B]], align 1, !llvm.access.group [[ACC_GRP0]]
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]]
-; CHECK: pred.store.continue2:
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP1]], i32 2
-; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
-; CHECK: pred.store.if3:
-; CHECK-NEXT: store i8 [[TMP3]], ptr [[B]], align 1, !llvm.access.group [[ACC_GRP0]]
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]]
-; CHECK: pred.store.continue4:
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i1> [[TMP1]], i32 3
-; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
-; CHECK: pred.store.if5:
-; CHECK-NEXT: store i8 [[TMP3]], ptr [[B]], align 1, !llvm.access.group [[ACC_GRP0]]
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
-; CHECK: pred.store.continue6:
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i1> [[TMP1]], i32 4
-; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
-; CHECK: pred.store.if7:
-; CHECK-NEXT: store i8 [[TMP3]], ptr [[B]], align 1, !llvm.access.group [[ACC_GRP0]]
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]]
-; CHECK: pred.store.continue8:
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i1> [[TMP1]], i32 5
-; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
-; CHECK: pred.store.if9:
-; CHECK-NEXT: store i8 [[TMP3]], ptr [[B]], align 1, !llvm.access.group [[ACC_GRP0]]
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]]
-; CHECK: pred.store.continue10:
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP1]], i32 6
-; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
-; CHECK: pred.store.if11:
-; CHECK-NEXT: store i8 [[TMP3]], ptr [[B]], align 1, !llvm.access.group [[ACC_GRP0]]
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]]
-; CHECK: pred.store.continue12:
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP1]], i32 7
-; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14]]
-; CHECK: pred.store.if13:
-; CHECK-NEXT: store i8 [[TMP3]], ptr [[B]], align 1, !llvm.access.group [[ACC_GRP0]]
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE14]]
-; CHECK: pred.store.continue14:
+; CHECK-NEXT: [[TMP4]] = add i8 [[TMP3]], 0
+; CHECK-NEXT: store i8 [[TMP4]], ptr [[B]], align 1
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[L_OUT:%.*]], i64 [[TMP2]]
; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <48 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
; CHECK-NEXT: [[TMP15:%.*]] = and <48 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false>
diff --git a/llvm/test/Transforms/LoopVectorize/memory-iv-promotion.ll b/llvm/test/Transforms/LoopVectorize/memory-iv-promotion.ll
index 192f2a55959e9..465682a1e0873 100644
--- a/llvm/test/Transforms/LoopVectorize/memory-iv-promotion.ll
+++ b/llvm/test/Transforms/LoopVectorize/memory-iv-promotion.ll
@@ -7,62 +7,28 @@
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
-; This test introduces a case where a memory induction variable
-; stops vectorization. This test will be updated when hoisting loads
-; for this memory induction variable and vectorization is possible.
define void @test_copy_loop(ptr %theFirst, ptr %theLast, ptr %dest_base, ptr %m_size_ptr) {
; AARCH64-LABEL: define void @test_copy_loop(
; AARCH64-SAME: ptr [[THEFIRST:%.*]], ptr [[THELAST:%.*]], ptr [[DEST_BASE:%.*]], ptr [[M_SIZE_PTR:%.*]]) {
-; AARCH64-NEXT: [[ENTRY:.*:]]
-; AARCH64-NEXT: [[TMP0:%.*]] = load i64, ptr [[M_SIZE_PTR]], align 8
-; AARCH64-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds nuw i16, ptr [[DEST_BASE]], i64 [[TMP0]]
-; AARCH64-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[THEFIRST]], [[THELAST]]
-; AARCH64-NEXT: br i1 [[CMP_NOT]], label %[[CLEANUP:.*]], label %[[WHILE_BODY_PREHEADER:.*]]
-; AARCH64: [[WHILE_BODY_PREHEADER]]:
-; AARCH64-NEXT: br label %[[WHILE_BODY:.*]]
-; AARCH64: [[WHILE_BODY]]:
-; AARCH64-NEXT: [[THEFIRST_ADDR_0112:%.*]] = phi ptr [ [[INCDEC_PTR9:%.*]], %[[WHILE_BODY]] ], [ [[THEFIRST]], %[[WHILE_BODY_PREHEADER]] ]
-; AARCH64-NEXT: [[THEPOINTER_0111:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], %[[WHILE_BODY]] ], [ [[ADD_PTR_I]], %[[WHILE_BODY_PREHEADER]] ]
-; AARCH64-NEXT: [[TMP1:%.*]] = load i16, ptr [[THEFIRST_ADDR_0112]], align 2
-; AARCH64-NEXT: store i16 [[TMP1]], ptr [[THEPOINTER_0111]], align 2
-; AARCH64-NEXT: [[INCDEC_PTR]] = getelementptr inbounds nuw i8, ptr [[THEPOINTER_0111]], i64 2
-; AARCH64-NEXT: [[TMP2:%.*]] = load i64, ptr [[M_SIZE_PTR]], align 8
-; AARCH64-NEXT: [[INC:%.*]] = add i64 [[TMP2]], 1
-; AARCH64-NEXT: store i64 [[INC]], ptr [[M_SIZE_PTR]], align 8
-; AARCH64-NEXT: [[INCDEC_PTR9]] = getelementptr inbounds nuw i8, ptr [[THEFIRST_ADDR_0112]], i64 2
-; AARCH64-NEXT: [[CMP7_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR9]], [[THELAST]]
-; AARCH64-NEXT: br i1 [[CMP7_NOT]], label %[[CLEANUP_LOOPEXIT:.*]], label %[[WHILE_BODY]]
-; AARCH64: [[CLEANUP_LOOPEXIT]]:
-; AARCH64-NEXT: br label %[[CLEANUP]]
-; AARCH64: [[CLEANUP]]:
-; AARCH64-NEXT: ret void
+; AARCH64: vector.body:
+; AARCH64: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; AARCH64: [[WIDE_LOAD0:%.*]] = load <8 x i16>, ptr
+; AARCH64: [[WIDE_LOAD1:%.*]] = load <8 x i16>, ptr
+; AARCH64: store <8 x i16> [[WIDE_LOAD0]], ptr
+; AARCH64: store <8 x i16> [[WIDE_LOAD1]], ptr
+; AARCH64: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; AARCH64: br i1 {{.*}}, label %middle.block, label %vector.body
;
; X86_64-LABEL: define void @test_copy_loop(
; X86_64-SAME: ptr [[THEFIRST:%.*]], ptr [[THELAST:%.*]], ptr [[DEST_BASE:%.*]], ptr [[M_SIZE_PTR:%.*]]) {
-; X86_64-NEXT: [[ENTRY:.*:]]
-; X86_64-NEXT: [[TMP0:%.*]] = load i64, ptr [[M_SIZE_PTR]], align 8
-; X86_64-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds nuw i16, ptr [[DEST_BASE]], i64 [[TMP0]]
-; X86_64-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[THEFIRST]], [[THELAST]]
-; X86_64-NEXT: br i1 [[CMP_NOT]], label %[[CLEANUP:.*]], label %[[WHILE_BODY_PREHEADER:.*]]
-; X86_64: [[WHILE_BODY_PREHEADER]]:
-; X86_64-NEXT: br label %[[WHILE_BODY:.*]]
-; X86_64: [[WHILE_BODY]]:
-; X86_64-NEXT: [[THEFIRST_ADDR_0112:%.*]] = phi ptr [ [[INCDEC_PTR9:%.*]], %[[WHILE_BODY]] ], [ [[THEFIRST]], %[[WHILE_BODY_PREHEADER]] ]
-; X86_64-NEXT: [[THEPOINTER_0111:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], %[[WHILE_BODY]] ], [ [[ADD_PTR_I]], %[[WHILE_BODY_PREHEADER]] ]
-; X86_64-NEXT: [[TMP1:%.*]] = load i16, ptr [[THEFIRST_ADDR_0112]], align 2
-; X86_64-NEXT: store i16 [[TMP1]], ptr [[THEPOINTER_0111]], align 2
-; X86_64-NEXT: [[INCDEC_PTR]] = getelementptr inbounds nuw i8, ptr [[THEPOINTER_0111]], i64 2
-; X86_64-NEXT: [[TMP2:%.*]] = load i64, ptr [[M_SIZE_PTR]], align 8
-; X86_64-NEXT: [[INC:%.*]] = add i64 [[TMP2]], 1
-; X86_64-NEXT: store i64 [[INC]], ptr [[M_SIZE_PTR]], align 8
-; X86_64-NEXT: [[INCDEC_PTR9]] = getelementptr inbounds nuw i8, ptr [[THEFIRST_ADDR_0112]], i64 2
-; X86_64-NEXT: [[CMP7_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR9]], [[THELAST]]
-; X86_64-NEXT: br i1 [[CMP7_NOT]], label %[[CLEANUP_LOOPEXIT:.*]], label %[[WHILE_BODY]]
-; X86_64: [[CLEANUP_LOOPEXIT]]:
-; X86_64-NEXT: br label %[[CLEANUP]]
-; X86_64: [[CLEANUP]]:
-; X86_64-NEXT: ret void
-;
+; X86_64: vector.body:
+; X86_64: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; X86_64: [[WIDE_LOAD0:%.*]] = load <2 x i16>, ptr
+; X86_64: [[WIDE_LOAD1:%.*]] = load <2 x i16>, ptr
+; X86_64: store <2 x i16> [[WIDE_LOAD0]], ptr
+; X86_64: store <2 x i16> [[WIDE_LOAD1]], ptr
+; X86_64: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; X86_64: br i1 {{.*}}, label %middle.block, label %vector.body
entry:
%0 = load i64, ptr %m_size_ptr, align 8
diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
index 5c62ca3ff3d01..1138cb7dcf4c9 100644
--- a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
@@ -438,8 +438,8 @@ define i64 @ivopt_widen_ptr_indvar_1(ptr noalias %a, i64 %stride, i64 %n) {
;
; STRIDED-LABEL: @ivopt_widen_ptr_indvar_1(
; STRIDED-NEXT: entry:
-; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
; STRIDED-NEXT: [[TMP1:%.*]] = shl i64 [[STRIDE:%.*]], 3
+; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
; STRIDED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
; STRIDED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; STRIDED: vector.ph:
@@ -523,8 +523,8 @@ define i64 @ivopt_widen_ptr_indvar_2(ptr noalias %a, i64 %stride, i64 %n) {
;
; STRIDED-LABEL: @ivopt_widen_ptr_indvar_2(
; STRIDED-NEXT: entry:
-; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
; STRIDED-NEXT: [[TMP1:%.*]] = shl i64 [[STRIDE:%.*]], 3
+; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
; STRIDED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
; STRIDED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; STRIDED: vector.ph:
@@ -630,8 +630,8 @@ define i64 @ivopt_widen_ptr_indvar_3(ptr noalias %a, i64 %stride, i64 %n) {
;
; STRIDED-LABEL: @ivopt_widen_ptr_indvar_3(
; STRIDED-NEXT: entry:
-; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
; STRIDED-NEXT: [[TMP1:%.*]] = shl i64 [[STRIDE:%.*]], 3
+; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
; STRIDED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
; STRIDED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; STRIDED: vector.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
index faca86a41b023..4445b0ea79209 100644
--- a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
+++ b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll
@@ -206,10 +206,15 @@ define void @expand_diff_scev_unknown(ptr %dst, i1 %invar.c, i32 %step) mustprog
; CHECK-NEXT: br i1 [[INVAR_C]], label %[[LOOP_2_PREHEADER:.*]], label %[[LOOP_1]]
; CHECK: [[LOOP_2_PREHEADER]]:
; CHECK-NEXT: [[IV_1_LCSSA:%.*]] = phi i32 [ [[IV_1]], %[[LOOP_1]] ]
+; CHECK-NEXT: [[TMP16:%.*]] = sub i32 2, [[STEP]]
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[IV_1_LCSSA]], [[TMP16]]
+; CHECK-NEXT: [[SMAX1:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP12]], i32 0)
+; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[INDVAR]], -1
+; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP3]], -1
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[SMAX1]], [[TMP14]]
; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[IV_1_LCSSA]], [[STEP]]
; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP1]], i32 0)
; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[STEP]], -2
-; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[INDVAR]], -1
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SMAX]], [[TMP4]]
; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 1)
@@ -218,11 +223,6 @@ define void @expand_diff_scev_unknown(ptr %dst, i1 %invar.c, i32 %step) mustprog
; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[STEP]], i32 1)
; CHECK-NEXT: [[TMP8:%.*]] = udiv i32 [[TMP7]], [[UMAX]]
; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP6]], [[TMP8]]
-; CHECK-NEXT: [[TMP16:%.*]] = sub i32 2, [[STEP]]
-; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[IV_1_LCSSA]], [[TMP16]]
-; CHECK-NEXT: [[SMAX1:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP12]], i32 0)
-; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP3]], -1
-; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[SMAX1]], [[TMP14]]
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP15]], 2
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
; CHECK: [[VECTOR_SCEVCHECK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
index 1216bc1dc33cc..17d54c931645e 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll
@@ -16,11 +16,11 @@ define i32 @recurrence_1(ptr nocapture readonly %a, ptr nocapture %b, i32 %n) {
; CHECK-VF4UF1-NEXT: br label %[[FOR_PREHEADER:.*]]
; CHECK-VF4UF1: [[FOR_PREHEADER]]:
; CHECK-VF4UF1-NEXT: [[PRE_LOAD:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-VF4UF1-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 2
; CHECK-VF4UF1-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
; CHECK-VF4UF1-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-VF4UF1-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-VF4UF1-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 2
; CHECK-VF4UF1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
; CHECK-VF4UF1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; CHECK-VF4UF1: [[VECTOR_MEMCHECK]]:
@@ -79,11 +79,11 @@ define i32 @recurrence_1(ptr nocapture readonly %a, ptr nocapture %b, i32 %n) {
; CHECK-VF4UF2-NEXT: br label %[[FOR_PREHEADER:.*]]
; CHECK-VF4UF2: [[FOR_PREHEADER]]:
; CHECK-VF4UF2-NEXT: [[PRE_LOAD:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-VF4UF2-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 3
; CHECK-VF4UF2-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
; CHECK-VF4UF2-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-VF4UF2-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-VF4UF2-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 3
; CHECK-VF4UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
; CHECK-VF4UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; CHECK-VF4UF2: [[VECTOR_MEMCHECK]]:
@@ -183,11 +183,11 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
; CHECK-VF4UF1: [[FOR_PREHEADER]]:
; CHECK-VF4UF1-NEXT: [[ARRAYIDX2_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 -1
; CHECK-VF4UF1-NEXT: [[DOTPRE:%.*]] = load i32, ptr [[ARRAYIDX2_PHI_TRANS_INSERT]], align 4
+; CHECK-VF4UF1-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF1-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2
; CHECK-VF4UF1-NEXT: [[TMP5:%.*]] = add i32 [[N]], -1
; CHECK-VF4UF1-NEXT: [[TMP6:%.*]] = zext i32 [[TMP5]] to i64
; CHECK-VF4UF1-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[TMP6]], 1
-; CHECK-VF4UF1-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2
; CHECK-VF4UF1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; CHECK-VF4UF1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK-VF4UF1: [[VECTOR_PH]]:
@@ -242,11 +242,11 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) {
; CHECK-VF4UF2: [[FOR_PREHEADER]]:
; CHECK-VF4UF2-NEXT: [[ARRAYIDX2_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 -1
; CHECK-VF4UF2-NEXT: [[DOTPRE:%.*]] = load i32, ptr [[ARRAYIDX2_PHI_TRANS_INSERT]], align 4
+; CHECK-VF4UF2-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-VF4UF2-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 3
; CHECK-VF4UF2-NEXT: [[TMP5:%.*]] = add i32 [[N]], -1
; CHECK-VF4UF2-NEXT: [[TMP6:%.*]] = zext i32 [[TMP5]] to i64
; CHECK-VF4UF2-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[TMP6]], 1
-; CHECK-VF4UF2-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 3
; CHECK-VF4UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; CHECK-VF4UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK-VF4UF2: [[VECTOR_PH]]:
@@ -352,10 +352,10 @@ define void @recurrence_3(ptr nocapture readonly %a, ptr nocapture %b, i32 %n, f
; CHECK-VF4UF1-NEXT: [[CMP25:%.*]] = icmp sgt i32 [[N]], 1
; CHECK-VF4UF1-NEXT: br i1 [[CMP25]], label %[[FOR_PREHEADER:.*]], [[FOR_END:label %.*]]
; CHECK-VF4UF1: [[FOR_PREHEADER]]:
-; CHECK-VF4UF1-NEXT: [[TMP1:%.*]] = add i32 [[N]], -1
-; CHECK-VF4UF1-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-VF4UF1-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF4UF1-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 2
+; CHECK-VF4UF1-NEXT: [[TMP1:%.*]] = add i32 [[N]], -1
+; CHECK-VF4UF1-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-VF4UF1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
; CHECK-VF4UF1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; CHECK-VF4UF1: [[VECTOR_MEMCHECK]]:
@@ -428,10 +428,10 @@ define void @recurrence_3(ptr nocapture readonly %a, ptr nocapture %b, i32 %n, f
; CHECK-VF4UF2-NEXT: [[CMP25:%.*]] = icmp sgt i32 [[N]], 1
; CHECK-VF4UF2-NEXT: br i1 [[CMP25]], label %[[FOR_PREHEADER:.*]], [[FOR_END:label %.*]]
; CHECK-VF4UF2: [[FOR_PREHEADER]]:
-; CHECK-VF4UF2-NEXT: [[TMP1:%.*]] = add i32 [[N]], -1
-; CHECK-VF4UF2-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-VF4UF2-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-VF4UF2-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 3
+; CHECK-VF4UF2-NEXT: [[TMP1:%.*]] = add i32 [[N]], -1
+; CHECK-VF4UF2-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-VF4UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
; CHECK-VF4UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; CHECK-VF4UF2: [[VECTOR_MEMCHECK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index 0ba7789ffba94..be0e74e99e698 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -443,10 +443,10 @@ define void @print_expand_scev(i64 %y, ptr %ptr) {
; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<entry>:
+; CHECK-NEXT: EMIT vp<[[EXP_SCEV:%.+]]> = EXPAND SCEV (1 + (%y /u 492802768830814060))<nuw><nsw>
+; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (1 + ((15 + (%y /u 492802768830814060))<nuw><nsw> /u (1 + (%y /u 492802768830814060))<nuw><nsw>))<nuw><nsw>
; CHECK-NEXT: IR %div = udiv i64 %y, 492802768830814060
; CHECK-NEXT: IR %inc = add i64 %div, 1
-; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (1 + ((15 + (%y /u 492802768830814060))<nuw><nsw> /u (1 + (%y /u 492802768830814060))<nuw><nsw>))<nuw><nsw>
-; CHECK-NEXT: EMIT vp<[[EXP_SCEV:%.+]]> = EXPAND SCEV (1 + (%y /u 492802768830814060))<nuw><nsw>
; CHECK-NEXT: Successor(s): scalar.ph, vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph: