[llvm] [LV] Compute register usage for interleaving on VPlan. (PR #126437)

Sun Feb 9 13:26:43 PST 2025

https://github.com/fhahn created https://github.com/llvm/llvm-project/pull/126437

Add a version of calculateRegisterUsage that works estimates register
usage for a VPlan. This mostly just ports the existing code, with some
updates to figure out what recipes will generate vectors vs scalars.

There are number of changes in the computed register usages, but they
should be more accurate w.r.t. to the generated vector code.

There are the following changes:

 * Scalar usage increases in most cases by 1, as we always create a
   scalar canonical IV, which is alive across the loop and is not
   considered by the legacy implementation

 * Output is ordered by insertion, now scalar registers are added first
   due the canonical IV phi.

 * Using the VPlan, we now also more precisely know if an induction will
   be vectorized or scalarized.

Depends on https://github.com/llvm/llvm-project/pull/126415

>From 0d9a09275bd7df0f975885ec5243875601598dc1 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Sun, 9 Feb 2025 10:02:09 +0000
Subject: [PATCH 1/2] [LV] Don't skip instrs with side-effects in reg pressure
 computation.

calculateRegisterUsage adds end points for each user of an instruction to
Ends and ignores instructions not added to it, i.e. instructions with no
users.

This means things like stores aren't included, which in turn means
values that are only used in stores are also not included for
consideration. This means we underestimate the register usage in cases
where the only users are things like stores.

Update the code to don't skip instructions without users (i.e. not in
Ends) if they have side-effects.
---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp        |  2 +-
 .../test/Transforms/LoopVectorize/AArch64/reg-usage.ll |  3 ++-
 .../Transforms/LoopVectorize/LoongArch/reg-usage.ll    |  3 ++-
 .../test/Transforms/LoopVectorize/PowerPC/reg-usage.ll |  4 ++--
 .../LoopVectorize/RISCV/riscv-vector-reverse.ll        | 10 ++++++----
 5 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index c2d347cf9b7e0c9..6cc24e371f965ae 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5262,7 +5262,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
       OpenIntervals.erase(ToRemove);
 
     // Ignore instructions that are never used within the loop.
-    if (!Ends.count(I))
+    if (!Ends.count(I) && !I->mayHaveSideEffects())
       continue;
 
     // Skip ignored values.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
index 8239d32445c1054..961ef64acd4a46c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
@@ -14,8 +14,9 @@
 define void @get_invariant_reg_usage(ptr %z) {
 ; CHECK: LV: Checking a loop in 'get_invariant_reg_usage'
 ; CHECK: LV(REG): VF = vscale x 16
-; CHECK-NEXT: LV(REG): Found max usage: 1 item
+; CHECK-NEXT: LV(REG): Found max usage: 2 item
 ; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 3 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 1 registers 
 ; CHECK-NEXT: LV(REG): Found invariant usage: 2 item
 ; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
 ; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 8 registers 
diff --git a/llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll
index f45a2f0f5b7e8f3..021ef0d543a18ef 100644
--- a/llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll
@@ -15,8 +15,9 @@ define void @bar(ptr %A, i32 signext %n) {
 ; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 1 registers
 ; CHECK-SCALAR-NEXT: LV: The target has 30 registers of LoongArch::GPRRC register class
 ; CHECK-SCALAR-NEXT: LV: The target has 32 registers of LoongArch::FPRRC register class
-; CHECK-VECTOR:      LV(REG): Found max usage: 1 item
+; CHECK-VECTOR:      LV(REG): Found max usage: 2 item
 ; CHECK-VECTOR-NEXT: LV(REG): RegisterClass: LoongArch::VRRC, 3 registers
+; CHECK-VECTOR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 1 registers
 ; CHECK-VECTOR-NEXT: LV(REG): Found invariant usage: 1 item
 ; CHECK-VECTOR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 1 registers
 ; CHECK-VECTOR-NEXT: LV: The target has 32 registers of LoongArch::VRRC register class
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll
index aac9aff3d391f5d..db4b580a3967718 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll
@@ -179,7 +179,7 @@ define void @double_(ptr nocapture %A, i32 %n) nounwind uwtable ssp {
 
 ;CHECK-PWR9: LV(REG): VF = 1
 ;CHECK-PWR9: LV(REG): Found max usage: 2 item
-;CHECK-PWR9-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers
+;CHECK-PWR9-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 3 registers
 ;CHECK-PWR9-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 5 registers
 ;CHECK-PWR9: LV(REG): Found invariant usage: 1 item
 ;CHECK-PWR9-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 1 registers
@@ -248,7 +248,7 @@ define void @fp16_(ptr nocapture readonly %pIn, ptr nocapture %pOut, i32 %numRow
 ;CHECK-LABEL: fp16_
 ;CHECK: LV(REG): VF = 1
 ;CHECK: LV(REG): Found max usage: 2 item
-;CHECK: LV(REG): RegisterClass: PPC::GPRRC, 4 registers
+;CHECK: LV(REG): RegisterClass: PPC::GPRRC, 5 registers
 ;CHECK: LV(REG): RegisterClass: PPC::VSXRC, 2 registers
 entry:
   %tmp.0.extract.trunc = trunc i32 %scale.coerce to i16
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index f630f4f21e065ff..4e563c242ee848c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -128,8 +128,9 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV(REG): At #5 Interval # 3
 ; CHECK-NEXT:  LV(REG): At #6 Interval # 3
 ; CHECK-NEXT:  LV(REG): At #7 Interval # 3
-; CHECK-NEXT:  LV(REG): At #9 Interval # 1
-; CHECK-NEXT:  LV(REG): At #10 Interval # 2
+; CHECK-NEXT:  LV(REG): At #8 Interval # 3
+; CHECK-NEXT:  LV(REG): At #9 Interval # 2
+; CHECK-NEXT:  LV(REG): At #10 Interval # 3
 ; CHECK-NEXT:  LV(REG): VF = vscale x 4
 ; CHECK-NEXT:  LV(REG): Found max usage: 2 item
 ; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
@@ -377,8 +378,9 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV(REG): At #5 Interval # 3
 ; CHECK-NEXT:  LV(REG): At #6 Interval # 3
 ; CHECK-NEXT:  LV(REG): At #7 Interval # 3
-; CHECK-NEXT:  LV(REG): At #9 Interval # 1
-; CHECK-NEXT:  LV(REG): At #10 Interval # 2
+; CHECK-NEXT:  LV(REG): At #8 Interval # 3
+; CHECK-NEXT:  LV(REG): At #9 Interval # 2
+; CHECK-NEXT:  LV(REG): At #10 Interval # 3
 ; CHECK-NEXT:  LV(REG): VF = vscale x 4
 ; CHECK-NEXT:  LV(REG): Found max usage: 2 item
 ; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers

>From 72e54093cfbb31bcdd84e49129a9fdd1e1f815f4 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 7 Feb 2025 20:47:09 +0000
Subject: [PATCH 2/2] [LV] Compute register usage for interleaving on VPlan.

Add a version of calculateRegisterUsage that works estimates register
usage for a VPlan. This mostly just ports the existing code, with some
updates to figure out what recipes will generate vectors vs scalars.

There are number of changes in the computed register usages, but they
should be more accurate w.r.t. to the generated vector code.

There are the following changes:

 * Scalar usage increases in most cases by 1, as we always create a
   scalar canonical IV, which is alive across the loop and is not
   considered by the legacy implementation

 * Output is ordered by insertion, now scalar registers are added first
   due the canonical IV phi.

 * Using the VPlan, we now also more precisely know if an induction will
   be vectorized or scalarized.
---
 .../Transforms/Vectorize/LoopVectorize.cpp    | 233 +++++++++++++++++-
 .../LoopVectorize/AArch64/i1-reg-usage.ll     |   4 +-
 .../LoopVectorize/AArch64/reg-usage.ll        |   7 +-
 .../LoopVectorize/LoongArch/reg-usage.ll      |   7 +-
 .../LoopVectorize/PowerPC/exit-branch-cost.ll |  60 ++++-
 .../LoopVectorize/PowerPC/large-loop-rdx.ll   |   4 -
 .../LoopVectorize/PowerPC/reg-usage.ll        |  14 +-
 .../LoopVectorize/RISCV/reg-usage-bf16.ll     |   2 +-
 .../LoopVectorize/RISCV/reg-usage-f16.ll      |   4 +-
 .../LoopVectorize/RISCV/reg-usage.ll          |  10 +-
 .../RISCV/riscv-vector-reverse.ll             |  14 +-
 .../LoopVectorize/X86/i1-reg-usage.ll         |   2 +-
 .../Transforms/LoopVectorize/X86/pr47437.ll   | 120 ++-------
 .../Transforms/LoopVectorize/X86/reg-usage.ll |   4 +-
 14 files changed, 338 insertions(+), 147 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6cc24e371f965ae..9727487a1dd1396 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1020,7 +1020,8 @@ class LoopVectorizationCostModel {
   /// If interleave count has been specified by metadata it will be returned.
   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
   /// are the selected vectorization factor and the cost of the selected VF.
-  unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
+  unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF,
+                                 InstructionCost LoopCost);
 
   /// Memory access instruction may be vectorized in more than one way.
   /// Form of instruction after vectorization depends on cost.
@@ -4878,8 +4879,232 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
   }
 }
 
+/// Estimate the register usage for \p Plan and vectorization factors in \p VFs.
+/// Returns the register usage for each VF in \p VFs.
+static SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
+calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
+                       const TargetTransformInfo &TTI) {
+  // This function calculates the register usage by measuring the highest number
+  // of values that are alive at a single location. Obviously, this is a very
+  // rough estimation. We scan the loop in a topological order in order and
+  // assign a number to each recipe. We use RPO to ensure that defs are
+  // met before their users. We assume that each recipe that has in-loop
+  // users starts an interval. We record every time that an in-loop value is
+  // used, so we have a list of the first and last occurrences of each
+  // recipe. Next, we transpose this data structure into a multi map that
+  // holds the list of intervals that *end* at a specific location. This multi
+  // map allows us to perform a linear search. We scan the instructions linearly
+  // and record each time that a new interval starts, by placing it in a set.
+  // If we find this value in the multi-map then we remove it from the set.
+  // The max register usage is the maximum size of the set.
+  // We also search for instructions that are defined outside the loop, but are
+  // used inside the loop. We need this number separately from the max-interval
+  // usage number because when we unroll, loop-invariant values do not take
+  // more register.
+  LoopVectorizationCostModel::RegisterUsage RU;
+
+  // Each 'key' in the map opens a new interval. The values
+  // of the map are the index of the 'last seen' usage of the
+  // recipe that is the key.
+  using IntervalMap = SmallDenseMap<VPRecipeBase *, unsigned, 16>;
+
+  // Maps recipe to its index.
+  SmallVector<VPRecipeBase *, 64> IdxToRecipe;
+  // Marks the end of each interval.
+  IntervalMap EndPoint;
+  // Saves the list of recipe indices that are used in the loop.
+  SmallPtrSet<VPRecipeBase *, 8> Ends;
+  // Saves the list of values that are used in the loop but are defined outside
+  // the loop (not including non-recipe values such as arguments and
+  // constants).
+  SmallSetVector<VPValue *, 8> LoopInvariants;
+  LoopInvariants.insert(&Plan.getVectorTripCount());
+
+  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
+      Plan.getVectorLoopRegion());
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+    if (!VPBB->getParent())
+      break;
+    for (VPRecipeBase &R : *VPBB) {
+      IdxToRecipe.push_back(&R);
+
+      // Save the end location of each USE.
+      for (VPValue *U : R.operands()) {
+        auto *DefR = U->getDefiningRecipe();
+
+        // Ignore non-recipe values such as arguments, constants, etc.
+        // FIXME: Might need some motivation why these values are ignored. If
+        // for example an argument is used inside the loop it will increase the
+        // register pressure (so shouldn't we add it to LoopInvariants).
+        if (!DefR && (!U->getLiveInIRValue() ||
+                      !isa<Instruction>(U->getLiveInIRValue())))
+          continue;
+
+        // If this recipe is outside the loop then record it and continue.
+        if (!DefR) {
+          LoopInvariants.insert(U);
+          continue;
+        }
+
+        // Overwrite previous end points.
+        EndPoint[DefR] = IdxToRecipe.size();
+        Ends.insert(DefR);
+      }
+    }
+    if (VPBB == Plan.getVectorLoopRegion()->getExiting()) {
+      // VPWidenIntOrFpInductionRecipes are used implicitly at the end of the
+      // exiting block, where their increment will get materialized eventually.
+      for (auto &R : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
+        if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
+          EndPoint[&R] = IdxToRecipe.size();
+          Ends.insert(&R);
+        }
+      }
+    }
+  }
+
+  // Saves the list of intervals that end with the index in 'key'.
+  using RecipeList = SmallVector<VPRecipeBase *, 2>;
+  SmallDenseMap<unsigned, RecipeList, 16> TransposeEnds;
+
+  // Transpose the EndPoints to a list of values that end at each index.
+  for (auto &Interval : EndPoint)
+    TransposeEnds[Interval.second].push_back(Interval.first);
+
+  SmallPtrSet<VPRecipeBase *, 8> OpenIntervals;
+  SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> RUs(VFs.size());
+  SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
+
+  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
+
+  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
+
+  const auto &TTICapture = TTI;
+  auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
+    if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) ||
+        (VF.isScalable() &&
+         !TTICapture.isElementTypeLegalForScalableVector(Ty)))
+      return 0;
+    return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
+  };
+
+  for (unsigned int Idx = 0, Sz = IdxToRecipe.size(); Idx < Sz; ++Idx) {
+    VPRecipeBase *R = IdxToRecipe[Idx];
+
+    //  Remove all of the recipes that end at this location.
+    RecipeList &List = TransposeEnds[Idx];
+    for (VPRecipeBase *ToRemove : List)
+      OpenIntervals.erase(ToRemove);
+
+    // Ignore recipes that are never used within the loop.
+    if (!Ends.count(R) && !R->mayHaveSideEffects())
+      continue;
+
+    // For each VF find the maximum usage of registers.
+    for (unsigned J = 0, E = VFs.size(); J < E; ++J) {
+      // Count the number of registers used, per register class, given all open
+      // intervals.
+      // Note that elements in this SmallMapVector will be default constructed
+      // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
+      // there is no previous entry for ClassID.
+      SmallMapVector<unsigned, unsigned, 4> RegUsage;
+
+      if (VFs[J].isScalar()) {
+        for (auto *Inst : OpenIntervals) {
+          for (VPValue *DefV : Inst->definedValues()) {
+            unsigned ClassID = TTI.getRegisterClassForType(
+                false, TypeInfo.inferScalarType(DefV));
+            // FIXME: The target might use more than one register for the type
+            // even in the scalar case.
+            RegUsage[ClassID] += 1;
+          }
+        }
+      } else {
+        for (auto *R : OpenIntervals) {
+          if (isa<VPVectorPointerRecipe, VPReverseVectorPointerRecipe>(R))
+            continue;
+          if (isa<VPCanonicalIVPHIRecipe, VPReplicateRecipe, VPDerivedIVRecipe,
+                  VPScalarIVStepsRecipe>(R) ||
+              (isa<VPInstruction>(R) &&
+               all_of(cast<VPSingleDefRecipe>(R)->users(), [&](VPUser *U) {
+                 return cast<VPRecipeBase>(U)->usesScalars(
+                     R->getVPSingleValue());
+               }))) {
+            unsigned ClassID = TTI.getRegisterClassForType(
+                false, TypeInfo.inferScalarType(R->getVPSingleValue()));
+            // FIXME: The target might use more than one register for the type
+            // even in the scalar case.
+            RegUsage[ClassID] += 1;
+          } else {
+            for (VPValue *DefV : R->definedValues()) {
+              Type *ScalarTy = TypeInfo.inferScalarType(DefV);
+              unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy);
+              RegUsage[ClassID] += GetRegUsage(ScalarTy, VFs[J]);
+            }
+          }
+        }
+      }
+
+      for (const auto &Pair : RegUsage) {
+        auto &Entry = MaxUsages[J][Pair.first];
+        Entry = std::max(Entry, Pair.second);
+      }
+    }
+
+    LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
+                      << OpenIntervals.size() << '\n');
+
+    // Add the current recipe to the list of open intervals.
+    OpenIntervals.insert(R);
+  }
+
+  for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) {
+    // Note that elements in this SmallMapVector will be default constructed
+    // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
+    // there is no previous entry for ClassID.
+    SmallMapVector<unsigned, unsigned, 4> Invariant;
+
+    for (auto *In : LoopInvariants) {
+      // FIXME: The target might use more than one register for the type
+      // even in the scalar case.
+      bool IsScalar = all_of(In->users(), [&](VPUser *U) {
+        return cast<VPRecipeBase>(U)->usesScalars(In);
+      });
+
+      ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
+      unsigned ClassID = TTI.getRegisterClassForType(
+          VF.isVector(), TypeInfo.inferScalarType(In));
+      Invariant[ClassID] += GetRegUsage(TypeInfo.inferScalarType(In), VF);
+    }
+
+    LLVM_DEBUG({
+      dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
+      dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
+             << " item\n";
+      for (const auto &pair : MaxUsages[Idx]) {
+        dbgs() << "LV(REG): RegisterClass: "
+               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
+               << " registers\n";
+      }
+      dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
+             << " item\n";
+      for (const auto &pair : Invariant) {
+        dbgs() << "LV(REG): RegisterClass: "
+               << TTI.getRegisterClassName(pair.first) << ", " << pair.second
+               << " registers\n";
+      }
+    });
+
+    RU.LoopInvariantRegs = Invariant;
+    RU.MaxLocalUsers = MaxUsages[Idx];
+    RUs[Idx] = RU;
+  }
+
+  return RUs;
+}
+
 unsigned
-LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
+LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
                                                   InstructionCost LoopCost) {
   // -- The interleave heuristics --
   // We interleave the loop in order to expose ILP and reduce the loop overhead.
@@ -4929,7 +5154,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
       return 1;
   }
 
-  RegisterUsage R = calculateRegisterUsage({VF})[0];
+  RegisterUsage R = ::calculateRegisterUsage(Plan, {VF}, TTI)[0];
   // We divide by these constants so assume that we have at least one
   // instruction that uses at least one register.
   for (auto &Pair : R.MaxLocalUsers) {
@@ -10574,7 +10799,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
                            AddBranchWeights, CM.CostKind);
   if (LVP.hasPlanWithVF(VF.Width)) {
     // Select the interleave count.
-    IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
+    IC = CM.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
 
     unsigned SelectedIC = std::max(IC, UserIC);
     //  Optimistically generate runtime checks if they are needed. Drop them if
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll
index 69af51deea08ea9..0ec90b75002cd0f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll
@@ -8,8 +8,8 @@ target triple = "aarch64"
 ; CHECK-LABEL: LV: Checking a loop in 'or_reduction_neon' from <stdin>
 ; CHECK: LV(REG): VF = 32
 ; CHECK-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
 ; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 72 registers
-; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
 
 define i1 @or_reduction_neon(i32 %arg, ptr %ptr) {
 entry:
@@ -31,8 +31,8 @@ loop:
 ; CHECK-LABEL: LV: Checking a loop in 'or_reduction_sve'
 ; CHECK: LV(REG): VF = 64
 ; CHECK-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
 ; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 136 registers
-; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
 
 define i1 @or_reduction_sve(i32 %arg, ptr %ptr) vscale_range(2,2) "target-features"="+sve" {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
index 961ef64acd4a46c..feb0f03caa5033c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
@@ -15,11 +15,10 @@ define void @get_invariant_reg_usage(ptr %z) {
 ; CHECK: LV: Checking a loop in 'get_invariant_reg_usage'
 ; CHECK: LV(REG): VF = vscale x 16
 ; CHECK-NEXT: LV(REG): Found max usage: 2 item
-; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 3 registers
-; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 1 registers 
-; CHECK-NEXT: LV(REG): Found invariant usage: 2 item
 ; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
-; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 8 registers 
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 1 registers
+; CHECK-NEXT: LV(REG): Found invariant usage: 1 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 3 registers
 
 L.entry:
   %0 = load i128, ptr %z, align 16
diff --git a/llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll
index 021ef0d543a18ef..5baf1e013a50f87 100644
--- a/llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll
@@ -9,17 +9,18 @@
 define void @bar(ptr %A, i32 signext %n) {
 ; CHECK-LABEL: bar
 ; CHECK-SCALAR:      LV(REG): Found max usage: 2 item
-; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 2 registers
+; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 3 registers
 ; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: LoongArch::FPRRC, 1 registers
 ; CHECK-SCALAR-NEXT: LV(REG): Found invariant usage: 1 item
 ; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 1 registers
 ; CHECK-SCALAR-NEXT: LV: The target has 30 registers of LoongArch::GPRRC register class
 ; CHECK-SCALAR-NEXT: LV: The target has 32 registers of LoongArch::FPRRC register class
 ; CHECK-VECTOR:      LV(REG): Found max usage: 2 item
-; CHECK-VECTOR-NEXT: LV(REG): RegisterClass: LoongArch::VRRC, 3 registers
-; CHECK-VECTOR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 1 registers
+; CHECK-VECTOR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 2 registers
+; CHECK-VECTOR-NEXT: LV(REG): RegisterClass: LoongArch::VRRC, 2 registers
 ; CHECK-VECTOR-NEXT: LV(REG): Found invariant usage: 1 item
 ; CHECK-VECTOR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 1 registers
+; CHECK-VECTOR-NEXT: LV: The target has 30 registers of LoongArch::GPRRC register class
 ; CHECK-VECTOR-NEXT: LV: The target has 32 registers of LoongArch::VRRC register class
 
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
index e5717c4f1d91ab8..cc1e5f9a68593cc 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
@@ -18,10 +18,10 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP2]], 2
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
 ; CHECK:       [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 16
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[TMP2]], 24
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK3]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 16
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 24
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
@@ -35,6 +35,10 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
 ; CHECK-NEXT:    [[VEC_PHI15:%.*]] = phi <2 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP48:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI16:%.*]] = phi <2 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP49:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI17:%.*]] = phi <2 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP50:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI18:%.*]] = phi <2 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP64:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI19:%.*]] = phi <2 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP65:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI20:%.*]] = phi <2 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP66:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI21:%.*]] = phi <2 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP67:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[STEP_ADD_2:%.*]] = add <2 x i64> [[STEP_ADD]], splat (i64 2)
 ; CHECK-NEXT:    [[STEP_ADD_3:%.*]] = add <2 x i64> [[STEP_ADD_2]], splat (i64 2)
@@ -42,6 +46,10 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
 ; CHECK-NEXT:    [[STEP_ADD_5:%.*]] = add <2 x i64> [[STEP_ADD_4]], splat (i64 2)
 ; CHECK-NEXT:    [[STEP_ADD_6:%.*]] = add <2 x i64> [[STEP_ADD_5]], splat (i64 2)
 ; CHECK-NEXT:    [[STEP_ADD_7:%.*]] = add <2 x i64> [[STEP_ADD_6]], splat (i64 2)
+; CHECK-NEXT:    [[STEP_ADD_8:%.*]] = add <2 x i64> [[STEP_ADD_7]], splat (i64 2)
+; CHECK-NEXT:    [[STEP_ADD_9:%.*]] = add <2 x i64> [[STEP_ADD_8]], splat (i64 2)
+; CHECK-NEXT:    [[STEP_ADD_10:%.*]] = add <2 x i64> [[STEP_ADD_9]], splat (i64 2)
+; CHECK-NEXT:    [[STEP_ADD_11:%.*]] = add <2 x i64> [[STEP_ADD_10]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
@@ -52,6 +60,10 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 10
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 12
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 14
+; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 16
+; CHECK-NEXT:    [[TMP69:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 18
+; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 20
+; CHECK-NEXT:    [[TMP71:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 22
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP11]], align 1
 ; CHECK-NEXT:    [[WIDE_LOAD25:%.*]] = load <2 x i8>, ptr [[TMP12]], align 1
 ; CHECK-NEXT:    [[WIDE_LOAD26:%.*]] = load <2 x i8>, ptr [[TMP13]], align 1
@@ -60,6 +72,10 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
 ; CHECK-NEXT:    [[WIDE_LOAD29:%.*]] = load <2 x i8>, ptr [[TMP16]], align 1
 ; CHECK-NEXT:    [[WIDE_LOAD30:%.*]] = load <2 x i8>, ptr [[TMP17]], align 1
 ; CHECK-NEXT:    [[WIDE_LOAD31:%.*]] = load <2 x i8>, ptr [[TMP18]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD22:%.*]] = load <2 x i8>, ptr [[TMP68]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD23:%.*]] = load <2 x i8>, ptr [[TMP69]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD24:%.*]] = load <2 x i8>, ptr [[TMP70]], align 1
+; CHECK-NEXT:    [[WIDE_LOAD33:%.*]] = load <2 x i8>, ptr [[TMP71]], align 1
 ; CHECK-NEXT:    [[TMP19:%.*]] = zext <2 x i8> [[WIDE_LOAD]] to <2 x i64>
 ; CHECK-NEXT:    [[TMP20:%.*]] = zext <2 x i8> [[WIDE_LOAD25]] to <2 x i64>
 ; CHECK-NEXT:    [[TMP21:%.*]] = zext <2 x i8> [[WIDE_LOAD26]] to <2 x i64>
@@ -68,6 +84,10 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
 ; CHECK-NEXT:    [[TMP24:%.*]] = zext <2 x i8> [[WIDE_LOAD29]] to <2 x i64>
 ; CHECK-NEXT:    [[TMP25:%.*]] = zext <2 x i8> [[WIDE_LOAD30]] to <2 x i64>
 ; CHECK-NEXT:    [[TMP26:%.*]] = zext <2 x i8> [[WIDE_LOAD31]] to <2 x i64>
+; CHECK-NEXT:    [[TMP72:%.*]] = zext <2 x i8> [[WIDE_LOAD22]] to <2 x i64>
+; CHECK-NEXT:    [[TMP73:%.*]] = zext <2 x i8> [[WIDE_LOAD23]] to <2 x i64>
+; CHECK-NEXT:    [[TMP74:%.*]] = zext <2 x i8> [[WIDE_LOAD24]] to <2 x i64>
+; CHECK-NEXT:    [[TMP75:%.*]] = zext <2 x i8> [[WIDE_LOAD33]] to <2 x i64>
 ; CHECK-NEXT:    [[TMP27:%.*]] = shl <2 x i64> [[VEC_IND]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP28:%.*]] = shl <2 x i64> [[STEP_ADD]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP29:%.*]] = shl <2 x i64> [[STEP_ADD_2]], splat (i64 1)
@@ -76,6 +96,10 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
 ; CHECK-NEXT:    [[TMP32:%.*]] = shl <2 x i64> [[STEP_ADD_5]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP33:%.*]] = shl <2 x i64> [[STEP_ADD_6]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP34:%.*]] = shl <2 x i64> [[STEP_ADD_7]], splat (i64 1)
+; CHECK-NEXT:    [[TMP76:%.*]] = shl <2 x i64> [[STEP_ADD_8]], splat (i64 1)
+; CHECK-NEXT:    [[TMP77:%.*]] = shl <2 x i64> [[STEP_ADD_9]], splat (i64 1)
+; CHECK-NEXT:    [[TMP78:%.*]] = shl <2 x i64> [[STEP_ADD_10]], splat (i64 1)
+; CHECK-NEXT:    [[TMP79:%.*]] = shl <2 x i64> [[STEP_ADD_11]], splat (i64 1)
 ; CHECK-NEXT:    [[TMP35:%.*]] = shl <2 x i64> [[TMP19]], [[TMP27]]
 ; CHECK-NEXT:    [[TMP36:%.*]] = shl <2 x i64> [[TMP20]], [[TMP28]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = shl <2 x i64> [[TMP21]], [[TMP29]]
@@ -84,6 +108,10 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
 ; CHECK-NEXT:    [[TMP40:%.*]] = shl <2 x i64> [[TMP24]], [[TMP32]]
 ; CHECK-NEXT:    [[TMP41:%.*]] = shl <2 x i64> [[TMP25]], [[TMP33]]
 ; CHECK-NEXT:    [[TMP42:%.*]] = shl <2 x i64> [[TMP26]], [[TMP34]]
+; CHECK-NEXT:    [[TMP80:%.*]] = shl <2 x i64> [[TMP72]], [[TMP76]]
+; CHECK-NEXT:    [[TMP81:%.*]] = shl <2 x i64> [[TMP73]], [[TMP77]]
+; CHECK-NEXT:    [[TMP82:%.*]] = shl <2 x i64> [[TMP74]], [[TMP78]]
+; CHECK-NEXT:    [[TMP83:%.*]] = shl <2 x i64> [[TMP75]], [[TMP79]]
 ; CHECK-NEXT:    [[TMP43]] = or <2 x i64> [[TMP35]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[TMP44]] = or <2 x i64> [[TMP36]], [[VEC_PHI11]]
 ; CHECK-NEXT:    [[TMP45]] = or <2 x i64> [[TMP37]], [[VEC_PHI12]]
@@ -92,8 +120,12 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
 ; CHECK-NEXT:    [[TMP48]] = or <2 x i64> [[TMP40]], [[VEC_PHI15]]
 ; CHECK-NEXT:    [[TMP49]] = or <2 x i64> [[TMP41]], [[VEC_PHI16]]
 ; CHECK-NEXT:    [[TMP50]] = or <2 x i64> [[TMP42]], [[VEC_PHI17]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD_7]], splat (i64 2)
+; CHECK-NEXT:    [[TMP64]] = or <2 x i64> [[TMP80]], [[VEC_PHI18]]
+; CHECK-NEXT:    [[TMP65]] = or <2 x i64> [[TMP81]], [[VEC_PHI19]]
+; CHECK-NEXT:    [[TMP66]] = or <2 x i64> [[TMP82]], [[VEC_PHI20]]
+; CHECK-NEXT:    [[TMP67]] = or <2 x i64> [[TMP83]], [[VEC_PHI21]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 24
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD_11]], splat (i64 2)
 ; CHECK-NEXT:    [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP51]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
@@ -103,7 +135,11 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
 ; CHECK-NEXT:    [[BIN_RDX20:%.*]] = or <2 x i64> [[TMP47]], [[BIN_RDX19]]
 ; CHECK-NEXT:    [[BIN_RDX21:%.*]] = or <2 x i64> [[TMP48]], [[BIN_RDX20]]
 ; CHECK-NEXT:    [[BIN_RDX22:%.*]] = or <2 x i64> [[TMP49]], [[BIN_RDX21]]
-; CHECK-NEXT:    [[BIN_RDX37:%.*]] = or <2 x i64> [[TMP50]], [[BIN_RDX22]]
+; CHECK-NEXT:    [[BIN_RDX31:%.*]] = or <2 x i64> [[TMP50]], [[BIN_RDX22]]
+; CHECK-NEXT:    [[BIN_RDX32:%.*]] = or <2 x i64> [[TMP64]], [[BIN_RDX31]]
+; CHECK-NEXT:    [[BIN_RDX33:%.*]] = or <2 x i64> [[TMP65]], [[BIN_RDX32]]
+; CHECK-NEXT:    [[BIN_RDX34:%.*]] = or <2 x i64> [[TMP66]], [[BIN_RDX33]]
+; CHECK-NEXT:    [[BIN_RDX37:%.*]] = or <2 x i64> [[TMP67]], [[BIN_RDX34]]
 ; CHECK-NEXT:    [[TMP52:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[BIN_RDX37]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
@@ -144,14 +180,14 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
 ; CHECK-NEXT:    [[CMP_N33:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC25]]
 ; CHECK-NEXT:    br i1 [[CMP_N33]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
 ; CHECK:       [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL34:%.*]] = phi i64 [ [[N_VEC25]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX35:%.*]] = phi i64 [ [[TMP55]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP52]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL36:%.*]] = phi ptr [ [[TMP56]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL46:%.*]] = phi i64 [ [[N_VEC25]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX47:%.*]] = phi i64 [ [[TMP55]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP52]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL48:%.*]] = phi ptr [ [[TMP56]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL34]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX35]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL36]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL46]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX47]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL48]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[TMP53:%.*]] = load i8, ptr [[PTR_IV]], align 1
 ; CHECK-NEXT:    [[CONV3:%.*]] = zext i8 [[TMP53]] to i64
 ; CHECK-NEXT:    [[MUL:%.*]] = shl i64 [[IV]], 1
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll
index ba073dc1590d165..fa16dfaa6a04653 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll
@@ -4,10 +4,6 @@
 ; CHECK-NEXT: fadd
 ; CHECK-NEXT: fadd
 ; CHECK-NEXT: fadd
-; CHECK-NEXT: fadd
-; CHECK-NEXT: fadd
-; CHECK-NEXT: fadd
-; CHECK-NEXT: fadd
 ; CHECK-NEXT: =
 ; CHECK-NOT: fadd
 ; CHECK-SAME: >
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll
index db4b580a3967718..280b3af04a4db3c 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll
@@ -132,7 +132,7 @@ define float @float_(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %
 ;CHECK-LABEL: float_
 ;CHECK: LV(REG): VF = 1
 ;CHECK: LV(REG): Found max usage: 2 item
-;CHECK-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers
+;CHECK-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 3 registers
 ;CHECK-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 3 registers
 ;CHECK: LV(REG): Found invariant usage: 1 item
 ;CHECK-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 1 registers
@@ -175,14 +175,14 @@ define void @double_(ptr nocapture %A, i32 %n) nounwind uwtable ssp {
 ;CHECK-PWR8-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers
 ;CHECK-PWR8-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 5 registers
 ;CHECK-PWR8: LV(REG): Found invariant usage: 1 item
-;CHECK-PWR8-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 1 registers
+;CHECK-PWR8-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers
 
 ;CHECK-PWR9: LV(REG): VF = 1
 ;CHECK-PWR9: LV(REG): Found max usage: 2 item
-;CHECK-PWR9-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 3 registers
+;CHECK-PWR9-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers
 ;CHECK-PWR9-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 5 registers
 ;CHECK-PWR9: LV(REG): Found invariant usage: 1 item
-;CHECK-PWR9-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 1 registers
+;CHECK-PWR9-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers
 
   %1 = sext i32 %n to i64
   br label %2
@@ -248,8 +248,12 @@ define void @fp16_(ptr nocapture readonly %pIn, ptr nocapture %pOut, i32 %numRow
 ;CHECK-LABEL: fp16_
 ;CHECK: LV(REG): VF = 1
 ;CHECK: LV(REG): Found max usage: 2 item
-;CHECK: LV(REG): RegisterClass: PPC::GPRRC, 5 registers
+;CHECK: LV(REG): RegisterClass: PPC::GPRRC, 3 registers
 ;CHECK: LV(REG): RegisterClass: PPC::VSXRC, 2 registers
+;CHECK: LV(REG): Found invariant usage: 2 item
+;CHECK: LV(REG): RegisterClass: PPC::GPRRC, 1 registers
+;CHECK: LV(REG): RegisterClass: PPC::VSXRC, 1 registers
+
 entry:
   %tmp.0.extract.trunc = trunc i32 %scale.coerce to i16
   %0 = bitcast i16 %tmp.0.extract.trunc to half
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll
index 40d6e8bc334712c..4e3077cfcab6736 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-bf16.ll
@@ -4,7 +4,7 @@
 define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i32 signext %size, ptr noalias nocapture writeonly %result) {
 ; CHECK-LABEL: add
 ; CHECK:       LV(REG): Found max usage: 2 item
-; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
+; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
 ; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 4 registers
 ; CHECK-NEXT:  LV(REG): Found invariant usage: 1 item
 ; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll
index e07c7b6b40729cb..8825065aa5fe844 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-f16.ll
@@ -5,12 +5,12 @@
 define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i32 signext %size, ptr noalias nocapture writeonly %result) {
 ; CHECK-LABEL: add
 ; ZVFH:       LV(REG): Found max usage: 2 item
-; ZVFH-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
+; ZVFH-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
 ; ZVFH-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 2 registers
 ; ZVFH-NEXT:  LV(REG): Found invariant usage: 1 item
 ; ZVFH-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
 ; ZVFHMIN:       LV(REG): Found max usage: 2 item
-; ZVFHMIN-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
+; ZVFHMIN-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
 ; ZVFHMIN-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 4 registers
 ; ZVFHMIN-NEXT:  LV(REG): Found invariant usage: 1 item
 ; ZVFHMIN-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll
index a2f55e49c9e0e41..9585d0d6d6cfde1 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage.ll
@@ -23,27 +23,27 @@
 define void @add(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i32 signext %size, ptr noalias nocapture writeonly %result) {
 ; CHECK-LABEL: add
 ; CHECK-SCALAR:      LV(REG): Found max usage: 2 item
-; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
+; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
 ; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: RISCV::FPRRC, 2 registers
 ; CHECK-SCALAR-NEXT: LV(REG): Found invariant usage: 1 item
 ; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
 ; CHECK-LMUL1:       LV(REG): Found max usage: 2 item
-; CHECK-LMUL1-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
+; CHECK-LMUL1-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
 ; CHECK-LMUL1-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 2 registers
 ; CHECK-LMUL1-NEXT:  LV(REG): Found invariant usage: 1 item
 ; CHECK-LMUL1-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
 ; CHECK-LMUL2:       LV(REG): Found max usage: 2 item
-; CHECK-LMUL2-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
+; CHECK-LMUL2-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
 ; CHECK-LMUL2-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 4 registers
 ; CHECK-LMUL2-NEXT:  LV(REG): Found invariant usage: 1 item
 ; CHECK-LMUL2-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
 ; CHECK-LMUL4:       LV(REG): Found max usage: 2 item
-; CHECK-LMUL4-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
+; CHECK-LMUL4-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
 ; CHECK-LMUL4-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 8 registers
 ; CHECK-LMUL4-NEXT:  LV(REG): Found invariant usage: 1 item
 ; CHECK-LMUL4-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
 ; CHECK-LMUL8:       LV(REG): Found max usage: 2 item
-; CHECK-LMUL8-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 2 registers
+; CHECK-LMUL8-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
 ; CHECK-LMUL8-NEXT:  LV(REG): RegisterClass: RISCV::VRRC, 16 registers
 ; CHECK-LMUL8-NEXT:  LV(REG): Found invariant usage: 1 item
 ; CHECK-LMUL8-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 4e563c242ee848c..c2eae15b9d200ff 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -125,12 +125,15 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV(REG): At #2 Interval # 2
 ; CHECK-NEXT:  LV(REG): At #3 Interval # 2
 ; CHECK-NEXT:  LV(REG): At #4 Interval # 2
-; CHECK-NEXT:  LV(REG): At #5 Interval # 3
+; CHECK-NEXT:  LV(REG): At #5 Interval # 2
 ; CHECK-NEXT:  LV(REG): At #6 Interval # 3
 ; CHECK-NEXT:  LV(REG): At #7 Interval # 3
 ; CHECK-NEXT:  LV(REG): At #8 Interval # 3
-; CHECK-NEXT:  LV(REG): At #9 Interval # 2
+; CHECK-NEXT:  LV(REG): At #9 Interval # 3
 ; CHECK-NEXT:  LV(REG): At #10 Interval # 3
+; CHECK-NEXT:  LV(REG): At #11 Interval # 3
+; CHECK-NEXT:  LV(REG): At #12 Interval # 2
+; CHECK-NEXT:  LV(REG): At #13 Interval # 2
 ; CHECK-NEXT:  LV(REG): VF = vscale x 4
 ; CHECK-NEXT:  LV(REG): Found max usage: 2 item
 ; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
@@ -375,12 +378,15 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV(REG): At #2 Interval # 2
 ; CHECK-NEXT:  LV(REG): At #3 Interval # 2
 ; CHECK-NEXT:  LV(REG): At #4 Interval # 2
-; CHECK-NEXT:  LV(REG): At #5 Interval # 3
+; CHECK-NEXT:  LV(REG): At #5 Interval # 2
 ; CHECK-NEXT:  LV(REG): At #6 Interval # 3
 ; CHECK-NEXT:  LV(REG): At #7 Interval # 3
 ; CHECK-NEXT:  LV(REG): At #8 Interval # 3
-; CHECK-NEXT:  LV(REG): At #9 Interval # 2
+; CHECK-NEXT:  LV(REG): At #9 Interval # 3
 ; CHECK-NEXT:  LV(REG): At #10 Interval # 3
+; CHECK-NEXT:  LV(REG): At #11 Interval # 3
+; CHECK-NEXT:  LV(REG): At #12 Interval # 2
+; CHECK-NEXT:  LV(REG): At #13 Interval # 2
 ; CHECK-NEXT:  LV(REG): VF = vscale x 4
 ; CHECK-NEXT:  LV(REG): Found max usage: 2 item
 ; CHECK-NEXT:  LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
diff --git a/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll b/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll
index ce0fc350246e456..3445d4ceff5ecfe 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/i1-reg-usage.ll
@@ -8,8 +8,8 @@ target triple = "x86_64"
 ; CHECK-LABEL: LV: Checking a loop in 'or_reduction_avx' from <stdin>
 ; CHECK: LV(REG): VF = 64
 ; CHECK-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
 ; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 136 registers
-; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
 
 define i1 @or_reduction_avx(i32 %arg, ptr %ptr) "target-features"="+avx" {
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
index 7b1c7ae94ff41a2..a1b93aaa258d534 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
@@ -173,144 +173,66 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon
 ; AVX1-NEXT:  entry:
 ; AVX1-NEXT:    [[CMP30:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; AVX1-NEXT:    br i1 [[CMP30]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; AVX1:       iter.check:
+; AVX1:       for.body.preheader:
 ; AVX1-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; AVX1-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
-; AVX1-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; AVX1:       vector.main.loop.iter.check:
-; AVX1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
+; AVX1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
 ; AVX1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; AVX1:       vector.ph:
-; AVX1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
+; AVX1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
 ; AVX1-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
 ; AVX1-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; AVX1:       vector.body:
 ; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; AVX1-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; AVX1-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; AVX1-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 8
-; AVX1-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 12
-; AVX1-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP0]], 1
-; AVX1-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP1]], 1
-; AVX1-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP2]], 1
-; AVX1-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP3]], 1
-; AVX1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[S1:%.*]], i64 [[TMP4]]
-; AVX1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP5]]
-; AVX1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP6]]
+; AVX1-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP0]], 1
+; AVX1-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[TMP1]], 1
+; AVX1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[S1:%.*]], i64 [[TMP6]]
 ; AVX1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP7]]
-; AVX1-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i16>, ptr [[TMP8]], align 2
-; AVX1-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; AVX1-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <8 x i16> [[WIDE_VEC]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; AVX1-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i16>, ptr [[TMP9]], align 2
-; AVX1-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; AVX1-NEXT:    [[STRIDED_VEC8:%.*]] = shufflevector <8 x i16> [[WIDE_VEC1]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; AVX1-NEXT:    [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP10]], align 2
 ; AVX1-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; AVX1-NEXT:    [[STRIDED_VEC9:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; AVX1-NEXT:    [[WIDE_VEC3:%.*]] = load <8 x i16>, ptr [[TMP11]], align 2
 ; AVX1-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; AVX1-NEXT:    [[STRIDED_VEC10:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; AVX1-NEXT:    [[TMP16:%.*]] = sext <4 x i16> [[STRIDED_VEC]] to <4 x i32>
-; AVX1-NEXT:    [[TMP17:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32>
-; AVX1-NEXT:    [[TMP18:%.*]] = sext <4 x i16> [[STRIDED_VEC5]] to <4 x i32>
-; AVX1-NEXT:    [[TMP19:%.*]] = sext <4 x i16> [[STRIDED_VEC6]] to <4 x i32>
-; AVX1-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP4]]
-; AVX1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP5]]
-; AVX1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP6]]
+; AVX1-NEXT:    [[TMP36:%.*]] = sext <4 x i16> [[STRIDED_VEC5]] to <4 x i32>
+; AVX1-NEXT:    [[TMP37:%.*]] = sext <4 x i16> [[STRIDED_VEC6]] to <4 x i32>
+; AVX1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP6]]
 ; AVX1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP7]]
-; AVX1-NEXT:    [[WIDE_VEC11:%.*]] = load <8 x i16>, ptr [[TMP20]], align 2
-; AVX1-NEXT:    [[STRIDED_VEC15:%.*]] = shufflevector <8 x i16> [[WIDE_VEC11]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; AVX1-NEXT:    [[STRIDED_VEC19:%.*]] = shufflevector <8 x i16> [[WIDE_VEC11]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; AVX1-NEXT:    [[WIDE_VEC12:%.*]] = load <8 x i16>, ptr [[TMP21]], align 2
-; AVX1-NEXT:    [[STRIDED_VEC16:%.*]] = shufflevector <8 x i16> [[WIDE_VEC12]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; AVX1-NEXT:    [[STRIDED_VEC20:%.*]] = shufflevector <8 x i16> [[WIDE_VEC12]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; AVX1-NEXT:    [[WIDE_VEC13:%.*]] = load <8 x i16>, ptr [[TMP22]], align 2
 ; AVX1-NEXT:    [[STRIDED_VEC17:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; AVX1-NEXT:    [[STRIDED_VEC21:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; AVX1-NEXT:    [[WIDE_VEC14:%.*]] = load <8 x i16>, ptr [[TMP23]], align 2
 ; AVX1-NEXT:    [[STRIDED_VEC18:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; AVX1-NEXT:    [[STRIDED_VEC22:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; AVX1-NEXT:    [[TMP28:%.*]] = sext <4 x i16> [[STRIDED_VEC15]] to <4 x i32>
-; AVX1-NEXT:    [[TMP29:%.*]] = sext <4 x i16> [[STRIDED_VEC16]] to <4 x i32>
-; AVX1-NEXT:    [[TMP30:%.*]] = sext <4 x i16> [[STRIDED_VEC17]] to <4 x i32>
-; AVX1-NEXT:    [[TMP31:%.*]] = sext <4 x i16> [[STRIDED_VEC18]] to <4 x i32>
-; AVX1-NEXT:    [[TMP32:%.*]] = mul nsw <4 x i32> [[TMP28]], [[TMP16]]
-; AVX1-NEXT:    [[TMP33:%.*]] = mul nsw <4 x i32> [[TMP29]], [[TMP17]]
-; AVX1-NEXT:    [[TMP34:%.*]] = mul nsw <4 x i32> [[TMP30]], [[TMP18]]
-; AVX1-NEXT:    [[TMP35:%.*]] = mul nsw <4 x i32> [[TMP31]], [[TMP19]]
-; AVX1-NEXT:    [[TMP36:%.*]] = sext <4 x i16> [[STRIDED_VEC7]] to <4 x i32>
-; AVX1-NEXT:    [[TMP37:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32>
+; AVX1-NEXT:    [[TMP40:%.*]] = sext <4 x i16> [[STRIDED_VEC17]] to <4 x i32>
+; AVX1-NEXT:    [[TMP41:%.*]] = sext <4 x i16> [[STRIDED_VEC18]] to <4 x i32>
+; AVX1-NEXT:    [[TMP44:%.*]] = mul nsw <4 x i32> [[TMP40]], [[TMP36]]
+; AVX1-NEXT:    [[TMP45:%.*]] = mul nsw <4 x i32> [[TMP41]], [[TMP37]]
 ; AVX1-NEXT:    [[TMP38:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32>
 ; AVX1-NEXT:    [[TMP39:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32>
-; AVX1-NEXT:    [[TMP40:%.*]] = sext <4 x i16> [[STRIDED_VEC19]] to <4 x i32>
-; AVX1-NEXT:    [[TMP41:%.*]] = sext <4 x i16> [[STRIDED_VEC20]] to <4 x i32>
 ; AVX1-NEXT:    [[TMP42:%.*]] = sext <4 x i16> [[STRIDED_VEC21]] to <4 x i32>
 ; AVX1-NEXT:    [[TMP43:%.*]] = sext <4 x i16> [[STRIDED_VEC22]] to <4 x i32>
-; AVX1-NEXT:    [[TMP44:%.*]] = mul nsw <4 x i32> [[TMP40]], [[TMP36]]
-; AVX1-NEXT:    [[TMP45:%.*]] = mul nsw <4 x i32> [[TMP41]], [[TMP37]]
 ; AVX1-NEXT:    [[TMP46:%.*]] = mul nsw <4 x i32> [[TMP42]], [[TMP38]]
 ; AVX1-NEXT:    [[TMP47:%.*]] = mul nsw <4 x i32> [[TMP43]], [[TMP39]]
-; AVX1-NEXT:    [[TMP48:%.*]] = add nsw <4 x i32> [[TMP44]], [[TMP32]]
-; AVX1-NEXT:    [[TMP49:%.*]] = add nsw <4 x i32> [[TMP45]], [[TMP33]]
-; AVX1-NEXT:    [[TMP50:%.*]] = add nsw <4 x i32> [[TMP46]], [[TMP34]]
-; AVX1-NEXT:    [[TMP51:%.*]] = add nsw <4 x i32> [[TMP47]], [[TMP35]]
+; AVX1-NEXT:    [[TMP48:%.*]] = add nsw <4 x i32> [[TMP46]], [[TMP44]]
+; AVX1-NEXT:    [[TMP49:%.*]] = add nsw <4 x i32> [[TMP47]], [[TMP45]]
 ; AVX1-NEXT:    [[TMP52:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[TMP0]]
 ; AVX1-NEXT:    [[TMP56:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 0
 ; AVX1-NEXT:    [[TMP57:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 4
-; AVX1-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 8
-; AVX1-NEXT:    [[TMP59:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 12
 ; AVX1-NEXT:    store <4 x i32> [[TMP48]], ptr [[TMP56]], align 4
 ; AVX1-NEXT:    store <4 x i32> [[TMP49]], ptr [[TMP57]], align 4
-; AVX1-NEXT:    store <4 x i32> [[TMP50]], ptr [[TMP58]], align 4
-; AVX1-NEXT:    store <4 x i32> [[TMP51]], ptr [[TMP59]], align 4
-; AVX1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; AVX1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; AVX1-NEXT:    [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; AVX1-NEXT:    br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; AVX1:       middle.block:
 ; AVX1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; AVX1-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; AVX1:       vec.epilog.iter.check:
-; AVX1-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; AVX1-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
-; AVX1-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]]
-; AVX1:       vec.epilog.ph:
-; AVX1-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; AVX1-NEXT:    [[N_MOD_VF24:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
-; AVX1-NEXT:    [[N_VEC25:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF24]]
-; AVX1-NEXT:    br label [[FOR_BODY:%.*]]
-; AVX1:       vec.epilog.vector.body:
-; AVX1-NEXT:    [[INDEX26:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDEX_NEXT33:%.*]], [[FOR_BODY]] ]
-; AVX1-NEXT:    [[TMP67:%.*]] = add i64 [[INDEX26]], 0
-; AVX1-NEXT:    [[TMP68:%.*]] = shl nuw nsw i64 [[TMP67]], 1
-; AVX1-NEXT:    [[TMP69:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP68]]
-; AVX1-NEXT:    [[WIDE_VEC27:%.*]] = load <8 x i16>, ptr [[TMP69]], align 2
-; AVX1-NEXT:    [[STRIDED_VEC28:%.*]] = shufflevector <8 x i16> [[WIDE_VEC27]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; AVX1-NEXT:    [[STRIDED_VEC29:%.*]] = shufflevector <8 x i16> [[WIDE_VEC27]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; AVX1-NEXT:    [[TMP53:%.*]] = sext <4 x i16> [[STRIDED_VEC28]] to <4 x i32>
-; AVX1-NEXT:    [[TMP54:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP68]]
-; AVX1-NEXT:    [[WIDE_VEC30:%.*]] = load <8 x i16>, ptr [[TMP54]], align 2
-; AVX1-NEXT:    [[STRIDED_VEC31:%.*]] = shufflevector <8 x i16> [[WIDE_VEC30]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; AVX1-NEXT:    [[STRIDED_VEC32:%.*]] = shufflevector <8 x i16> [[WIDE_VEC30]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; AVX1-NEXT:    [[TMP55:%.*]] = sext <4 x i16> [[STRIDED_VEC31]] to <4 x i32>
-; AVX1-NEXT:    [[TMP70:%.*]] = mul nsw <4 x i32> [[TMP55]], [[TMP53]]
-; AVX1-NEXT:    [[TMP71:%.*]] = sext <4 x i16> [[STRIDED_VEC29]] to <4 x i32>
-; AVX1-NEXT:    [[TMP72:%.*]] = sext <4 x i16> [[STRIDED_VEC32]] to <4 x i32>
-; AVX1-NEXT:    [[TMP73:%.*]] = mul nsw <4 x i32> [[TMP72]], [[TMP71]]
-; AVX1-NEXT:    [[TMP74:%.*]] = add nsw <4 x i32> [[TMP73]], [[TMP70]]
-; AVX1-NEXT:    [[TMP75:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[TMP67]]
-; AVX1-NEXT:    [[TMP76:%.*]] = getelementptr inbounds i32, ptr [[TMP75]], i32 0
-; AVX1-NEXT:    store <4 x i32> [[TMP74]], ptr [[TMP76]], align 4
-; AVX1-NEXT:    [[INDEX_NEXT33]] = add nuw i64 [[INDEX26]], 4
-; AVX1-NEXT:    [[TMP77:%.*]] = icmp eq i64 [[INDEX_NEXT33]], [[N_VEC25]]
-; AVX1-NEXT:    br i1 [[TMP77]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; AVX1:       vec.epilog.middle.block:
-; AVX1-NEXT:    [[CMP_N34:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC25]]
-; AVX1-NEXT:    br i1 [[CMP_N34]], label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
-; AVX1:       vec.epilog.scalar.ph:
-; AVX1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC25]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; AVX1-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; AVX1:       scalar.ph:
+; AVX1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; AVX1-NEXT:    br label [[FOR_BODY1:%.*]]
 ; AVX1:       for.body:
-; AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY1]] ]
+; AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY1]] ]
 ; AVX1-NEXT:    [[TMP61:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
 ; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP61]]
 ; AVX1-NEXT:    [[TMP62:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
@@ -332,7 +254,7 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon
 ; AVX1-NEXT:    store i32 [[ADD18]], ptr [[ARRAYIDX20]], align 4
 ; AVX1-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; AVX1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; AVX1-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP4:![0-9]+]]
+; AVX1-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP3:![0-9]+]]
 ; AVX1:       for.end.loopexit:
 ; AVX1-NEXT:    br label [[FOR_END]]
 ; AVX1:       for.end:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll
index f6191cc53c9718b..784b030bf3ab39e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/reg-usage.ll
@@ -94,9 +94,11 @@ define i64 @bar(ptr nocapture %a) {
 ; CHECK-LABEL: bar
 ; CHECK:       LV(REG): VF = 2
 ; CHECK-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
 ; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 3 registers
+; CHECK-NEXT: LV(REG): Found invariant usage: 1 item
 ; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
-; CHECK-NEXT: LV(REG): Found invariant usage: 0 item
+
 
 entry:
   br label %for.body