[llvm] [VPlan] Track VPValues instead of VPRecipes in calculateRegisterUsage. (PR #155301)

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 12 08:53:05 PDT 2025


https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/155301

>From b2422e73955a6b9a4bbb5a3de7a6ec5391161fb6 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Mon, 25 Aug 2025 22:22:04 +0100
Subject: [PATCH 1/2] [VPlan] Track VPValues instead of VPRecipes in
 calculateRegisterUsage.

Update calculateRegisterUsageForPlan to track the liveness of VPValues
instead of recipes. This gives slightly more accurate results for
recipes that define multiple values (e.g. VPInterleaveRecipe).

When tracking the liveness of recipes, all VPValues defined by a
VPInterleaveRecipe are considered alive until the last use of any of
them. When tracking the liveness of individual VPValues, each value can
be tracked accurately until its own last use.

Note the changes in large-loop-rdx.ll and pr47437.ll: this patch
restores the behavior from before VPlan-based liveness tracking was
introduced.
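
To see why per-value tracking is tighter, here is a minimal standalone
sketch (a hypothetical schedule with plain STL types, not the VPlan data
structures; the values A/B/C and their indices are made up for
illustration):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // One live interval per tracked entity, [Start, End] inclusive, in
    // schedule order. The toy schedule spans indices 0..8.
    struct Interval {
      int Start;
      int End;
    };

    // Maximum number of intervals open at any point of the schedule.
    static int maxOverlap(const std::vector<Interval> &Intervals) {
      int Best = 0;
      for (int Idx = 0; Idx <= 8; ++Idx) {
        int Open = 0;
        for (const Interval &I : Intervals)
          Open += (I.Start <= Idx && Idx <= I.End) ? 1 : 0;
        Best = std::max(Best, Open);
      }
      return Best;
    }

    int main() {
      // An interleave-like recipe at index 0 defines values A and B; A's
      // last use is at index 1, B's at index 6. Another recipe defines C
      // at index 2, last used at index 5.
      std::vector<Interval> PerValue = {{0, 1}, {0, 6}, {2, 5}};
      // Per-recipe tracking keeps both A and B open until index 6, the
      // last use of *any* value defined by their recipe.
      std::vector<Interval> PerRecipe = {{0, 6}, {0, 6}, {2, 5}};

      std::printf("per-recipe max pressure: %d\n", maxOverlap(PerRecipe)); // 3
      std::printf("per-value  max pressure: %d\n", maxOverlap(PerValue));  // 2
      return 0;
    }

With per-recipe end points the estimate peaks at 3 simultaneously live
values; per-value end points let A's interval close at its actual last
use, giving a peak of 2. This is the kind of effect behind the 48 -> 47
register change in the AArch64 reg-usage test below.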
---
 .../Transforms/Vectorize/VPlanAnalysis.cpp    |  74 ++---
 .../LoopVectorize/AArch64/reg-usage.ll        |   2 +-
 .../LoopVectorize/PowerPC/large-loop-rdx.ll   | 262 ++++++++++++++----
 .../Transforms/LoopVectorize/X86/pr47437.ll   | 108 ++++++--
 4 files changed, 338 insertions(+), 108 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index d400ceff7797c..03ca5adf738de 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -10,6 +10,7 @@
 #include "VPlan.h"
 #include "VPlanCFG.h"
 #include "VPlanDominatorTree.h"
+#include "VPlanHelpers.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Analysis/ScalarEvolution.h"
@@ -396,7 +397,7 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A,
 
 /// Get the VF scaling factor applied to the recipe's output, if the recipe has
 /// one.
-static unsigned getVFScaleFactor(VPRecipeBase *R) {
+static unsigned getVFScaleFactor(VPValue *R) {
   if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
     return RR->getVFScaleFactor();
   if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
@@ -422,15 +423,15 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
     const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
   // Each 'key' in the map opens a new interval. The values
   // of the map are the index of the 'last seen' usage of the
-  // recipe that is the key.
-  using IntervalMap = SmallDenseMap<VPRecipeBase *, unsigned, 16>;
+  // VPValue that is the key.
+  using IntervalMap = SmallDenseMap<VPValue *, unsigned, 16>;
 
   // Maps indices to recipes.
   SmallVector<VPRecipeBase *, 64> Idx2Recipe;
   // Marks the end of each interval.
   IntervalMap EndPoint;
-  // Saves the list of recipe indices that are used in the loop.
-  SmallPtrSet<VPRecipeBase *, 8> Ends;
+  // Saves the list of VPValues that are used in the loop.
+  SmallPtrSet<VPValue *, 8> Ends;
   // Saves the list of values that are used in the loop but are defined outside
   // the loop (not including non-recipe values such as arguments and
   // constants).
@@ -441,7 +442,7 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
   // each recipe. We use RPO to ensure that defs are met before their users. We
   // assume that each recipe that has in-loop users starts an interval. We
   // record every time that an in-loop value is used, so we have a list of the
-  // first and last occurrences of each recipe.
+  // first occurrences of each recipe and the last occurrence of each VPValue.
   VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
   ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
       LoopRegion);
@@ -470,32 +471,32 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
         }
 
         // Overwrite previous end points.
-        EndPoint[DefR] = Idx2Recipe.size();
-        Ends.insert(DefR);
+        EndPoint[U] = Idx2Recipe.size();
+        Ends.insert(U);
       }
     }
     if (VPBB == LoopRegion->getExiting()) {
       // VPWidenIntOrFpInductionRecipes are used implicitly at the end of the
       // exiting block, where their increment will get materialized eventually.
       for (auto &R : LoopRegion->getEntryBasicBlock()->phis()) {
-        if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
-          EndPoint[&R] = Idx2Recipe.size();
-          Ends.insert(&R);
+        if (auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
+          EndPoint[WideIV] = Idx2Recipe.size();
+          Ends.insert(WideIV);
         }
       }
     }
   }
 
   // Saves the list of intervals that end with the index in 'key'.
-  using RecipeList = SmallVector<VPRecipeBase *, 2>;
-  SmallDenseMap<unsigned, RecipeList, 16> TransposeEnds;
+  using VPValueList = SmallVector<VPValue *, 2>;
+  SmallDenseMap<unsigned, VPValueList, 16> TransposeEnds;
 
   // Next, we transpose the EndPoints into a multi map that holds the list of
   // intervals that *end* at a specific location.
   for (auto &Interval : EndPoint)
     TransposeEnds[Interval.second].push_back(Interval.first);
 
-  SmallPtrSet<VPRecipeBase *, 8> OpenIntervals;
+  SmallPtrSet<VPValue *, 8> OpenIntervals;
   SmallVector<VPRegisterUsage, 8> RUs(VFs.size());
   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
 
@@ -519,14 +520,16 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
   for (unsigned int Idx = 0, Sz = Idx2Recipe.size(); Idx < Sz; ++Idx) {
     VPRecipeBase *R = Idx2Recipe[Idx];
 
-    // Remove all of the recipes that end at this location.
-    RecipeList &List = TransposeEnds[Idx];
-    for (VPRecipeBase *ToRemove : List)
+    // Remove all of the VPValues that end at this location.
+    VPValueList &List = TransposeEnds[Idx];
+    for (VPValue *ToRemove : List)
       OpenIntervals.erase(ToRemove);
 
     // Ignore recipes that are never used within the loop and do not have side
     // effects.
-    if (!Ends.count(R) && !R->mayHaveSideEffects())
+    if (all_of(R->definedValues(),
+               [&Ends](VPValue *Def) { return !Ends.count(Def); }) &&
+        !R->mayHaveSideEffects())
       continue;
 
     // Skip recipes for ignored values.
@@ -546,41 +549,38 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
       // there is no previous entry for ClassID.
       SmallMapVector<unsigned, unsigned, 4> RegUsage;
 
-      for (auto *R : OpenIntervals) {
-        // Skip recipes that weren't present in the original loop.
+      for (auto *VPV : OpenIntervals) {
+        // Skip values that weren't present in the original loop.
         // TODO: Remove after removing the legacy
         // LoopVectorizationCostModel::calculateRegisterUsage
         if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe,
-                VPBranchOnMaskRecipe>(R))
+                VPBranchOnMaskRecipe>(VPV))
           continue;
 
         if (VFs[J].isScalar() ||
             isa<VPCanonicalIVPHIRecipe, VPReplicateRecipe, VPDerivedIVRecipe,
-                VPEVLBasedIVPHIRecipe, VPScalarIVStepsRecipe>(R) ||
-            (isa<VPInstruction>(R) &&
-             vputils::onlyScalarValuesUsed(cast<VPSingleDefRecipe>(R))) ||
-            (isa<VPReductionPHIRecipe>(R) &&
-             (cast<VPReductionPHIRecipe>(R))->isInLoop())) {
-          unsigned ClassID = TTI.getRegisterClassForType(
-              false, TypeInfo.inferScalarType(R->getVPSingleValue()));
+                VPEVLBasedIVPHIRecipe, VPScalarIVStepsRecipe>(VPV) ||
+            (isa<VPInstruction>(VPV) && vputils::onlyScalarValuesUsed(VPV)) ||
+            (isa<VPReductionPHIRecipe>(VPV) &&
+             (cast<VPReductionPHIRecipe>(VPV))->isInLoop())) {
+          unsigned ClassID =
+              TTI.getRegisterClassForType(false, TypeInfo.inferScalarType(VPV));
           // FIXME: The target might use more than one register for the type
           // even in the scalar case.
           RegUsage[ClassID] += 1;
         } else {
           // The output from scaled phis and scaled reductions actually has
           // fewer lanes than the VF.
-          unsigned ScaleFactor = getVFScaleFactor(R);
+          unsigned ScaleFactor = getVFScaleFactor(VPV);
           ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor);
           LLVM_DEBUG(if (VF != VFs[J]) {
             dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF
                    << " for " << *R << "\n";
           });
 
-          for (VPValue *DefV : R->definedValues()) {
-            Type *ScalarTy = TypeInfo.inferScalarType(DefV);
-            unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy);
-            RegUsage[ClassID] += GetRegUsage(ScalarTy, VF);
-          }
+          Type *ScalarTy = TypeInfo.inferScalarType(VPV);
+          unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy);
+          RegUsage[ClassID] += GetRegUsage(ScalarTy, VF);
         }
       }
 
@@ -593,8 +593,10 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
     LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
                       << OpenIntervals.size() << '\n');
 
-    // Add the current recipe to the list of open intervals.
-    OpenIntervals.insert(R);
+    // Add the VPValues defined by the current recipe to the list of open
+    // intervals.
+    for (VPValue *DefV : R->definedValues())
+      OpenIntervals.insert(DefV);
   }
 
   // We also search for instructions that are defined outside the loop, but are
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
index 01d103264fafe..c61361bb3df76 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/reg-usage.ll
@@ -74,7 +74,7 @@ define dso_local void @dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, i32
 ; CHECK:       LV(REG): VF = 16
 ; CHECK-NEXT:  LV(REG): Found max usage: 2 item
 ; CHECK-NEXT:  LV(REG): RegisterClass: Generic::ScalarRC, 3 registers
-; CHECK-NEXT:  LV(REG): RegisterClass: Generic::VectorRC, 48 registers
+; CHECK-NEXT:  LV(REG): RegisterClass: Generic::VectorRC, 47 registers
 ; CHECK-NEXT:  LV(REG): Found invariant usage: 1 item
 entry:
   %cmp100 = icmp sgt i32 %n, 0
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll
index 0b23206134bc0..43cce8005bbf6 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll
@@ -10,28 +10,43 @@ define void @QLA_F3_r_veq_norm2_V(ptr noalias %r, ptr noalias %a, i32 %n) {
 ; CHECK-SAME: ptr noalias [[R:%.*]], ptr noalias [[A:%.*]], i32 [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[CMP24:%.*]] = icmp sgt i32 [[N]], 0
-; CHECK-NEXT:    br i1 [[CMP24]], label %[[FOR_COND1_PREHEADER_PREHEADER:.*]], label %[[FOR_END13:.*]]
-; CHECK:       [[FOR_COND1_PREHEADER_PREHEADER]]:
+; CHECK-NEXT:    br i1 [[CMP24]], label %[[ITER_CHECK:.*]], label %[[FOR_END13:.*]]
+; CHECK:       [[ITER_CHECK]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; CHECK:       [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP69:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP65:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP66:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP67:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP65:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP66:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI4:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP131:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP25:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP132:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP26:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP133:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP27:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP134:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP28:%.*]] = phi <2 x double> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP135:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 6
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 10
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 12
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[INDEX]], 14
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDEX]], i64 0, i32 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP1]], i64 0, i32 0
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP2]], i64 0, i32 0
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP3]], i64 0, i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP4]], i64 0, i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP5]], i64 0, i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP6]], i64 0, i32 0
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[TMP7]], i64 0, i32 0
 ; CHECK-NEXT:    [[WIDE_VEC35:%.*]] = load <12 x float>, ptr [[TMP13]], align 8
 ; CHECK-NEXT:    [[STRIDED_VEC36:%.*]] = shufflevector <12 x float> [[WIDE_VEC35]], <12 x float> poison, <2 x i32> <i32 0, i32 6>
 ; CHECK-NEXT:    [[STRIDED_VEC37:%.*]] = shufflevector <12 x float> [[WIDE_VEC35]], <12 x float> poison, <2 x i32> <i32 1, i32 7>
@@ -60,116 +75,252 @@ define void @QLA_F3_r_veq_norm2_V(ptr noalias %r, ptr noalias %a, i32 %n) {
 ; CHECK-NEXT:    [[STRIDED_VEC60:%.*]] = shufflevector <12 x float> [[WIDE_VEC56]], <12 x float> poison, <2 x i32> <i32 3, i32 9>
 ; CHECK-NEXT:    [[STRIDED_VEC61:%.*]] = shufflevector <12 x float> [[WIDE_VEC56]], <12 x float> poison, <2 x i32> <i32 4, i32 10>
 ; CHECK-NEXT:    [[STRIDED_VEC62:%.*]] = shufflevector <12 x float> [[WIDE_VEC56]], <12 x float> poison, <2 x i32> <i32 5, i32 11>
+; CHECK-NEXT:    [[WIDE_VEC36:%.*]] = load <12 x float>, ptr [[TMP12]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC42:%.*]] = shufflevector <12 x float> [[WIDE_VEC36]], <12 x float> poison, <2 x i32> <i32 0, i32 6>
+; CHECK-NEXT:    [[STRIDED_VEC49:%.*]] = shufflevector <12 x float> [[WIDE_VEC36]], <12 x float> poison, <2 x i32> <i32 1, i32 7>
+; CHECK-NEXT:    [[STRIDED_VEC56:%.*]] = shufflevector <12 x float> [[WIDE_VEC36]], <12 x float> poison, <2 x i32> <i32 2, i32 8>
+; CHECK-NEXT:    [[STRIDED_VEC63:%.*]] = shufflevector <12 x float> [[WIDE_VEC36]], <12 x float> poison, <2 x i32> <i32 3, i32 9>
+; CHECK-NEXT:    [[STRIDED_VEC64:%.*]] = shufflevector <12 x float> [[WIDE_VEC36]], <12 x float> poison, <2 x i32> <i32 4, i32 10>
+; CHECK-NEXT:    [[STRIDED_VEC65:%.*]] = shufflevector <12 x float> [[WIDE_VEC36]], <12 x float> poison, <2 x i32> <i32 5, i32 11>
+; CHECK-NEXT:    [[WIDE_VEC43:%.*]] = load <12 x float>, ptr [[TMP17]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC66:%.*]] = shufflevector <12 x float> [[WIDE_VEC43]], <12 x float> poison, <2 x i32> <i32 0, i32 6>
+; CHECK-NEXT:    [[STRIDED_VEC67:%.*]] = shufflevector <12 x float> [[WIDE_VEC43]], <12 x float> poison, <2 x i32> <i32 1, i32 7>
+; CHECK-NEXT:    [[STRIDED_VEC68:%.*]] = shufflevector <12 x float> [[WIDE_VEC43]], <12 x float> poison, <2 x i32> <i32 2, i32 8>
+; CHECK-NEXT:    [[STRIDED_VEC69:%.*]] = shufflevector <12 x float> [[WIDE_VEC43]], <12 x float> poison, <2 x i32> <i32 3, i32 9>
+; CHECK-NEXT:    [[STRIDED_VEC70:%.*]] = shufflevector <12 x float> [[WIDE_VEC43]], <12 x float> poison, <2 x i32> <i32 4, i32 10>
+; CHECK-NEXT:    [[STRIDED_VEC71:%.*]] = shufflevector <12 x float> [[WIDE_VEC43]], <12 x float> poison, <2 x i32> <i32 5, i32 11>
+; CHECK-NEXT:    [[WIDE_VEC50:%.*]] = load <12 x float>, ptr [[TMP18]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC72:%.*]] = shufflevector <12 x float> [[WIDE_VEC50]], <12 x float> poison, <2 x i32> <i32 0, i32 6>
+; CHECK-NEXT:    [[STRIDED_VEC73:%.*]] = shufflevector <12 x float> [[WIDE_VEC50]], <12 x float> poison, <2 x i32> <i32 1, i32 7>
+; CHECK-NEXT:    [[STRIDED_VEC80:%.*]] = shufflevector <12 x float> [[WIDE_VEC50]], <12 x float> poison, <2 x i32> <i32 2, i32 8>
+; CHECK-NEXT:    [[STRIDED_VEC81:%.*]] = shufflevector <12 x float> [[WIDE_VEC50]], <12 x float> poison, <2 x i32> <i32 3, i32 9>
+; CHECK-NEXT:    [[STRIDED_VEC82:%.*]] = shufflevector <12 x float> [[WIDE_VEC50]], <12 x float> poison, <2 x i32> <i32 4, i32 10>
+; CHECK-NEXT:    [[STRIDED_VEC83:%.*]] = shufflevector <12 x float> [[WIDE_VEC50]], <12 x float> poison, <2 x i32> <i32 5, i32 11>
+; CHECK-NEXT:    [[WIDE_VEC57:%.*]] = load <12 x float>, ptr [[TMP19]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC84:%.*]] = shufflevector <12 x float> [[WIDE_VEC57]], <12 x float> poison, <2 x i32> <i32 0, i32 6>
+; CHECK-NEXT:    [[STRIDED_VEC85:%.*]] = shufflevector <12 x float> [[WIDE_VEC57]], <12 x float> poison, <2 x i32> <i32 1, i32 7>
+; CHECK-NEXT:    [[STRIDED_VEC86:%.*]] = shufflevector <12 x float> [[WIDE_VEC57]], <12 x float> poison, <2 x i32> <i32 2, i32 8>
+; CHECK-NEXT:    [[STRIDED_VEC87:%.*]] = shufflevector <12 x float> [[WIDE_VEC57]], <12 x float> poison, <2 x i32> <i32 3, i32 9>
+; CHECK-NEXT:    [[STRIDED_VEC88:%.*]] = shufflevector <12 x float> [[WIDE_VEC57]], <12 x float> poison, <2 x i32> <i32 4, i32 10>
+; CHECK-NEXT:    [[STRIDED_VEC89:%.*]] = shufflevector <12 x float> [[WIDE_VEC57]], <12 x float> poison, <2 x i32> <i32 5, i32 11>
 ; CHECK-NEXT:    [[TMP64:%.*]] = fmul fast <2 x float> [[STRIDED_VEC36]], [[STRIDED_VEC36]]
 ; CHECK-NEXT:    [[TMP97:%.*]] = fmul fast <2 x float> [[STRIDED_VEC43]], [[STRIDED_VEC43]]
 ; CHECK-NEXT:    [[TMP98:%.*]] = fmul fast <2 x float> [[STRIDED_VEC50]], [[STRIDED_VEC50]]
 ; CHECK-NEXT:    [[TMP99:%.*]] = fmul fast <2 x float> [[STRIDED_VEC57]], [[STRIDED_VEC57]]
+; CHECK-NEXT:    [[TMP100:%.*]] = fmul fast <2 x float> [[STRIDED_VEC42]], [[STRIDED_VEC42]]
+; CHECK-NEXT:    [[TMP101:%.*]] = fmul fast <2 x float> [[STRIDED_VEC66]], [[STRIDED_VEC66]]
+; CHECK-NEXT:    [[TMP102:%.*]] = fmul fast <2 x float> [[STRIDED_VEC72]], [[STRIDED_VEC72]]
+; CHECK-NEXT:    [[TMP103:%.*]] = fmul fast <2 x float> [[STRIDED_VEC84]], [[STRIDED_VEC84]]
 ; CHECK-NEXT:    [[TMP72:%.*]] = fmul fast <2 x float> [[STRIDED_VEC37]], [[STRIDED_VEC37]]
 ; CHECK-NEXT:    [[TMP105:%.*]] = fmul fast <2 x float> [[STRIDED_VEC44]], [[STRIDED_VEC44]]
 ; CHECK-NEXT:    [[TMP106:%.*]] = fmul fast <2 x float> [[STRIDED_VEC51]], [[STRIDED_VEC51]]
 ; CHECK-NEXT:    [[TMP107:%.*]] = fmul fast <2 x float> [[STRIDED_VEC58]], [[STRIDED_VEC58]]
+; CHECK-NEXT:    [[TMP108:%.*]] = fmul fast <2 x float> [[STRIDED_VEC49]], [[STRIDED_VEC49]]
+; CHECK-NEXT:    [[TMP109:%.*]] = fmul fast <2 x float> [[STRIDED_VEC67]], [[STRIDED_VEC67]]
+; CHECK-NEXT:    [[TMP110:%.*]] = fmul fast <2 x float> [[STRIDED_VEC73]], [[STRIDED_VEC73]]
+; CHECK-NEXT:    [[TMP111:%.*]] = fmul fast <2 x float> [[STRIDED_VEC85]], [[STRIDED_VEC85]]
 ; CHECK-NEXT:    [[TMP80:%.*]] = fadd fast <2 x float> [[TMP72]], [[TMP64]]
 ; CHECK-NEXT:    [[TMP113:%.*]] = fadd fast <2 x float> [[TMP105]], [[TMP97]]
 ; CHECK-NEXT:    [[TMP114:%.*]] = fadd fast <2 x float> [[TMP106]], [[TMP98]]
 ; CHECK-NEXT:    [[TMP115:%.*]] = fadd fast <2 x float> [[TMP107]], [[TMP99]]
-; CHECK-NEXT:    [[TMP21:%.*]] = fpext <2 x float> [[TMP80]] to <2 x double>
-; CHECK-NEXT:    [[TMP22:%.*]] = fpext <2 x float> [[TMP113]] to <2 x double>
-; CHECK-NEXT:    [[TMP23:%.*]] = fpext <2 x float> [[TMP114]] to <2 x double>
-; CHECK-NEXT:    [[TMP24:%.*]] = fpext <2 x float> [[TMP115]] to <2 x double>
-; CHECK-NEXT:    [[TMP25:%.*]] = fadd fast <2 x double> [[TMP21]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP26:%.*]] = fadd fast <2 x double> [[TMP22]], [[VEC_PHI1]]
-; CHECK-NEXT:    [[TMP27:%.*]] = fadd fast <2 x double> [[TMP23]], [[VEC_PHI2]]
-; CHECK-NEXT:    [[TMP28:%.*]] = fadd fast <2 x double> [[TMP24]], [[VEC_PHI3]]
-; CHECK-NEXT:    [[TMP100:%.*]] = fmul fast <2 x float> [[STRIDED_VEC38]], [[STRIDED_VEC38]]
-; CHECK-NEXT:    [[TMP101:%.*]] = fmul fast <2 x float> [[STRIDED_VEC45]], [[STRIDED_VEC45]]
-; CHECK-NEXT:    [[TMP102:%.*]] = fmul fast <2 x float> [[STRIDED_VEC52]], [[STRIDED_VEC52]]
-; CHECK-NEXT:    [[TMP103:%.*]] = fmul fast <2 x float> [[STRIDED_VEC59]], [[STRIDED_VEC59]]
-; CHECK-NEXT:    [[TMP108:%.*]] = fmul fast <2 x float> [[STRIDED_VEC39]], [[STRIDED_VEC39]]
-; CHECK-NEXT:    [[TMP109:%.*]] = fmul fast <2 x float> [[STRIDED_VEC46]], [[STRIDED_VEC46]]
-; CHECK-NEXT:    [[TMP110:%.*]] = fmul fast <2 x float> [[STRIDED_VEC53]], [[STRIDED_VEC53]]
-; CHECK-NEXT:    [[TMP111:%.*]] = fmul fast <2 x float> [[STRIDED_VEC60]], [[STRIDED_VEC60]]
 ; CHECK-NEXT:    [[TMP116:%.*]] = fadd fast <2 x float> [[TMP108]], [[TMP100]]
 ; CHECK-NEXT:    [[TMP117:%.*]] = fadd fast <2 x float> [[TMP109]], [[TMP101]]
 ; CHECK-NEXT:    [[TMP118:%.*]] = fadd fast <2 x float> [[TMP110]], [[TMP102]]
 ; CHECK-NEXT:    [[TMP119:%.*]] = fadd fast <2 x float> [[TMP111]], [[TMP103]]
+; CHECK-NEXT:    [[TMP40:%.*]] = fpext <2 x float> [[TMP80]] to <2 x double>
+; CHECK-NEXT:    [[TMP52:%.*]] = fpext <2 x float> [[TMP113]] to <2 x double>
+; CHECK-NEXT:    [[TMP53:%.*]] = fpext <2 x float> [[TMP114]] to <2 x double>
+; CHECK-NEXT:    [[TMP54:%.*]] = fpext <2 x float> [[TMP115]] to <2 x double>
 ; CHECK-NEXT:    [[TMP41:%.*]] = fpext <2 x float> [[TMP116]] to <2 x double>
 ; CHECK-NEXT:    [[TMP42:%.*]] = fpext <2 x float> [[TMP117]] to <2 x double>
 ; CHECK-NEXT:    [[TMP43:%.*]] = fpext <2 x float> [[TMP118]] to <2 x double>
 ; CHECK-NEXT:    [[TMP44:%.*]] = fpext <2 x float> [[TMP119]] to <2 x double>
+; CHECK-NEXT:    [[TMP55:%.*]] = fadd fast <2 x double> [[TMP40]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP49:%.*]] = fadd fast <2 x double> [[TMP52]], [[VEC_PHI2]]
+; CHECK-NEXT:    [[TMP50:%.*]] = fadd fast <2 x double> [[TMP53]], [[VEC_PHI3]]
+; CHECK-NEXT:    [[TMP51:%.*]] = fadd fast <2 x double> [[TMP54]], [[VEC_PHI4]]
 ; CHECK-NEXT:    [[TMP45:%.*]] = fadd fast <2 x double> [[TMP41]], [[TMP25]]
 ; CHECK-NEXT:    [[TMP46:%.*]] = fadd fast <2 x double> [[TMP42]], [[TMP26]]
 ; CHECK-NEXT:    [[TMP47:%.*]] = fadd fast <2 x double> [[TMP43]], [[TMP27]]
 ; CHECK-NEXT:    [[TMP48:%.*]] = fadd fast <2 x double> [[TMP44]], [[TMP28]]
-; CHECK-NEXT:    [[TMP104:%.*]] = fmul fast <2 x float> [[STRIDED_VEC40]], [[STRIDED_VEC40]]
-; CHECK-NEXT:    [[TMP142:%.*]] = fmul fast <2 x float> [[STRIDED_VEC47]], [[STRIDED_VEC47]]
-; CHECK-NEXT:    [[TMP147:%.*]] = fmul fast <2 x float> [[STRIDED_VEC54]], [[STRIDED_VEC54]]
-; CHECK-NEXT:    [[TMP152:%.*]] = fmul fast <2 x float> [[STRIDED_VEC61]], [[STRIDED_VEC61]]
-; CHECK-NEXT:    [[TMP112:%.*]] = fmul fast <2 x float> [[STRIDED_VEC41]], [[STRIDED_VEC41]]
-; CHECK-NEXT:    [[TMP143:%.*]] = fmul fast <2 x float> [[STRIDED_VEC48]], [[STRIDED_VEC48]]
-; CHECK-NEXT:    [[TMP148:%.*]] = fmul fast <2 x float> [[STRIDED_VEC55]], [[STRIDED_VEC55]]
-; CHECK-NEXT:    [[TMP153:%.*]] = fmul fast <2 x float> [[STRIDED_VEC62]], [[STRIDED_VEC62]]
+; CHECK-NEXT:    [[TMP104:%.*]] = fmul fast <2 x float> [[STRIDED_VEC38]], [[STRIDED_VEC38]]
+; CHECK-NEXT:    [[TMP142:%.*]] = fmul fast <2 x float> [[STRIDED_VEC45]], [[STRIDED_VEC45]]
+; CHECK-NEXT:    [[TMP147:%.*]] = fmul fast <2 x float> [[STRIDED_VEC52]], [[STRIDED_VEC52]]
+; CHECK-NEXT:    [[TMP152:%.*]] = fmul fast <2 x float> [[STRIDED_VEC59]], [[STRIDED_VEC59]]
+; CHECK-NEXT:    [[TMP60:%.*]] = fmul fast <2 x float> [[STRIDED_VEC56]], [[STRIDED_VEC56]]
+; CHECK-NEXT:    [[TMP67:%.*]] = fmul fast <2 x float> [[STRIDED_VEC68]], [[STRIDED_VEC68]]
+; CHECK-NEXT:    [[TMP73:%.*]] = fmul fast <2 x float> [[STRIDED_VEC80]], [[STRIDED_VEC80]]
+; CHECK-NEXT:    [[TMP74:%.*]] = fmul fast <2 x float> [[STRIDED_VEC86]], [[STRIDED_VEC86]]
+; CHECK-NEXT:    [[TMP112:%.*]] = fmul fast <2 x float> [[STRIDED_VEC39]], [[STRIDED_VEC39]]
+; CHECK-NEXT:    [[TMP143:%.*]] = fmul fast <2 x float> [[STRIDED_VEC46]], [[STRIDED_VEC46]]
+; CHECK-NEXT:    [[TMP148:%.*]] = fmul fast <2 x float> [[STRIDED_VEC53]], [[STRIDED_VEC53]]
+; CHECK-NEXT:    [[TMP153:%.*]] = fmul fast <2 x float> [[STRIDED_VEC60]], [[STRIDED_VEC60]]
+; CHECK-NEXT:    [[TMP75:%.*]] = fmul fast <2 x float> [[STRIDED_VEC63]], [[STRIDED_VEC63]]
+; CHECK-NEXT:    [[TMP81:%.*]] = fmul fast <2 x float> [[STRIDED_VEC69]], [[STRIDED_VEC69]]
+; CHECK-NEXT:    [[TMP70:%.*]] = fmul fast <2 x float> [[STRIDED_VEC81]], [[STRIDED_VEC81]]
+; CHECK-NEXT:    [[TMP71:%.*]] = fmul fast <2 x float> [[STRIDED_VEC87]], [[STRIDED_VEC87]]
 ; CHECK-NEXT:    [[TMP120:%.*]] = fadd fast <2 x float> [[TMP112]], [[TMP104]]
 ; CHECK-NEXT:    [[TMP144:%.*]] = fadd fast <2 x float> [[TMP143]], [[TMP142]]
 ; CHECK-NEXT:    [[TMP149:%.*]] = fadd fast <2 x float> [[TMP148]], [[TMP147]]
 ; CHECK-NEXT:    [[TMP154:%.*]] = fadd fast <2 x float> [[TMP153]], [[TMP152]]
+; CHECK-NEXT:    [[TMP76:%.*]] = fadd fast <2 x float> [[TMP75]], [[TMP60]]
+; CHECK-NEXT:    [[TMP77:%.*]] = fadd fast <2 x float> [[TMP81]], [[TMP67]]
+; CHECK-NEXT:    [[TMP78:%.*]] = fadd fast <2 x float> [[TMP70]], [[TMP73]]
+; CHECK-NEXT:    [[TMP79:%.*]] = fadd fast <2 x float> [[TMP71]], [[TMP74]]
 ; CHECK-NEXT:    [[TMP61:%.*]] = fpext <2 x float> [[TMP120]] to <2 x double>
 ; CHECK-NEXT:    [[TMP62:%.*]] = fpext <2 x float> [[TMP144]] to <2 x double>
 ; CHECK-NEXT:    [[TMP63:%.*]] = fpext <2 x float> [[TMP149]] to <2 x double>
 ; CHECK-NEXT:    [[TMP155:%.*]] = fpext <2 x float> [[TMP154]] to <2 x double>
-; CHECK-NEXT:    [[TMP69]] = fadd fast <2 x double> [[TMP61]], [[TMP45]]
-; CHECK-NEXT:    [[TMP65]] = fadd fast <2 x double> [[TMP62]], [[TMP46]]
-; CHECK-NEXT:    [[TMP66]] = fadd fast <2 x double> [[TMP63]], [[TMP47]]
-; CHECK-NEXT:    [[TMP67]] = fadd fast <2 x double> [[TMP155]], [[TMP48]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP84:%.*]] = fpext <2 x float> [[TMP76]] to <2 x double>
+; CHECK-NEXT:    [[TMP85:%.*]] = fpext <2 x float> [[TMP77]] to <2 x double>
+; CHECK-NEXT:    [[TMP86:%.*]] = fpext <2 x float> [[TMP78]] to <2 x double>
+; CHECK-NEXT:    [[TMP87:%.*]] = fpext <2 x float> [[TMP79]] to <2 x double>
+; CHECK-NEXT:    [[TMP88:%.*]] = fadd fast <2 x double> [[TMP61]], [[TMP55]]
+; CHECK-NEXT:    [[TMP89:%.*]] = fadd fast <2 x double> [[TMP62]], [[TMP49]]
+; CHECK-NEXT:    [[TMP90:%.*]] = fadd fast <2 x double> [[TMP63]], [[TMP50]]
+; CHECK-NEXT:    [[TMP91:%.*]] = fadd fast <2 x double> [[TMP155]], [[TMP51]]
+; CHECK-NEXT:    [[TMP92:%.*]] = fadd fast <2 x double> [[TMP84]], [[TMP45]]
+; CHECK-NEXT:    [[TMP93:%.*]] = fadd fast <2 x double> [[TMP85]], [[TMP46]]
+; CHECK-NEXT:    [[TMP94:%.*]] = fadd fast <2 x double> [[TMP86]], [[TMP47]]
+; CHECK-NEXT:    [[TMP95:%.*]] = fadd fast <2 x double> [[TMP87]], [[TMP48]]
+; CHECK-NEXT:    [[TMP96:%.*]] = fmul fast <2 x float> [[STRIDED_VEC40]], [[STRIDED_VEC40]]
+; CHECK-NEXT:    [[TMP128:%.*]] = fmul fast <2 x float> [[STRIDED_VEC47]], [[STRIDED_VEC47]]
+; CHECK-NEXT:    [[TMP129:%.*]] = fmul fast <2 x float> [[STRIDED_VEC54]], [[STRIDED_VEC54]]
+; CHECK-NEXT:    [[TMP130:%.*]] = fmul fast <2 x float> [[STRIDED_VEC61]], [[STRIDED_VEC61]]
+; CHECK-NEXT:    [[TMP136:%.*]] = fmul fast <2 x float> [[STRIDED_VEC64]], [[STRIDED_VEC64]]
+; CHECK-NEXT:    [[TMP137:%.*]] = fmul fast <2 x float> [[STRIDED_VEC70]], [[STRIDED_VEC70]]
+; CHECK-NEXT:    [[TMP139:%.*]] = fmul fast <2 x float> [[STRIDED_VEC82]], [[STRIDED_VEC82]]
+; CHECK-NEXT:    [[TMP157:%.*]] = fmul fast <2 x float> [[STRIDED_VEC88]], [[STRIDED_VEC88]]
+; CHECK-NEXT:    [[TMP159:%.*]] = fmul fast <2 x float> [[STRIDED_VEC41]], [[STRIDED_VEC41]]
+; CHECK-NEXT:    [[TMP160:%.*]] = fmul fast <2 x float> [[STRIDED_VEC48]], [[STRIDED_VEC48]]
+; CHECK-NEXT:    [[TMP161:%.*]] = fmul fast <2 x float> [[STRIDED_VEC55]], [[STRIDED_VEC55]]
+; CHECK-NEXT:    [[TMP162:%.*]] = fmul fast <2 x float> [[STRIDED_VEC62]], [[STRIDED_VEC62]]
+; CHECK-NEXT:    [[TMP163:%.*]] = fmul fast <2 x float> [[STRIDED_VEC65]], [[STRIDED_VEC65]]
+; CHECK-NEXT:    [[TMP164:%.*]] = fmul fast <2 x float> [[STRIDED_VEC71]], [[STRIDED_VEC71]]
+; CHECK-NEXT:    [[TMP165:%.*]] = fmul fast <2 x float> [[STRIDED_VEC83]], [[STRIDED_VEC83]]
+; CHECK-NEXT:    [[TMP166:%.*]] = fmul fast <2 x float> [[STRIDED_VEC89]], [[STRIDED_VEC89]]
+; CHECK-NEXT:    [[TMP167:%.*]] = fadd fast <2 x float> [[TMP159]], [[TMP96]]
+; CHECK-NEXT:    [[TMP168:%.*]] = fadd fast <2 x float> [[TMP160]], [[TMP128]]
+; CHECK-NEXT:    [[TMP169:%.*]] = fadd fast <2 x float> [[TMP161]], [[TMP129]]
+; CHECK-NEXT:    [[TMP170:%.*]] = fadd fast <2 x float> [[TMP162]], [[TMP130]]
+; CHECK-NEXT:    [[TMP171:%.*]] = fadd fast <2 x float> [[TMP163]], [[TMP136]]
+; CHECK-NEXT:    [[TMP172:%.*]] = fadd fast <2 x float> [[TMP164]], [[TMP137]]
+; CHECK-NEXT:    [[TMP173:%.*]] = fadd fast <2 x float> [[TMP165]], [[TMP139]]
+; CHECK-NEXT:    [[TMP174:%.*]] = fadd fast <2 x float> [[TMP166]], [[TMP157]]
+; CHECK-NEXT:    [[TMP175:%.*]] = fpext <2 x float> [[TMP167]] to <2 x double>
+; CHECK-NEXT:    [[TMP121:%.*]] = fpext <2 x float> [[TMP168]] to <2 x double>
+; CHECK-NEXT:    [[TMP122:%.*]] = fpext <2 x float> [[TMP169]] to <2 x double>
+; CHECK-NEXT:    [[TMP123:%.*]] = fpext <2 x float> [[TMP170]] to <2 x double>
+; CHECK-NEXT:    [[TMP124:%.*]] = fpext <2 x float> [[TMP171]] to <2 x double>
+; CHECK-NEXT:    [[TMP125:%.*]] = fpext <2 x float> [[TMP172]] to <2 x double>
+; CHECK-NEXT:    [[TMP126:%.*]] = fpext <2 x float> [[TMP173]] to <2 x double>
+; CHECK-NEXT:    [[TMP127:%.*]] = fpext <2 x float> [[TMP174]] to <2 x double>
+; CHECK-NEXT:    [[TMP69]] = fadd fast <2 x double> [[TMP175]], [[TMP88]]
+; CHECK-NEXT:    [[TMP65]] = fadd fast <2 x double> [[TMP121]], [[TMP89]]
+; CHECK-NEXT:    [[TMP66]] = fadd fast <2 x double> [[TMP122]], [[TMP90]]
+; CHECK-NEXT:    [[TMP131]] = fadd fast <2 x double> [[TMP123]], [[TMP91]]
+; CHECK-NEXT:    [[TMP132]] = fadd fast <2 x double> [[TMP124]], [[TMP92]]
+; CHECK-NEXT:    [[TMP133]] = fadd fast <2 x double> [[TMP125]], [[TMP93]]
+; CHECK-NEXT:    [[TMP134]] = fadd fast <2 x double> [[TMP126]], [[TMP94]]
+; CHECK-NEXT:    [[TMP135]] = fadd fast <2 x double> [[TMP127]], [[TMP95]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[TMP68:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP68]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <2 x double> [[TMP65]], [[TMP69]]
 ; CHECK-NEXT:    [[BIN_RDX30:%.*]] = fadd fast <2 x double> [[TMP66]], [[BIN_RDX]]
-; CHECK-NEXT:    [[TMP156:%.*]] = fadd fast <2 x double> [[TMP67]], [[BIN_RDX30]]
+; CHECK-NEXT:    [[BIN_RDX64:%.*]] = fadd fast <2 x double> [[TMP131]], [[BIN_RDX30]]
+; CHECK-NEXT:    [[BIN_RDX65:%.*]] = fadd fast <2 x double> [[TMP132]], [[BIN_RDX64]]
+; CHECK-NEXT:    [[BIN_RDX66:%.*]] = fadd fast <2 x double> [[TMP133]], [[BIN_RDX65]]
+; CHECK-NEXT:    [[BIN_RDX67:%.*]] = fadd fast <2 x double> [[TMP134]], [[BIN_RDX66]]
+; CHECK-NEXT:    [[TMP156:%.*]] = fadd fast <2 x double> [[TMP135]], [[BIN_RDX67]]
 ; CHECK-NEXT:    [[TMP158:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP156]])
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label %[[FOR_COND_FOR_END13_CRIT_EDGE:.*]], label %[[SCALAR_PH]]
-; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_COND1_PREHEADER_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP158]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_COND1_PREHEADER_PREHEADER]] ]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[FOR_COND_FOR_END13_CRIT_EDGE:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; CHECK:       [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
+; CHECK:       [[VEC_EPILOG_PH]]:
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP158]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0.000000e+00, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[N_MOD_VF69:%.*]] = urem i64 [[TMP0]], 2
+; CHECK-NEXT:    [[N_VEC70:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF69]]
+; CHECK-NEXT:    [[TMP138:%.*]] = insertelement <2 x double> zeroinitializer, double [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK:       [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT80:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI72:%.*]] = phi <2 x double> [ [[TMP138]], %[[VEC_EPILOG_PH]] ], [ [[TMP176:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX5_REALP:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 0, i32 0
+; CHECK-NEXT:    [[WIDE_VEC73:%.*]] = load <12 x float>, ptr [[ARRAYIDX5_REALP]], align 8
+; CHECK-NEXT:    [[STRIDED_VEC74:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32> <i32 0, i32 6>
+; CHECK-NEXT:    [[STRIDED_VEC75:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32> <i32 1, i32 7>
+; CHECK-NEXT:    [[STRIDED_VEC76:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32> <i32 2, i32 8>
+; CHECK-NEXT:    [[STRIDED_VEC77:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32> <i32 3, i32 9>
+; CHECK-NEXT:    [[STRIDED_VEC78:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32> <i32 4, i32 10>
+; CHECK-NEXT:    [[STRIDED_VEC79:%.*]] = shufflevector <12 x float> [[WIDE_VEC73]], <12 x float> poison, <2 x i32> <i32 5, i32 11>
+; CHECK-NEXT:    [[TMP140:%.*]] = fmul fast <2 x float> [[STRIDED_VEC74]], [[STRIDED_VEC74]]
+; CHECK-NEXT:    [[TMP141:%.*]] = fmul fast <2 x float> [[STRIDED_VEC75]], [[STRIDED_VEC75]]
+; CHECK-NEXT:    [[TMP177:%.*]] = fadd fast <2 x float> [[TMP141]], [[TMP140]]
+; CHECK-NEXT:    [[TMP178:%.*]] = fpext <2 x float> [[TMP177]] to <2 x double>
+; CHECK-NEXT:    [[TMP179:%.*]] = fadd fast <2 x double> [[TMP178]], [[VEC_PHI72]]
+; CHECK-NEXT:    [[TMP145:%.*]] = fmul fast <2 x float> [[STRIDED_VEC76]], [[STRIDED_VEC76]]
+; CHECK-NEXT:    [[TMP146:%.*]] = fmul fast <2 x float> [[STRIDED_VEC77]], [[STRIDED_VEC77]]
+; CHECK-NEXT:    [[TMP180:%.*]] = fadd fast <2 x float> [[TMP146]], [[TMP145]]
+; CHECK-NEXT:    [[TMP181:%.*]] = fpext <2 x float> [[TMP180]] to <2 x double>
+; CHECK-NEXT:    [[TMP182:%.*]] = fadd fast <2 x double> [[TMP181]], [[TMP179]]
+; CHECK-NEXT:    [[TMP150:%.*]] = fmul fast <2 x float> [[STRIDED_VEC78]], [[STRIDED_VEC78]]
+; CHECK-NEXT:    [[TMP151:%.*]] = fmul fast <2 x float> [[STRIDED_VEC79]], [[STRIDED_VEC79]]
+; CHECK-NEXT:    [[TMP183:%.*]] = fadd fast <2 x float> [[TMP151]], [[TMP150]]
+; CHECK-NEXT:    [[TMP184:%.*]] = fpext <2 x float> [[TMP183]] to <2 x double>
+; CHECK-NEXT:    [[TMP176]] = fadd fast <2 x double> [[TMP184]], [[TMP182]]
+; CHECK-NEXT:    [[INDEX_NEXT80]] = add nuw i64 [[INDVARS_IV]], 2
+; CHECK-NEXT:    [[TMP185:%.*]] = icmp eq i64 [[INDEX_NEXT80]], [[N_VEC70]]
+; CHECK-NEXT:    br i1 [[TMP185]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP186:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP176]])
+; CHECK-NEXT:    [[CMP_N81:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC70]]
+; CHECK-NEXT:    br i1 [[CMP_N81]], label %[[FOR_COND_FOR_END13_CRIT_EDGE]], label %[[VEC_EPILOG_SCALAR_PH]]
+; CHECK:       [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC70]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX82:%.*]] = phi double [ [[TMP186]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP158]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0.000000e+00, %[[ITER_CHECK]] ]
 ; CHECK-NEXT:    br label %[[FOR_COND1_PREHEADER:.*]]
 ; CHECK:       [[FOR_COND1_PREHEADER]]:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
-; CHECK-NEXT:    [[SUM_026:%.*]] = phi double [ [[ADD10_2:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ]
-; CHECK-NEXT:    [[ARRAYIDX5_REALP:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 0, i32 0
-; CHECK-NEXT:    [[ARRAYIDX5_REAL:%.*]] = load float, ptr [[ARRAYIDX5_REALP]], align 8
-; CHECK-NEXT:    [[ARRAYIDX5_IMAGP:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 0, i32 1
+; CHECK-NEXT:    [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT:    [[SUM_026:%.*]] = phi double [ [[ADD10_2:%.*]], %[[FOR_COND1_PREHEADER]] ], [ [[BC_MERGE_RDX82]], %[[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT:    [[ARRAYIDX5_REALP1:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV1]], i64 0, i32 0
+; CHECK-NEXT:    [[ARRAYIDX5_REAL:%.*]] = load float, ptr [[ARRAYIDX5_REALP1]], align 8
+; CHECK-NEXT:    [[ARRAYIDX5_IMAGP:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV1]], i64 0, i32 1
 ; CHECK-NEXT:    [[ARRAYIDX5_IMAG:%.*]] = load float, ptr [[ARRAYIDX5_IMAGP]], align 8
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul fast float [[ARRAYIDX5_REAL]], [[ARRAYIDX5_REAL]]
 ; CHECK-NEXT:    [[MUL9:%.*]] = fmul fast float [[ARRAYIDX5_IMAG]], [[ARRAYIDX5_IMAG]]
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[MUL9]], [[MUL]]
 ; CHECK-NEXT:    [[CONV:%.*]] = fpext float [[ADD]] to double
 ; CHECK-NEXT:    [[ADD10:%.*]] = fadd fast double [[CONV]], [[SUM_026]]
-; CHECK-NEXT:    [[ARRAYIDX5_REALP_1:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 1, i32 0
+; CHECK-NEXT:    [[ARRAYIDX5_REALP_1:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV1]], i64 1, i32 0
 ; CHECK-NEXT:    [[ARRAYIDX5_REAL_1:%.*]] = load float, ptr [[ARRAYIDX5_REALP_1]], align 8
-; CHECK-NEXT:    [[ARRAYIDX5_IMAGP_1:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 1, i32 1
+; CHECK-NEXT:    [[ARRAYIDX5_IMAGP_1:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV1]], i64 1, i32 1
 ; CHECK-NEXT:    [[ARRAYIDX5_IMAG_1:%.*]] = load float, ptr [[ARRAYIDX5_IMAGP_1]], align 8
 ; CHECK-NEXT:    [[MUL_1:%.*]] = fmul fast float [[ARRAYIDX5_REAL_1]], [[ARRAYIDX5_REAL_1]]
 ; CHECK-NEXT:    [[MUL9_1:%.*]] = fmul fast float [[ARRAYIDX5_IMAG_1]], [[ARRAYIDX5_IMAG_1]]
 ; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float [[MUL9_1]], [[MUL_1]]
 ; CHECK-NEXT:    [[CONV_1:%.*]] = fpext float [[ADD_1]] to double
 ; CHECK-NEXT:    [[ADD10_1:%.*]] = fadd fast double [[CONV_1]], [[ADD10]]
-; CHECK-NEXT:    [[ARRAYIDX5_REALP_2:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 2, i32 0
+; CHECK-NEXT:    [[ARRAYIDX5_REALP_2:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV1]], i64 2, i32 0
 ; CHECK-NEXT:    [[ARRAYIDX5_REAL_2:%.*]] = load float, ptr [[ARRAYIDX5_REALP_2]], align 8
-; CHECK-NEXT:    [[ARRAYIDX5_IMAGP_2:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV]], i64 2, i32 1
+; CHECK-NEXT:    [[ARRAYIDX5_IMAGP_2:%.*]] = getelementptr inbounds [3 x { float, float }], ptr [[A]], i64 [[INDVARS_IV1]], i64 2, i32 1
 ; CHECK-NEXT:    [[ARRAYIDX5_IMAG_2:%.*]] = load float, ptr [[ARRAYIDX5_IMAGP_2]], align 8
 ; CHECK-NEXT:    [[MUL_2:%.*]] = fmul fast float [[ARRAYIDX5_REAL_2]], [[ARRAYIDX5_REAL_2]]
 ; CHECK-NEXT:    [[MUL9_2:%.*]] = fmul fast float [[ARRAYIDX5_IMAG_2]], [[ARRAYIDX5_IMAG_2]]
 ; CHECK-NEXT:    [[ADD_2:%.*]] = fadd fast float [[MUL9_2]], [[MUL_2]]
 ; CHECK-NEXT:    [[CONV_2:%.*]] = fpext float [[ADD_2]] to double
 ; CHECK-NEXT:    [[ADD10_2]] = fadd fast double [[CONV_2]], [[ADD10_1]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV1]], 1
 ; CHECK-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[FOR_COND_FOR_END13_CRIT_EDGE]], label %[[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[FOR_COND_FOR_END13_CRIT_EDGE]], label %[[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       [[FOR_COND_FOR_END13_CRIT_EDGE]]:
-; CHECK-NEXT:    [[ADD10_2_LCSSA:%.*]] = phi double [ [[ADD10_2]], %[[FOR_COND1_PREHEADER]] ], [ [[TMP158]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ADD10_2_LCSSA:%.*]] = phi double [ [[ADD10_2]], %[[FOR_COND1_PREHEADER]] ], [ [[TMP158]], %[[MIDDLE_BLOCK]] ], [ [[TMP186]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    [[PHITMP:%.*]] = fptrunc double [[ADD10_2_LCSSA]] to float
 ; CHECK-NEXT:    br label %[[FOR_END13]]
 ; CHECK:       [[FOR_END13]]:
@@ -234,5 +385,6 @@ for.end13:                                        ; preds = %for.cond.for.end13_
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
 ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
index d2f8f2203b724..5d16ce5346bbf 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll
@@ -169,64 +169,140 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon
 ; AVX1-NEXT:  entry:
 ; AVX1-NEXT:    [[CMP30:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; AVX1-NEXT:    br i1 [[CMP30]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; AVX1:       for.body.preheader:
+; AVX1:       iter.check:
 ; AVX1-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; AVX1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
+; AVX1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
 ; AVX1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; AVX1:       vector.main.loop.iter.check:
+; AVX1-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
+; AVX1-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]]
 ; AVX1:       vector.ph:
-; AVX1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; AVX1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
 ; AVX1-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
 ; AVX1-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; AVX1:       vector.body:
-; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; AVX1-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
+; AVX1-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 8
+; AVX1-NEXT:    [[TMP8:%.*]] = add i64 [[INDEX]], 12
 ; AVX1-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[INDEX]], 1
 ; AVX1-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 1
+; AVX1-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP3]], 1
+; AVX1-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP8]], 1
 ; AVX1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[S1:%.*]], i64 [[TMP7]]
 ; AVX1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP2]]
+; AVX1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP5]]
+; AVX1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP6]]
 ; AVX1-NEXT:    [[WIDE_VEC2:%.*]] = load <8 x i16>, ptr [[TMP11]], align 2
 ; AVX1-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; AVX1-NEXT:    [[STRIDED_VEC9:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; AVX1-NEXT:    [[WIDE_VEC3:%.*]] = load <8 x i16>, ptr [[TMP4]], align 2
 ; AVX1-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; AVX1-NEXT:    [[STRIDED_VEC10:%.*]] = shufflevector <8 x i16> [[WIDE_VEC3]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX1-NEXT:    [[WIDE_VEC6:%.*]] = load <8 x i16>, ptr [[TMP9]], align 2
+; AVX1-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <8 x i16> [[WIDE_VEC6]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; AVX1-NEXT:    [[STRIDED_VEC8:%.*]] = shufflevector <8 x i16> [[WIDE_VEC6]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX1-NEXT:    [[WIDE_VEC9:%.*]] = load <8 x i16>, ptr [[TMP10]], align 2
+; AVX1-NEXT:    [[STRIDED_VEC12:%.*]] = shufflevector <8 x i16> [[WIDE_VEC9]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; AVX1-NEXT:    [[STRIDED_VEC11:%.*]] = shufflevector <8 x i16> [[WIDE_VEC9]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; AVX1-NEXT:    [[TMP36:%.*]] = sext <4 x i16> [[STRIDED_VEC5]] to <4 x i32>
 ; AVX1-NEXT:    [[TMP37:%.*]] = sext <4 x i16> [[STRIDED_VEC6]] to <4 x i32>
+; AVX1-NEXT:    [[TMP38:%.*]] = sext <4 x i16> [[STRIDED_VEC7]] to <4 x i32>
+; AVX1-NEXT:    [[TMP39:%.*]] = sext <4 x i16> [[STRIDED_VEC12]] to <4 x i32>
 ; AVX1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[S2:%.*]], i64 [[TMP7]]
 ; AVX1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP2]]
+; AVX1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP5]]
+; AVX1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP6]]
 ; AVX1-NEXT:    [[WIDE_VEC13:%.*]] = load <8 x i16>, ptr [[TMP22]], align 2
 ; AVX1-NEXT:    [[STRIDED_VEC17:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; AVX1-NEXT:    [[STRIDED_VEC21:%.*]] = shufflevector <8 x i16> [[WIDE_VEC13]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; AVX1-NEXT:    [[WIDE_VEC14:%.*]] = load <8 x i16>, ptr [[TMP23]], align 2
 ; AVX1-NEXT:    [[STRIDED_VEC18:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; AVX1-NEXT:    [[STRIDED_VEC22:%.*]] = shufflevector <8 x i16> [[WIDE_VEC14]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX1-NEXT:    [[WIDE_VEC18:%.*]] = load <8 x i16>, ptr [[TMP17]], align 2
+; AVX1-NEXT:    [[STRIDED_VEC19:%.*]] = shufflevector <8 x i16> [[WIDE_VEC18]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; AVX1-NEXT:    [[STRIDED_VEC20:%.*]] = shufflevector <8 x i16> [[WIDE_VEC18]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX1-NEXT:    [[WIDE_VEC21:%.*]] = load <8 x i16>, ptr [[TMP18]], align 2
+; AVX1-NEXT:    [[STRIDED_VEC24:%.*]] = shufflevector <8 x i16> [[WIDE_VEC21]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; AVX1-NEXT:    [[STRIDED_VEC23:%.*]] = shufflevector <8 x i16> [[WIDE_VEC21]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
 ; AVX1-NEXT:    [[TMP40:%.*]] = sext <4 x i16> [[STRIDED_VEC17]] to <4 x i32>
 ; AVX1-NEXT:    [[TMP41:%.*]] = sext <4 x i16> [[STRIDED_VEC18]] to <4 x i32>
+; AVX1-NEXT:    [[TMP42:%.*]] = sext <4 x i16> [[STRIDED_VEC19]] to <4 x i32>
+; AVX1-NEXT:    [[TMP43:%.*]] = sext <4 x i16> [[STRIDED_VEC24]] to <4 x i32>
 ; AVX1-NEXT:    [[TMP44:%.*]] = mul nsw <4 x i32> [[TMP40]], [[TMP36]]
 ; AVX1-NEXT:    [[TMP45:%.*]] = mul nsw <4 x i32> [[TMP41]], [[TMP37]]
-; AVX1-NEXT:    [[TMP38:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32>
-; AVX1-NEXT:    [[TMP39:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32>
-; AVX1-NEXT:    [[TMP42:%.*]] = sext <4 x i16> [[STRIDED_VEC21]] to <4 x i32>
-; AVX1-NEXT:    [[TMP43:%.*]] = sext <4 x i16> [[STRIDED_VEC22]] to <4 x i32>
 ; AVX1-NEXT:    [[TMP46:%.*]] = mul nsw <4 x i32> [[TMP42]], [[TMP38]]
 ; AVX1-NEXT:    [[TMP47:%.*]] = mul nsw <4 x i32> [[TMP43]], [[TMP39]]
-; AVX1-NEXT:    [[TMP19:%.*]] = add nsw <4 x i32> [[TMP46]], [[TMP44]]
-; AVX1-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[TMP47]], [[TMP45]]
+; AVX1-NEXT:    [[TMP27:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32>
+; AVX1-NEXT:    [[TMP28:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32>
+; AVX1-NEXT:    [[TMP29:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32>
+; AVX1-NEXT:    [[TMP30:%.*]] = sext <4 x i16> [[STRIDED_VEC11]] to <4 x i32>
+; AVX1-NEXT:    [[TMP31:%.*]] = sext <4 x i16> [[STRIDED_VEC21]] to <4 x i32>
+; AVX1-NEXT:    [[TMP32:%.*]] = sext <4 x i16> [[STRIDED_VEC22]] to <4 x i32>
+; AVX1-NEXT:    [[TMP33:%.*]] = sext <4 x i16> [[STRIDED_VEC20]] to <4 x i32>
+; AVX1-NEXT:    [[TMP34:%.*]] = sext <4 x i16> [[STRIDED_VEC23]] to <4 x i32>
+; AVX1-NEXT:    [[TMP35:%.*]] = mul nsw <4 x i32> [[TMP31]], [[TMP27]]
+; AVX1-NEXT:    [[TMP60:%.*]] = mul nsw <4 x i32> [[TMP32]], [[TMP28]]
+; AVX1-NEXT:    [[TMP67:%.*]] = mul nsw <4 x i32> [[TMP33]], [[TMP29]]
+; AVX1-NEXT:    [[TMP68:%.*]] = mul nsw <4 x i32> [[TMP34]], [[TMP30]]
+; AVX1-NEXT:    [[TMP19:%.*]] = add nsw <4 x i32> [[TMP35]], [[TMP44]]
+; AVX1-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[TMP60]], [[TMP45]]
+; AVX1-NEXT:    [[TMP69:%.*]] = add nsw <4 x i32> [[TMP67]], [[TMP46]]
+; AVX1-NEXT:    [[TMP70:%.*]] = add nsw <4 x i32> [[TMP68]], [[TMP47]]
 ; AVX1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[INDEX]]
 ; AVX1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 4
+; AVX1-NEXT:    [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 8
+; AVX1-NEXT:    [[TMP72:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 12
 ; AVX1-NEXT:    store <4 x i32> [[TMP19]], ptr [[TMP21]], align 4
 ; AVX1-NEXT:    store <4 x i32> [[TMP20]], ptr [[TMP26]], align 4
-; AVX1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; AVX1-NEXT:    store <4 x i32> [[TMP69]], ptr [[TMP71]], align 4
+; AVX1-NEXT:    store <4 x i32> [[TMP70]], ptr [[TMP72]], align 4
+; AVX1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; AVX1-NEXT:    [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; AVX1-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; AVX1:       middle.block:
 ; AVX1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; AVX1-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; AVX1:       scalar.ph:
-; AVX1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; AVX1-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; AVX1:       vec.epilog.iter.check:
+; AVX1-NEXT:    [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; AVX1-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
+; AVX1-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; AVX1:       vec.epilog.ph:
+; AVX1-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ]
+; AVX1-NEXT:    [[N_MOD_VF24:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; AVX1-NEXT:    [[N_VEC25:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF24]]
 ; AVX1-NEXT:    br label [[FOR_BODY1:%.*]]
+; AVX1:       vec.epilog.vector.body:
+; AVX1-NEXT:    [[INDEX26:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT33:%.*]], [[FOR_BODY1]] ]
+; AVX1-NEXT:    [[TMP48:%.*]] = shl nuw nsw i64 [[INDEX26]], 1
+; AVX1-NEXT:    [[TMP49:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP48]]
+; AVX1-NEXT:    [[WIDE_VEC27:%.*]] = load <8 x i16>, ptr [[TMP49]], align 2
+; AVX1-NEXT:    [[STRIDED_VEC28:%.*]] = shufflevector <8 x i16> [[WIDE_VEC27]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; AVX1-NEXT:    [[STRIDED_VEC29:%.*]] = shufflevector <8 x i16> [[WIDE_VEC27]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX1-NEXT:    [[TMP50:%.*]] = sext <4 x i16> [[STRIDED_VEC28]] to <4 x i32>
+; AVX1-NEXT:    [[TMP51:%.*]] = getelementptr inbounds i16, ptr [[S2]], i64 [[TMP48]]
+; AVX1-NEXT:    [[WIDE_VEC30:%.*]] = load <8 x i16>, ptr [[TMP51]], align 2
+; AVX1-NEXT:    [[STRIDED_VEC31:%.*]] = shufflevector <8 x i16> [[WIDE_VEC30]], <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; AVX1-NEXT:    [[STRIDED_VEC32:%.*]] = shufflevector <8 x i16> [[WIDE_VEC30]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; AVX1-NEXT:    [[TMP52:%.*]] = sext <4 x i16> [[STRIDED_VEC31]] to <4 x i32>
+; AVX1-NEXT:    [[TMP53:%.*]] = mul nsw <4 x i32> [[TMP52]], [[TMP50]]
+; AVX1-NEXT:    [[TMP54:%.*]] = sext <4 x i16> [[STRIDED_VEC29]] to <4 x i32>
+; AVX1-NEXT:    [[TMP55:%.*]] = sext <4 x i16> [[STRIDED_VEC32]] to <4 x i32>
+; AVX1-NEXT:    [[TMP56:%.*]] = mul nsw <4 x i32> [[TMP55]], [[TMP54]]
+; AVX1-NEXT:    [[TMP57:%.*]] = add nsw <4 x i32> [[TMP56]], [[TMP53]]
+; AVX1-NEXT:    [[TMP58:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[INDEX26]]
+; AVX1-NEXT:    store <4 x i32> [[TMP57]], ptr [[TMP58]], align 4
+; AVX1-NEXT:    [[INDEX_NEXT33]] = add nuw i64 [[INDEX26]], 4
+; AVX1-NEXT:    [[TMP59:%.*]] = icmp eq i64 [[INDEX_NEXT33]], [[N_VEC25]]
+; AVX1-NEXT:    br i1 [[TMP59]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY1]], !llvm.loop [[LOOP3:![0-9]+]]
+; AVX1:       vec.epilog.middle.block:
+; AVX1-NEXT:    [[CMP_N34:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC25]]
+; AVX1-NEXT:    br i1 [[CMP_N34]], label [[FOR_END_LOOPEXIT]], label [[SCALAR_PH]]
+; AVX1:       vec.epilog.scalar.ph:
+; AVX1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC25]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; AVX1-NEXT:    br label [[FOR_BODY:%.*]]
 ; AVX1:       for.body:
-; AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY1]] ]
+; AVX1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; AVX1-NEXT:    [[TMP61:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
 ; AVX1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[S1]], i64 [[TMP61]]
 ; AVX1-NEXT:    [[TMP62:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
@@ -248,7 +324,7 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon
 ; AVX1-NEXT:    store i32 [[ADD18]], ptr [[ARRAYIDX20]], align 4
 ; AVX1-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; AVX1-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; AVX1-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP3:![0-9]+]]
+; AVX1-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; AVX1:       for.end.loopexit:
 ; AVX1-NEXT:    br label [[FOR_END]]
 ; AVX1:       for.end:

>From 9639ac96dda859d06580d0511fe0c40822ebfdd1 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 12 Sep 2025 16:50:52 +0100
Subject: [PATCH 2/2] !fixup use none_of, check if DefV is in Ends.
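
The two guards are equivalent; a small sketch with plain STL algorithms
(hypothetical Defs/Ends types, for illustration only):

    #include <algorithm>
    #include <cassert>
    #include <set>
    #include <vector>

    // all_of over a negated predicate...
    bool unusedAllOf(const std::vector<int> &Defs, const std::set<int> &Ends) {
      return std::all_of(Defs.begin(), Defs.end(),
                         [&Ends](int Def) { return !Ends.count(Def); });
    }

    // ...reads more directly as none_of over the positive predicate.
    bool unusedNoneOf(const std::vector<int> &Defs, const std::set<int> &Ends) {
      return std::none_of(Defs.begin(), Defs.end(),
                          [&Ends](int Def) { return Ends.count(Def) > 0; });
    }

    int main() {
      std::vector<int> Defs = {1, 2};
      std::set<int> Ends = {3, 4};
      assert(unusedAllOf(Defs, Ends) == unusedNoneOf(Defs, Ends)); // both true
      Ends.insert(2);
      assert(unusedAllOf(Defs, Ends) == unusedNoneOf(Defs, Ends)); // both false
      return 0;
    }

The second change is needed because only values with an end point (i.e.
values actually used in the loop) are ever erased from OpenIntervals;
adding other defined values would leave them open forever and inflate
the pressure estimate.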

---
 llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 03ca5adf738de..b1a57ffe5b8f1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -527,8 +527,8 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
 
     // Ignore recipes that are never used within the loop and do not have side
     // effects.
-    if (all_of(R->definedValues(),
-               [&Ends](VPValue *Def) { return !Ends.count(Def); }) &&
+    if (none_of(R->definedValues(),
+                [&Ends](VPValue *Def) { return Ends.count(Def); }) &&
         !R->mayHaveSideEffects())
       continue;
 
@@ -596,7 +596,8 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
     // Add the VPValues defined by the current recipe to the list of open
     // intervals.
     for (VPValue *DefV : R->definedValues())
-      OpenIntervals.insert(DefV);
+      if (Ends.contains(DefV))
+        OpenIntervals.insert(DefV);
   }
 
   // We also search for instructions that are defined outside the loop, but are


