[llvm] [SLP]Initial support for ordered reductions (PR #182644)

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Sun Mar 8 10:28:58 PDT 2026


https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/182644

>From c9d5e477b4e05d483db4921c5c566aeeb3a82897 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Fri, 20 Feb 2026 17:46:54 -0800
Subject: [PATCH 1/4] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20in?=
 =?UTF-8?q?itial=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.7
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    |  83 ++++--
 .../SLPVectorizer/AArch64/tsc-s352.ll         |  20 +-
 .../SLPVectorizer/X86/dot-product.ll          |  92 ++++--
 .../Transforms/SLPVectorizer/X86/fmaxnum.ll   | 282 +++++++++++++++---
 .../Transforms/SLPVectorizer/X86/fminnum.ll   | 262 +++++++++++++---
 .../SLPVectorizer/X86/horizontal-list.ll      |  12 +-
 llvm/test/Transforms/SLPVectorizer/X86/phi.ll |  50 ++--
 .../scatter-vectorize-reorder-non-empty.ll    |  17 +-
 8 files changed, 633 insertions(+), 185 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4caa1707f0f27..b1df83e021cb7 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -25349,31 +25349,37 @@ class HorizontalReduction {
   }
 
   /// Checks if instruction is associative and can be vectorized.
-  static bool isVectorizable(RecurKind Kind, Instruction *I,
-                             bool TwoElementReduction = false) {
+  enum class ReductionKind {Unordered, Ordered, None};
+  ReductionKind RK = ReductionKind::None;
+  static ReductionKind isVectorizable(RecurKind Kind, Instruction *I,
+                                      bool TwoElementReduction = false) {
     if (Kind == RecurKind::None)
-      return false;
+      return ReductionKind::None;
 
     // Integer ops that map to select instructions or intrinsics are fine.
     if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
         isBoolLogicOp(I))
-      return true;
+      return ReductionKind::Unordered;
 
     // No need to check for associativity, if 2 reduced values.
     if (TwoElementReduction)
-      return true;
+      return ReductionKind::Unordered;
 
     if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
       // FP min/max are associative except for NaN and -0.0. We do not
       // have to rule out -0.0 here because the intrinsic semantics do not
       // specify a fixed result for it.
-      return I->getFastMathFlags().noNaNs();
+      return I->getFastMathFlags().noNaNs() ? ReductionKind::Unordered
+                                            : ReductionKind::Ordered;
     }
 
     if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
-      return true;
+      return ReductionKind::Unordered;
+
+    if (I->isAssociative())
+      return ReductionKind::Unordered;
 
-    return I->isAssociative();
+    return ::isCommutative(I) ? ReductionKind::Ordered : ReductionKind::None;
   }
 
   static Value *getRdxOperand(Instruction *I, unsigned Index) {
@@ -25675,13 +25681,10 @@ class HorizontalReduction {
     // Analyze "regular" integer/FP types for reductions - no target-specific
     // types or pointers.
     assert(ReductionRoot && "Reduction root is not set!");
-    if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
-                        all_of(ReducedVals, [](ArrayRef<Value *> Ops) {
-                          return Ops.size() == 2;
-                        })))
-      return false;
-
-    return true;
+    return isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
+                          all_of(ReducedVals, [](ArrayRef<Value *> Ops) {
+                            return Ops.size() == 2;
+                          })) != ReductionKind::None;
   }
 
   /// Try to find a reduction tree.
@@ -25689,7 +25692,8 @@ class HorizontalReduction {
                                  ScalarEvolution &SE, const DataLayout &DL,
                                  const TargetLibraryInfo &TLI) {
     RdxKind = HorizontalReduction::getRdxKind(Root);
-    if (!isVectorizable(RdxKind, Root))
+    RK = isVectorizable(RdxKind, Root);
+    if (RK == ReductionKind::None)
       return false;
 
     // Analyze "regular" integer/FP types for reductions - no target-specific
@@ -25728,16 +25732,21 @@ class HorizontalReduction {
         // reduction opcode or has too many uses - possible reduced value.
         // Also, do not try to reduce const values, if the operation is not
         // foldable.
-        if (!EdgeInst || Level > RecursionMaxDepth ||
+        bool IsReducedVal = !EdgeInst || Level > RecursionMaxDepth ||
             getRdxKind(EdgeInst) != RdxKind ||
             IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
-            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
-            !isVectorizable(RdxKind, EdgeInst) ||
+            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst);
+        ReductionKind CurrentRK = IsReducedVal
+                                      ? ReductionKind::None
+                                      : isVectorizable(RdxKind, EdgeInst);
+        if (CurrentRK == ReductionKind::None ||
             (R.isAnalyzedReductionRoot(EdgeInst) &&
              all_of(EdgeInst->operands(), IsaPred<Constant>))) {
           PossibleReducedVals.push_back(EdgeVal);
           continue;
         }
+        if (CurrentRK == ReductionKind::Ordered)
+          RK = ReductionKind::Ordered;
         ReductionOps.push_back(EdgeInst);
       }
     };
@@ -25943,6 +25952,10 @@ class HorizontalReduction {
     for (Value *U : IgnoreList)
       if (auto *FPMO = dyn_cast<FPMathOperator>(U))
         RdxFMF &= FPMO->getFastMathFlags();
+    // For ordered reductions, we need to generate extractelement
+    // instructions, so clear IgnoreList.
+    if (RK == ReductionKind::Ordered)
+      IgnoreList.clear();
     bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
 
     // Need to track reduced vals, they may be changed during vectorization of
@@ -26054,6 +26067,8 @@ class HorizontalReduction {
 
       // Emit code for constant values.
       if (Candidates.size() > 1 && allConstant(Candidates)) {
+        if (RK == ReductionKind::Ordered)
+          continue;
         Value *Res = Candidates.front();
         Value *OrigV = TrackedToOrig.at(Candidates.front());
         ++VectorizedVals.try_emplace(OrigV).first->getSecond();
@@ -26075,9 +26090,9 @@ class HorizontalReduction {
 
       // Check if we support repeated scalar values processing (optimization of
       // original scalar identity operations on matched horizontal reductions).
-      IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
-                                    RdxKind != RecurKind::FMul &&
-                                    RdxKind != RecurKind::FMulAdd;
+      IsSupportedHorRdxIdentityOp =
+          RK == ReductionKind::Unordered && RdxKind != RecurKind::Mul &&
+          RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
       // Gather same values.
       SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
       if (IsSupportedHorRdxIdentityOp)
@@ -26346,6 +26361,23 @@ class HorizontalReduction {
         // Vectorize a tree.
         Value *VectorizedRoot = V.vectorizeTree(
             LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
+        if (RK == ReductionKind::Ordered) {
+          // No need to generate reduction here, emit extractelements instead in
+          // the tree vectorizer.
+          assert(VectorizedRoot && "Expected vectorized tree");
+          // Count vectorized reduced values to exclude them from final
+          // reduction.
+          for (Value *RdxVal : VL)
+            ++VectorizedVals.try_emplace(RdxVal).first->getSecond();
+          Pos += ReduxWidth;
+          Start = Pos;
+          ReduxWidth = NumReducedVals - Pos;
+          if (ReduxWidth > 1)
+            ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
+          AnyVectorized = true;
+          VectorizedTree = ReductionRoot;
+          continue;
+        }
         // Update TrackedToOrig mapping, since the tracked values might be
         // updated.
         for (Value *RdxVal : Candidates) {
@@ -26410,6 +26442,11 @@ class HorizontalReduction {
         continue;
       }
     }
+    // Early exit for the ordered reductions.
+    // No need to do anything else here, so we can just exit.
+    if (RK == ReductionKind::Ordered)
+      return VectorizedTree;
+
     if (!VectorValuesAndScales.empty())
       VectorizedTree = GetNewVectorizedTree(
           VectorizedTree,
@@ -27458,7 +27495,7 @@ bool SLPVectorizerPass::vectorizeHorReduction(
       continue;
     if (Value *VectorizedV = TryToReduce(Inst)) {
       Res = true;
-      if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
+      if (auto *I = dyn_cast<Instruction>(VectorizedV); I && I != Inst) {
         // Try to find another reduction.
         Stack.emplace(I, Level);
         continue;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll
index 70b7adfd6456e..c9a2219b12a8a 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll
@@ -31,22 +31,16 @@ define i32 @s352() {
 ; CHECK-NEXT:    [[DOT_115:%.*]] = phi float [ 0.000000e+00, [[PREHEADER]] ], [ [[ADD39:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA:%.*]], ptr @global_data, i64 0, i32 0, i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], ptr @global_data, i64 0, i32 3, i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x float> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <4 x float> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[DOT_115]], [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
 ; CHECK-NEXT:    [[ADD15:%.*]] = fadd float [[ADD]], [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], ptr @global_data, i64 0, i32 0, i64 [[TMP7]]
-; CHECK-NEXT:    [[ARRAYIDX21:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], ptr @global_data, i64 0, i32 3, i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x float>, ptr [[ARRAYIDX18]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x float>, ptr [[ARRAYIDX21]], align 4
-; CHECK-NEXT:    [[TMP12:%.*]] = fmul <2 x float> [[TMP9]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x float> [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
 ; CHECK-NEXT:    [[ADD23:%.*]] = fadd float [[ADD15]], [[TMP13]]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x float> [[TMP12]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
 ; CHECK-NEXT:    [[ADD31:%.*]] = fadd float [[ADD23]], [[TMP14]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
 ; CHECK-NEXT:    [[ARRAYIDX34:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], ptr @global_data, i64 0, i32 0, i64 [[TMP15]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
index a333d162297bc..1b9dbaca0a34d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
@@ -9,23 +9,62 @@
 ;
 
 define double @dot4f64(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) {
-; CHECK-LABEL: @dot4f64(
-; CHECK-NEXT:    [[PTRX2:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 2
-; CHECK-NEXT:    [[PTRY2:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[PTRX2]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[PTRY2]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
-; CHECK-NEXT:    [[DOT01:%.*]] = fadd double [[TMP7]], [[TMP8]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
-; CHECK-NEXT:    [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
-; CHECK-NEXT:    [[DOT0123:%.*]] = fadd double [[DOT012]], [[TMP10]]
-; CHECK-NEXT:    ret double [[DOT0123]]
+; SSE2-LABEL: @dot4f64(
+; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4
+; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4
+; SSE2-NEXT:    [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP3]], i32 0
+; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP3]], i32 1
+; SSE2-NEXT:    [[DOT01:%.*]] = fadd double [[TMP4]], [[TMP5]]
+; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP3]], i32 2
+; SSE2-NEXT:    [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP6]]
+; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP3]], i32 3
+; SSE2-NEXT:    [[DOT0123:%.*]] = fadd double [[DOT012]], [[TMP7]]
+; SSE2-NEXT:    ret double [[DOT0123]]
+;
+; SSE4-LABEL: @dot4f64(
+; SSE4-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4
+; SSE4-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4
+; SSE4-NEXT:    [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP3]], i32 0
+; SSE4-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP3]], i32 1
+; SSE4-NEXT:    [[DOT01:%.*]] = fadd double [[TMP4]], [[TMP5]]
+; SSE4-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP3]], i32 2
+; SSE4-NEXT:    [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP6]]
+; SSE4-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP3]], i32 3
+; SSE4-NEXT:    [[DOT0123:%.*]] = fadd double [[DOT012]], [[TMP7]]
+; SSE4-NEXT:    ret double [[DOT0123]]
+;
+; AVX-LABEL: @dot4f64(
+; AVX-NEXT:    [[PTRX2:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 2
+; AVX-NEXT:    [[PTRY2:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 2
+; AVX-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX]], align 4
+; AVX-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY]], align 4
+; AVX-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[PTRX2]], align 4
+; AVX-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[PTRY2]], align 4
+; AVX-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[TMP4]], [[TMP5]]
+; AVX-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; AVX-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; AVX-NEXT:    [[DOT01:%.*]] = fadd double [[TMP7]], [[TMP8]]
+; AVX-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
+; AVX-NEXT:    [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP9]]
+; AVX-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+; AVX-NEXT:    [[DOT0123:%.*]] = fadd double [[DOT012]], [[TMP10]]
+; AVX-NEXT:    ret double [[DOT0123]]
+;
+; AVX2-LABEL: @dot4f64(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4
+; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4
+; AVX2-NEXT:    [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]]
+; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP3]], i32 0
+; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP3]], i32 1
+; AVX2-NEXT:    [[DOT01:%.*]] = fadd double [[TMP4]], [[TMP5]]
+; AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP3]], i32 2
+; AVX2-NEXT:    [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP6]]
+; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP3]], i32 3
+; AVX2-NEXT:    [[DOT0123:%.*]] = fadd double [[DOT012]], [[TMP7]]
+; AVX2-NEXT:    ret double [[DOT0123]]
 ;
   %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
   %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
@@ -53,20 +92,15 @@ define double @dot4f64(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %p
 
 define float @dot4f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
 ; CHECK-LABEL: @dot4f32(
-; CHECK-NEXT:    [[PTRX2:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 2
-; CHECK-NEXT:    [[PTRY2:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, ptr [[PTRX2]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x float>, ptr [[PTRY2]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x float> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[TMP3]], i32 1
 ; CHECK-NEXT:    [[DOT01:%.*]] = fadd float [[TMP7]], [[TMP8]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[TMP3]], i32 2
 ; CHECK-NEXT:    [[DOT012:%.*]] = fadd float [[DOT01]], [[TMP9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[TMP3]], i32 3
 ; CHECK-NEXT:    [[DOT0123:%.*]] = fadd float [[DOT012]], [[TMP10]]
 ; CHECK-NEXT:    ret float [[DOT0123]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll b/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll
index a42567c5e2e46..822f1051d45d6 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,COREI7
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,BDVER1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256
 
@@ -99,6 +99,46 @@ define void @fmaxnum_8f64() #0 {
 ; SSE-NEXT:    store <2 x double> [[TMP12]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 6), align 4
 ; SSE-NEXT:    ret void
 ;
+; COREI7-LABEL: @fmaxnum_8f64(
+; COREI7-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr @srcA64, align 4
+; COREI7-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr @srcB64, align 4
+; COREI7-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.maxnum.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]])
+; COREI7-NEXT:    store <4 x double> [[TMP3]], ptr @dst64, align 4
+; COREI7-NEXT:    [[TMP4:%.*]] = load <4 x double>, ptr getelementptr inbounds ([8 x double], ptr @srcA64, i32 0, i64 4), align 4
+; COREI7-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr getelementptr inbounds ([8 x double], ptr @srcB64, i32 0, i64 4), align 4
+; COREI7-NEXT:    [[TMP6:%.*]] = call <4 x double> @llvm.maxnum.v4f64(<4 x double> [[TMP4]], <4 x double> [[TMP5]])
+; COREI7-NEXT:    store <4 x double> [[TMP6]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 4), align 4
+; COREI7-NEXT:    ret void
+;
+; BDVER1-LABEL: @fmaxnum_8f64(
+; BDVER1-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr @srcA64, align 4
+; BDVER1-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr @srcB64, align 4
+; BDVER1-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.maxnum.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]])
+; BDVER1-NEXT:    store <4 x double> [[TMP3]], ptr @dst64, align 4
+; BDVER1-NEXT:    [[TMP4:%.*]] = load <4 x double>, ptr getelementptr inbounds ([8 x double], ptr @srcA64, i32 0, i64 4), align 4
+; BDVER1-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr getelementptr inbounds ([8 x double], ptr @srcB64, i32 0, i64 4), align 4
+; BDVER1-NEXT:    [[TMP6:%.*]] = call <4 x double> @llvm.maxnum.v4f64(<4 x double> [[TMP4]], <4 x double> [[TMP5]])
+; BDVER1-NEXT:    store <4 x double> [[TMP6]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 4), align 4
+; BDVER1-NEXT:    ret void
+;
+; AVX2-LABEL: @fmaxnum_8f64(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr @srcA64, align 4
+; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr @srcB64, align 4
+; AVX2-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.maxnum.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]])
+; AVX2-NEXT:    store <4 x double> [[TMP3]], ptr @dst64, align 4
+; AVX2-NEXT:    [[TMP4:%.*]] = load <4 x double>, ptr getelementptr inbounds ([8 x double], ptr @srcA64, i32 0, i64 4), align 4
+; AVX2-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr getelementptr inbounds ([8 x double], ptr @srcB64, i32 0, i64 4), align 4
+; AVX2-NEXT:    [[TMP6:%.*]] = call <4 x double> @llvm.maxnum.v4f64(<4 x double> [[TMP4]], <4 x double> [[TMP5]])
+; AVX2-NEXT:    store <4 x double> [[TMP6]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 4), align 4
+; AVX2-NEXT:    ret void
+;
+; AVX512-LABEL: @fmaxnum_8f64(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x double>, ptr @srcA64, align 4
+; AVX512-NEXT:    [[TMP2:%.*]] = load <8 x double>, ptr @srcB64, align 4
+; AVX512-NEXT:    [[TMP3:%.*]] = call <8 x double> @llvm.maxnum.v8f64(<8 x double> [[TMP1]], <8 x double> [[TMP2]])
+; AVX512-NEXT:    store <8 x double> [[TMP3]], ptr @dst64, align 4
+; AVX512-NEXT:    ret void
+;
 ; AVX256-LABEL: @fmaxnum_8f64(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr @srcA64, align 4
 ; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr @srcB64, align 4
@@ -109,13 +149,6 @@ define void @fmaxnum_8f64() #0 {
 ; AVX256-NEXT:    [[TMP6:%.*]] = call <4 x double> @llvm.maxnum.v4f64(<4 x double> [[TMP4]], <4 x double> [[TMP5]])
 ; AVX256-NEXT:    store <4 x double> [[TMP6]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 4), align 4
 ; AVX256-NEXT:    ret void
-;
-; AVX512-LABEL: @fmaxnum_8f64(
-; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x double>, ptr @srcA64, align 4
-; AVX512-NEXT:    [[TMP2:%.*]] = load <8 x double>, ptr @srcB64, align 4
-; AVX512-NEXT:    [[TMP3:%.*]] = call <8 x double> @llvm.maxnum.v8f64(<8 x double> [[TMP1]], <8 x double> [[TMP2]])
-; AVX512-NEXT:    store <8 x double> [[TMP3]], ptr @dst64, align 4
-; AVX512-NEXT:    ret void
 ;
   %a0 = load double, ptr @srcA64, align 4
   %a1 = load double, ptr getelementptr inbounds ([8 x double], ptr @srcA64, i32 0, i64 1), align 4
@@ -253,6 +286,46 @@ define void @fmaxnum_16f32() #0 {
 ; SSE-NEXT:    store <4 x float> [[TMP12]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 12), align 4
 ; SSE-NEXT:    ret void
 ;
+; COREI7-LABEL: @fmaxnum_16f32(
+; COREI7-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr @srcA32, align 4
+; COREI7-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr @srcB32, align 4
+; COREI7-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.maxnum.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]])
+; COREI7-NEXT:    store <8 x float> [[TMP3]], ptr @dst32, align 4
+; COREI7-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr getelementptr inbounds ([16 x float], ptr @srcA32, i32 0, i64 8), align 4
+; COREI7-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr getelementptr inbounds ([16 x float], ptr @srcB32, i32 0, i64 8), align 4
+; COREI7-NEXT:    [[TMP6:%.*]] = call <8 x float> @llvm.maxnum.v8f32(<8 x float> [[TMP4]], <8 x float> [[TMP5]])
+; COREI7-NEXT:    store <8 x float> [[TMP6]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 8), align 4
+; COREI7-NEXT:    ret void
+;
+; BDVER1-LABEL: @fmaxnum_16f32(
+; BDVER1-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr @srcA32, align 4
+; BDVER1-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr @srcB32, align 4
+; BDVER1-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.maxnum.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]])
+; BDVER1-NEXT:    store <8 x float> [[TMP3]], ptr @dst32, align 4
+; BDVER1-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr getelementptr inbounds ([16 x float], ptr @srcA32, i32 0, i64 8), align 4
+; BDVER1-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr getelementptr inbounds ([16 x float], ptr @srcB32, i32 0, i64 8), align 4
+; BDVER1-NEXT:    [[TMP6:%.*]] = call <8 x float> @llvm.maxnum.v8f32(<8 x float> [[TMP4]], <8 x float> [[TMP5]])
+; BDVER1-NEXT:    store <8 x float> [[TMP6]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 8), align 4
+; BDVER1-NEXT:    ret void
+;
+; AVX2-LABEL: @fmaxnum_16f32(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr @srcA32, align 4
+; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr @srcB32, align 4
+; AVX2-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.maxnum.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]])
+; AVX2-NEXT:    store <8 x float> [[TMP3]], ptr @dst32, align 4
+; AVX2-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr getelementptr inbounds ([16 x float], ptr @srcA32, i32 0, i64 8), align 4
+; AVX2-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr getelementptr inbounds ([16 x float], ptr @srcB32, i32 0, i64 8), align 4
+; AVX2-NEXT:    [[TMP6:%.*]] = call <8 x float> @llvm.maxnum.v8f32(<8 x float> [[TMP4]], <8 x float> [[TMP5]])
+; AVX2-NEXT:    store <8 x float> [[TMP6]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 8), align 4
+; AVX2-NEXT:    ret void
+;
+; AVX512-LABEL: @fmaxnum_16f32(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr @srcA32, align 4
+; AVX512-NEXT:    [[TMP2:%.*]] = load <16 x float>, ptr @srcB32, align 4
+; AVX512-NEXT:    [[TMP3:%.*]] = call <16 x float> @llvm.maxnum.v16f32(<16 x float> [[TMP1]], <16 x float> [[TMP2]])
+; AVX512-NEXT:    store <16 x float> [[TMP3]], ptr @dst32, align 4
+; AVX512-NEXT:    ret void
+;
 ; AVX256-LABEL: @fmaxnum_16f32(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr @srcA32, align 4
 ; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr @srcB32, align 4
@@ -263,13 +336,6 @@ define void @fmaxnum_16f32() #0 {
 ; AVX256-NEXT:    [[TMP6:%.*]] = call <8 x float> @llvm.maxnum.v8f32(<8 x float> [[TMP4]], <8 x float> [[TMP5]])
 ; AVX256-NEXT:    store <8 x float> [[TMP6]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 8), align 4
 ; AVX256-NEXT:    ret void
-;
-; AVX512-LABEL: @fmaxnum_16f32(
-; AVX512-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr @srcA32, align 4
-; AVX512-NEXT:    [[TMP2:%.*]] = load <16 x float>, ptr @srcB32, align 4
-; AVX512-NEXT:    [[TMP3:%.*]] = call <16 x float> @llvm.maxnum.v16f32(<16 x float> [[TMP1]], <16 x float> [[TMP2]])
-; AVX512-NEXT:    store <16 x float> [[TMP3]], ptr @dst32, align 4
-; AVX512-NEXT:    ret void
 ;
   %a0  = load float, ptr @srcA32, align 4
   %a1  = load float, ptr getelementptr inbounds ([16 x float], ptr @srcA32, i32 0, i64  1), align 4
@@ -379,19 +445,84 @@ define float @reduction_v4f32_nnan(ptr %p) {
 ; Negative test - must have nnan.
 
 define float @reduction_v4f32_not_fast(ptr %p) {
-; CHECK-LABEL: @reduction_v4f32_not_fast(
-; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds float, ptr [[P:%.*]], i64 1
-; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds float, ptr [[P]], i64 2
-; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds float, ptr [[P]], i64 3
-; CHECK-NEXT:    [[T0:%.*]] = load float, ptr [[P]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load float, ptr [[G1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load float, ptr [[G2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load float, ptr [[G3]], align 4
-; CHECK-NEXT:    [[M1:%.*]] = tail call float @llvm.maxnum.f32(float [[T1]], float [[T0]])
-; CHECK-NEXT:    [[M2:%.*]] = tail call float @llvm.maxnum.f32(float [[T2]], float [[M1]])
-; CHECK-NEXT:    [[M3:%.*]] = tail call float @llvm.maxnum.f32(float [[T3]], float [[M2]])
-; CHECK-NEXT:    ret float [[M3]]
+; SSE-LABEL: @reduction_v4f32_not_fast(
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
+; SSE-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; SSE-NEXT:    [[M1:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP3]], float [[TMP2]])
+; SSE-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; SSE-NEXT:    [[M2:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP4]], float [[M1]])
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; SSE-NEXT:    [[M3:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP5]], float [[M2]])
+; SSE-NEXT:    ret float [[M3]]
+;
+; COREI7-LABEL: @reduction_v4f32_not_fast(
+; COREI7-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
+; COREI7-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; COREI7-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; COREI7-NEXT:    [[M1:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP3]], float [[TMP2]])
+; COREI7-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; COREI7-NEXT:    [[M2:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP4]], float [[M1]])
+; COREI7-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; COREI7-NEXT:    [[M3:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP5]], float [[M2]])
+; COREI7-NEXT:    ret float [[M3]]
+;
+; BDVER1-LABEL: @reduction_v4f32_not_fast(
+; BDVER1-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
+; BDVER1-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; BDVER1-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; BDVER1-NEXT:    [[M1:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP3]], float [[TMP2]])
+; BDVER1-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; BDVER1-NEXT:    [[M2:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP4]], float [[M1]])
+; BDVER1-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; BDVER1-NEXT:    [[M3:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP5]], float [[M2]])
+; BDVER1-NEXT:    ret float [[M3]]
 ;
+; AVX2-LABEL: @reduction_v4f32_not_fast(
+; AVX2-NEXT:    [[G1:%.*]] = getelementptr inbounds float, ptr [[P:%.*]], i64 1
+; AVX2-NEXT:    [[G2:%.*]] = getelementptr inbounds float, ptr [[P]], i64 2
+; AVX2-NEXT:    [[G3:%.*]] = getelementptr inbounds float, ptr [[P]], i64 3
+; AVX2-NEXT:    [[T0:%.*]] = load float, ptr [[P]], align 4
+; AVX2-NEXT:    [[T1:%.*]] = load float, ptr [[G1]], align 4
+; AVX2-NEXT:    [[T2:%.*]] = load float, ptr [[G2]], align 4
+; AVX2-NEXT:    [[T3:%.*]] = load float, ptr [[G3]], align 4
+; AVX2-NEXT:    [[M1:%.*]] = tail call float @llvm.maxnum.f32(float [[T1]], float [[T0]])
+; AVX2-NEXT:    [[M2:%.*]] = tail call float @llvm.maxnum.f32(float [[T2]], float [[M1]])
+; AVX2-NEXT:    [[M3:%.*]] = tail call float @llvm.maxnum.f32(float [[T3]], float [[M2]])
+; AVX2-NEXT:    ret float [[M3]]
+;
+; AVX512-LABEL: @reduction_v4f32_not_fast(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
+; AVX512-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; AVX512-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; AVX512-NEXT:    [[M1:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP3]], float [[TMP2]])
+; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; AVX512-NEXT:    [[M2:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP4]], float [[M1]])
+; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; AVX512-NEXT:    [[M3:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP5]], float [[M2]])
+; AVX512-NEXT:    ret float [[M3]]
+;
+; AVX256-LABEL: @reduction_v4f32_not_fast(
+; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
+; AVX256-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; AVX256-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; AVX256-NEXT:    [[M1:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP3]], float [[TMP2]])
+; AVX256-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; AVX256-NEXT:    [[M2:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP4]], float [[M1]])
+; AVX256-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; AVX256-NEXT:    [[M3:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP5]], float [[M2]])
+; AVX256-NEXT:    ret float [[M3]]
+;
+; PREF-AVX256-LABEL: @reduction_v4f32_not_fast(
+; PREF-AVX256-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
+; PREF-AVX256-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; PREF-AVX256-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; PREF-AVX256-NEXT:    [[M1:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP3]], float [[TMP2]])
+; PREF-AVX256-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; PREF-AVX256-NEXT:    [[M2:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP4]], float [[M1]])
+; PREF-AVX256-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; PREF-AVX256-NEXT:    [[M3:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP5]], float [[M2]])
+; PREF-AVX256-NEXT:    ret float [[M3]]
   %g1 = getelementptr inbounds float, ptr %p, i64 1
   %g2 = getelementptr inbounds float, ptr %p, i64 2
   %g3 = getelementptr inbounds float, ptr %p, i64 3
@@ -473,19 +604,88 @@ define double @reduction_v4f64_fast(ptr %p) {
 ; Negative test - must have nnan.
 
 define double @reduction_v4f64_wrong_fmf(ptr %p) {
-; CHECK-LABEL: @reduction_v4f64_wrong_fmf(
-; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds double, ptr [[P:%.*]], i64 1
-; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds double, ptr [[P]], i64 2
-; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds double, ptr [[P]], i64 3
-; CHECK-NEXT:    [[T0:%.*]] = load double, ptr [[P]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load double, ptr [[G1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load double, ptr [[G2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load double, ptr [[G3]], align 4
-; CHECK-NEXT:    [[M1:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[T1]], double [[T0]])
-; CHECK-NEXT:    [[M2:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[T2]], double [[M1]])
-; CHECK-NEXT:    [[M3:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[T3]], double [[M2]])
-; CHECK-NEXT:    ret double [[M3]]
+; SSE-LABEL: @reduction_v4f64_wrong_fmf(
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[P:%.*]], align 4
+; SSE-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
+; SSE-NEXT:    [[M1:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[TMP3]], double [[TMP2]])
+; SSE-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
+; SSE-NEXT:    [[M2:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[TMP4]], double [[M1]])
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
+; SSE-NEXT:    [[M3:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[TMP5]], double [[M2]])
+; SSE-NEXT:    ret double [[M3]]
+;
+; COREI7-LABEL: @reduction_v4f64_wrong_fmf(
+; COREI7-NEXT:    [[G1:%.*]] = getelementptr inbounds double, ptr [[P:%.*]], i64 1
+; COREI7-NEXT:    [[G2:%.*]] = getelementptr inbounds double, ptr [[P]], i64 2
+; COREI7-NEXT:    [[G3:%.*]] = getelementptr inbounds double, ptr [[P]], i64 3
+; COREI7-NEXT:    [[T0:%.*]] = load double, ptr [[P]], align 4
+; COREI7-NEXT:    [[T1:%.*]] = load double, ptr [[G1]], align 4
+; COREI7-NEXT:    [[T2:%.*]] = load double, ptr [[G2]], align 4
+; COREI7-NEXT:    [[T3:%.*]] = load double, ptr [[G3]], align 4
+; COREI7-NEXT:    [[M1:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[T1]], double [[T0]])
+; COREI7-NEXT:    [[M2:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[T2]], double [[M1]])
+; COREI7-NEXT:    [[M3:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[T3]], double [[M2]])
+; COREI7-NEXT:    ret double [[M3]]
+;
+; BDVER1-LABEL: @reduction_v4f64_wrong_fmf(
+; BDVER1-NEXT:    [[G1:%.*]] = getelementptr inbounds double, ptr [[P:%.*]], i64 1
+; BDVER1-NEXT:    [[G2:%.*]] = getelementptr inbounds double, ptr [[P]], i64 2
+; BDVER1-NEXT:    [[G3:%.*]] = getelementptr inbounds double, ptr [[P]], i64 3
+; BDVER1-NEXT:    [[T0:%.*]] = load double, ptr [[P]], align 4
+; BDVER1-NEXT:    [[T1:%.*]] = load double, ptr [[G1]], align 4
+; BDVER1-NEXT:    [[T2:%.*]] = load double, ptr [[G2]], align 4
+; BDVER1-NEXT:    [[T3:%.*]] = load double, ptr [[G3]], align 4
+; BDVER1-NEXT:    [[M1:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[T1]], double [[T0]])
+; BDVER1-NEXT:    [[M2:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[T2]], double [[M1]])
+; BDVER1-NEXT:    [[M3:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[T3]], double [[M2]])
+; BDVER1-NEXT:    ret double [[M3]]
+;
+; AVX2-LABEL: @reduction_v4f64_wrong_fmf(
+; AVX2-NEXT:    [[G1:%.*]] = getelementptr inbounds double, ptr [[P:%.*]], i64 1
+; AVX2-NEXT:    [[G2:%.*]] = getelementptr inbounds double, ptr [[P]], i64 2
+; AVX2-NEXT:    [[G3:%.*]] = getelementptr inbounds double, ptr [[P]], i64 3
+; AVX2-NEXT:    [[T0:%.*]] = load double, ptr [[P]], align 4
+; AVX2-NEXT:    [[T1:%.*]] = load double, ptr [[G1]], align 4
+; AVX2-NEXT:    [[T2:%.*]] = load double, ptr [[G2]], align 4
+; AVX2-NEXT:    [[T3:%.*]] = load double, ptr [[G3]], align 4
+; AVX2-NEXT:    [[M1:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[T1]], double [[T0]])
+; AVX2-NEXT:    [[M2:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[T2]], double [[M1]])
+; AVX2-NEXT:    [[M3:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[T3]], double [[M2]])
+; AVX2-NEXT:    ret double [[M3]]
+;
+; AVX512-LABEL: @reduction_v4f64_wrong_fmf(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[P:%.*]], align 4
+; AVX512-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
+; AVX512-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
+; AVX512-NEXT:    [[M1:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[TMP3]], double [[TMP2]])
+; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
+; AVX512-NEXT:    [[M2:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[TMP4]], double [[M1]])
+; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
+; AVX512-NEXT:    [[M3:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[TMP5]], double [[M2]])
+; AVX512-NEXT:    ret double [[M3]]
+;
+; AVX256-LABEL: @reduction_v4f64_wrong_fmf(
+; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[P:%.*]], align 4
+; AVX256-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
+; AVX256-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
+; AVX256-NEXT:    [[M1:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[TMP3]], double [[TMP2]])
+; AVX256-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
+; AVX256-NEXT:    [[M2:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[TMP4]], double [[M1]])
+; AVX256-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
+; AVX256-NEXT:    [[M3:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[TMP5]], double [[M2]])
+; AVX256-NEXT:    ret double [[M3]]
 ;
+; PREF-AVX256-LABEL: @reduction_v4f64_wrong_fmf(
+; PREF-AVX256-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[P:%.*]], align 4
+; PREF-AVX256-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
+; PREF-AVX256-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
+; PREF-AVX256-NEXT:    [[M1:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[TMP3]], double [[TMP2]])
+; PREF-AVX256-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
+; PREF-AVX256-NEXT:    [[M2:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[TMP4]], double [[M1]])
+; PREF-AVX256-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
+; PREF-AVX256-NEXT:    [[M3:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[TMP5]], double [[M2]])
+; PREF-AVX256-NEXT:    ret double [[M3]]
   %g1 = getelementptr inbounds double, ptr %p, i64 1
   %g2 = getelementptr inbounds double, ptr %p, i64 2
   %g3 = getelementptr inbounds double, ptr %p, i64 3
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll b/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll
index 434fa13e880bb..5eb2328a3dc8f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,COREI7
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver1 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,BDVER1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256
 
@@ -99,6 +99,46 @@ define void @fminnum_8f64() #0 {
 ; SSE-NEXT:    store <2 x double> [[TMP12]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 6), align 4
 ; SSE-NEXT:    ret void
 ;
+; COREI7-LABEL: @fminnum_8f64(
+; COREI7-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr @srcA64, align 4
+; COREI7-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr @srcB64, align 4
+; COREI7-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.minnum.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]])
+; COREI7-NEXT:    store <4 x double> [[TMP3]], ptr @dst64, align 4
+; COREI7-NEXT:    [[TMP4:%.*]] = load <4 x double>, ptr getelementptr inbounds ([8 x double], ptr @srcA64, i32 0, i64 4), align 4
+; COREI7-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr getelementptr inbounds ([8 x double], ptr @srcB64, i32 0, i64 4), align 4
+; COREI7-NEXT:    [[TMP6:%.*]] = call <4 x double> @llvm.minnum.v4f64(<4 x double> [[TMP4]], <4 x double> [[TMP5]])
+; COREI7-NEXT:    store <4 x double> [[TMP6]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 4), align 4
+; COREI7-NEXT:    ret void
+;
+; BDVER1-LABEL: @fminnum_8f64(
+; BDVER1-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr @srcA64, align 4
+; BDVER1-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr @srcB64, align 4
+; BDVER1-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.minnum.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]])
+; BDVER1-NEXT:    store <4 x double> [[TMP3]], ptr @dst64, align 4
+; BDVER1-NEXT:    [[TMP4:%.*]] = load <4 x double>, ptr getelementptr inbounds ([8 x double], ptr @srcA64, i32 0, i64 4), align 4
+; BDVER1-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr getelementptr inbounds ([8 x double], ptr @srcB64, i32 0, i64 4), align 4
+; BDVER1-NEXT:    [[TMP6:%.*]] = call <4 x double> @llvm.minnum.v4f64(<4 x double> [[TMP4]], <4 x double> [[TMP5]])
+; BDVER1-NEXT:    store <4 x double> [[TMP6]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 4), align 4
+; BDVER1-NEXT:    ret void
+;
+; AVX2-LABEL: @fminnum_8f64(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr @srcA64, align 4
+; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr @srcB64, align 4
+; AVX2-NEXT:    [[TMP3:%.*]] = call <4 x double> @llvm.minnum.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]])
+; AVX2-NEXT:    store <4 x double> [[TMP3]], ptr @dst64, align 4
+; AVX2-NEXT:    [[TMP4:%.*]] = load <4 x double>, ptr getelementptr inbounds ([8 x double], ptr @srcA64, i32 0, i64 4), align 4
+; AVX2-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr getelementptr inbounds ([8 x double], ptr @srcB64, i32 0, i64 4), align 4
+; AVX2-NEXT:    [[TMP6:%.*]] = call <4 x double> @llvm.minnum.v4f64(<4 x double> [[TMP4]], <4 x double> [[TMP5]])
+; AVX2-NEXT:    store <4 x double> [[TMP6]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 4), align 4
+; AVX2-NEXT:    ret void
+;
+; AVX512-LABEL: @fminnum_8f64(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x double>, ptr @srcA64, align 4
+; AVX512-NEXT:    [[TMP2:%.*]] = load <8 x double>, ptr @srcB64, align 4
+; AVX512-NEXT:    [[TMP3:%.*]] = call <8 x double> @llvm.minnum.v8f64(<8 x double> [[TMP1]], <8 x double> [[TMP2]])
+; AVX512-NEXT:    store <8 x double> [[TMP3]], ptr @dst64, align 4
+; AVX512-NEXT:    ret void
+;
 ; AVX256-LABEL: @fminnum_8f64(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr @srcA64, align 4
 ; AVX256-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr @srcB64, align 4
@@ -109,13 +149,6 @@ define void @fminnum_8f64() #0 {
 ; AVX256-NEXT:    [[TMP6:%.*]] = call <4 x double> @llvm.minnum.v4f64(<4 x double> [[TMP4]], <4 x double> [[TMP5]])
 ; AVX256-NEXT:    store <4 x double> [[TMP6]], ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 4), align 4
 ; AVX256-NEXT:    ret void
-;
-; AVX512-LABEL: @fminnum_8f64(
-; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x double>, ptr @srcA64, align 4
-; AVX512-NEXT:    [[TMP2:%.*]] = load <8 x double>, ptr @srcB64, align 4
-; AVX512-NEXT:    [[TMP3:%.*]] = call <8 x double> @llvm.minnum.v8f64(<8 x double> [[TMP1]], <8 x double> [[TMP2]])
-; AVX512-NEXT:    store <8 x double> [[TMP3]], ptr @dst64, align 4
-; AVX512-NEXT:    ret void
 ;
   %a0 = load double, ptr @srcA64, align 4
   %a1 = load double, ptr getelementptr inbounds ([8 x double], ptr @srcA64, i32 0, i64 1), align 4
@@ -253,6 +286,46 @@ define void @fminnum_16f32() #0 {
 ; SSE-NEXT:    store <4 x float> [[TMP12]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 12), align 4
 ; SSE-NEXT:    ret void
 ;
+; COREI7-LABEL: @fminnum_16f32(
+; COREI7-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr @srcA32, align 4
+; COREI7-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr @srcB32, align 4
+; COREI7-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.minnum.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]])
+; COREI7-NEXT:    store <8 x float> [[TMP3]], ptr @dst32, align 4
+; COREI7-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr getelementptr inbounds ([16 x float], ptr @srcA32, i32 0, i64 8), align 4
+; COREI7-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr getelementptr inbounds ([16 x float], ptr @srcB32, i32 0, i64 8), align 4
+; COREI7-NEXT:    [[TMP6:%.*]] = call <8 x float> @llvm.minnum.v8f32(<8 x float> [[TMP4]], <8 x float> [[TMP5]])
+; COREI7-NEXT:    store <8 x float> [[TMP6]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 8), align 4
+; COREI7-NEXT:    ret void
+;
+; BDVER1-LABEL: @fminnum_16f32(
+; BDVER1-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr @srcA32, align 4
+; BDVER1-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr @srcB32, align 4
+; BDVER1-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.minnum.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]])
+; BDVER1-NEXT:    store <8 x float> [[TMP3]], ptr @dst32, align 4
+; BDVER1-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr getelementptr inbounds ([16 x float], ptr @srcA32, i32 0, i64 8), align 4
+; BDVER1-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr getelementptr inbounds ([16 x float], ptr @srcB32, i32 0, i64 8), align 4
+; BDVER1-NEXT:    [[TMP6:%.*]] = call <8 x float> @llvm.minnum.v8f32(<8 x float> [[TMP4]], <8 x float> [[TMP5]])
+; BDVER1-NEXT:    store <8 x float> [[TMP6]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 8), align 4
+; BDVER1-NEXT:    ret void
+;
+; AVX2-LABEL: @fminnum_16f32(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr @srcA32, align 4
+; AVX2-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr @srcB32, align 4
+; AVX2-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.minnum.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]])
+; AVX2-NEXT:    store <8 x float> [[TMP3]], ptr @dst32, align 4
+; AVX2-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr getelementptr inbounds ([16 x float], ptr @srcA32, i32 0, i64 8), align 4
+; AVX2-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr getelementptr inbounds ([16 x float], ptr @srcB32, i32 0, i64 8), align 4
+; AVX2-NEXT:    [[TMP6:%.*]] = call <8 x float> @llvm.minnum.v8f32(<8 x float> [[TMP4]], <8 x float> [[TMP5]])
+; AVX2-NEXT:    store <8 x float> [[TMP6]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 8), align 4
+; AVX2-NEXT:    ret void
+;
+; AVX512-LABEL: @fminnum_16f32(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr @srcA32, align 4
+; AVX512-NEXT:    [[TMP2:%.*]] = load <16 x float>, ptr @srcB32, align 4
+; AVX512-NEXT:    [[TMP3:%.*]] = call <16 x float> @llvm.minnum.v16f32(<16 x float> [[TMP1]], <16 x float> [[TMP2]])
+; AVX512-NEXT:    store <16 x float> [[TMP3]], ptr @dst32, align 4
+; AVX512-NEXT:    ret void
+;
 ; AVX256-LABEL: @fminnum_16f32(
 ; AVX256-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr @srcA32, align 4
 ; AVX256-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr @srcB32, align 4
@@ -263,13 +336,6 @@ define void @fminnum_16f32() #0 {
 ; AVX256-NEXT:    [[TMP6:%.*]] = call <8 x float> @llvm.minnum.v8f32(<8 x float> [[TMP4]], <8 x float> [[TMP5]])
 ; AVX256-NEXT:    store <8 x float> [[TMP6]], ptr getelementptr inbounds ([16 x float], ptr @dst32, i32 0, i64 8), align 4
 ; AVX256-NEXT:    ret void
-;
-; AVX512-LABEL: @fminnum_16f32(
-; AVX512-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr @srcA32, align 4
-; AVX512-NEXT:    [[TMP2:%.*]] = load <16 x float>, ptr @srcB32, align 4
-; AVX512-NEXT:    [[TMP3:%.*]] = call <16 x float> @llvm.minnum.v16f32(<16 x float> [[TMP1]], <16 x float> [[TMP2]])
-; AVX512-NEXT:    store <16 x float> [[TMP3]], ptr @dst32, align 4
-; AVX512-NEXT:    ret void
 ;
   %a0  = load float, ptr @srcA32, align 4
   %a1  = load float, ptr getelementptr inbounds ([16 x float], ptr @srcA32, i32 0, i64  1), align 4
@@ -379,18 +445,73 @@ define float @reduction_v4f32_nnan(ptr %p) {
 ; Negative test - must have nnan.
 
 define float @reduction_v4f32_wrong_fmf(ptr %p) {
-; CHECK-LABEL: @reduction_v4f32_wrong_fmf(
-; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds float, ptr [[P:%.*]], i64 1
-; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds float, ptr [[P]], i64 2
-; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds float, ptr [[P]], i64 3
-; CHECK-NEXT:    [[T0:%.*]] = load float, ptr [[P]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load float, ptr [[G1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load float, ptr [[G2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load float, ptr [[G3]], align 4
-; CHECK-NEXT:    [[M1:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[T1]], float [[T0]])
-; CHECK-NEXT:    [[M2:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[T2]], float [[M1]])
-; CHECK-NEXT:    [[M3:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[T3]], float [[M2]])
-; CHECK-NEXT:    ret float [[M3]]
+; SSE-LABEL: @reduction_v4f32_wrong_fmf(
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
+; SSE-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; SSE-NEXT:    [[M1:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP3]], float [[TMP2]])
+; SSE-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; SSE-NEXT:    [[M2:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP4]], float [[M1]])
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; SSE-NEXT:    [[M3:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP5]], float [[M2]])
+; SSE-NEXT:    ret float [[M3]]
+;
+; COREI7-LABEL: @reduction_v4f32_wrong_fmf(
+; COREI7-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
+; COREI7-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; COREI7-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; COREI7-NEXT:    [[M1:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP3]], float [[TMP2]])
+; COREI7-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; COREI7-NEXT:    [[M2:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP4]], float [[M1]])
+; COREI7-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; COREI7-NEXT:    [[M3:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP5]], float [[M2]])
+; COREI7-NEXT:    ret float [[M3]]
+;
+; BDVER1-LABEL: @reduction_v4f32_wrong_fmf(
+; BDVER1-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
+; BDVER1-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; BDVER1-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; BDVER1-NEXT:    [[M1:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP3]], float [[TMP2]])
+; BDVER1-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; BDVER1-NEXT:    [[M2:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP4]], float [[M1]])
+; BDVER1-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; BDVER1-NEXT:    [[M3:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP5]], float [[M2]])
+; BDVER1-NEXT:    ret float [[M3]]
+;
+; AVX2-LABEL: @reduction_v4f32_wrong_fmf(
+; AVX2-NEXT:    [[G1:%.*]] = getelementptr inbounds float, ptr [[P:%.*]], i64 1
+; AVX2-NEXT:    [[G2:%.*]] = getelementptr inbounds float, ptr [[P]], i64 2
+; AVX2-NEXT:    [[G3:%.*]] = getelementptr inbounds float, ptr [[P]], i64 3
+; AVX2-NEXT:    [[T0:%.*]] = load float, ptr [[P]], align 4
+; AVX2-NEXT:    [[T1:%.*]] = load float, ptr [[G1]], align 4
+; AVX2-NEXT:    [[T2:%.*]] = load float, ptr [[G2]], align 4
+; AVX2-NEXT:    [[T3:%.*]] = load float, ptr [[G3]], align 4
+; AVX2-NEXT:    [[M1:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[T1]], float [[T0]])
+; AVX2-NEXT:    [[M2:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[T2]], float [[M1]])
+; AVX2-NEXT:    [[M3:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[T3]], float [[M2]])
+; AVX2-NEXT:    ret float [[M3]]
+;
+; AVX512-LABEL: @reduction_v4f32_wrong_fmf(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
+; AVX512-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; AVX512-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; AVX512-NEXT:    [[M1:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP3]], float [[TMP2]])
+; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; AVX512-NEXT:    [[M2:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP4]], float [[M1]])
+; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; AVX512-NEXT:    [[M3:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP5]], float [[M2]])
+; AVX512-NEXT:    ret float [[M3]]
+;
+; AVX256-LABEL: @reduction_v4f32_wrong_fmf(
+; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
+; AVX256-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; AVX256-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; AVX256-NEXT:    [[M1:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP3]], float [[TMP2]])
+; AVX256-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; AVX256-NEXT:    [[M2:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP4]], float [[M1]])
+; AVX256-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; AVX256-NEXT:    [[M3:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP5]], float [[M2]])
+; AVX256-NEXT:    ret float [[M3]]
 ;
   %g1 = getelementptr inbounds float, ptr %p, i64 1
   %g2 = getelementptr inbounds float, ptr %p, i64 2
@@ -473,18 +594,77 @@ define double @reduction_v4f64_fast(ptr %p) {
 ; Negative test - must have nnan.
 
 define double @reduction_v4f64_not_fast(ptr %p) {
-; CHECK-LABEL: @reduction_v4f64_not_fast(
-; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds double, ptr [[P:%.*]], i64 1
-; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds double, ptr [[P]], i64 2
-; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds double, ptr [[P]], i64 3
-; CHECK-NEXT:    [[T0:%.*]] = load double, ptr [[P]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load double, ptr [[G1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load double, ptr [[G2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load double, ptr [[G3]], align 4
-; CHECK-NEXT:    [[M1:%.*]] = tail call double @llvm.minnum.f64(double [[T1]], double [[T0]])
-; CHECK-NEXT:    [[M2:%.*]] = tail call double @llvm.minnum.f64(double [[T2]], double [[M1]])
-; CHECK-NEXT:    [[M3:%.*]] = tail call double @llvm.minnum.f64(double [[T3]], double [[M2]])
-; CHECK-NEXT:    ret double [[M3]]
+; SSE-LABEL: @reduction_v4f64_not_fast(
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[P:%.*]], align 4
+; SSE-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
+; SSE-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
+; SSE-NEXT:    [[M1:%.*]] = tail call double @llvm.minnum.f64(double [[TMP3]], double [[TMP2]])
+; SSE-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
+; SSE-NEXT:    [[M2:%.*]] = tail call double @llvm.minnum.f64(double [[TMP4]], double [[M1]])
+; SSE-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
+; SSE-NEXT:    [[M3:%.*]] = tail call double @llvm.minnum.f64(double [[TMP5]], double [[M2]])
+; SSE-NEXT:    ret double [[M3]]
+;
+; COREI7-LABEL: @reduction_v4f64_not_fast(
+; COREI7-NEXT:    [[G1:%.*]] = getelementptr inbounds double, ptr [[P:%.*]], i64 1
+; COREI7-NEXT:    [[G2:%.*]] = getelementptr inbounds double, ptr [[P]], i64 2
+; COREI7-NEXT:    [[G3:%.*]] = getelementptr inbounds double, ptr [[P]], i64 3
+; COREI7-NEXT:    [[T0:%.*]] = load double, ptr [[P]], align 4
+; COREI7-NEXT:    [[T1:%.*]] = load double, ptr [[G1]], align 4
+; COREI7-NEXT:    [[T2:%.*]] = load double, ptr [[G2]], align 4
+; COREI7-NEXT:    [[T3:%.*]] = load double, ptr [[G3]], align 4
+; COREI7-NEXT:    [[M1:%.*]] = tail call double @llvm.minnum.f64(double [[T1]], double [[T0]])
+; COREI7-NEXT:    [[M2:%.*]] = tail call double @llvm.minnum.f64(double [[T2]], double [[M1]])
+; COREI7-NEXT:    [[M3:%.*]] = tail call double @llvm.minnum.f64(double [[T3]], double [[M2]])
+; COREI7-NEXT:    ret double [[M3]]
+;
+; BDVER1-LABEL: @reduction_v4f64_not_fast(
+; BDVER1-NEXT:    [[G1:%.*]] = getelementptr inbounds double, ptr [[P:%.*]], i64 1
+; BDVER1-NEXT:    [[G2:%.*]] = getelementptr inbounds double, ptr [[P]], i64 2
+; BDVER1-NEXT:    [[G3:%.*]] = getelementptr inbounds double, ptr [[P]], i64 3
+; BDVER1-NEXT:    [[T0:%.*]] = load double, ptr [[P]], align 4
+; BDVER1-NEXT:    [[T1:%.*]] = load double, ptr [[G1]], align 4
+; BDVER1-NEXT:    [[T2:%.*]] = load double, ptr [[G2]], align 4
+; BDVER1-NEXT:    [[T3:%.*]] = load double, ptr [[G3]], align 4
+; BDVER1-NEXT:    [[M1:%.*]] = tail call double @llvm.minnum.f64(double [[T1]], double [[T0]])
+; BDVER1-NEXT:    [[M2:%.*]] = tail call double @llvm.minnum.f64(double [[T2]], double [[M1]])
+; BDVER1-NEXT:    [[M3:%.*]] = tail call double @llvm.minnum.f64(double [[T3]], double [[M2]])
+; BDVER1-NEXT:    ret double [[M3]]
+;
+; AVX2-LABEL: @reduction_v4f64_not_fast(
+; AVX2-NEXT:    [[G1:%.*]] = getelementptr inbounds double, ptr [[P:%.*]], i64 1
+; AVX2-NEXT:    [[G2:%.*]] = getelementptr inbounds double, ptr [[P]], i64 2
+; AVX2-NEXT:    [[G3:%.*]] = getelementptr inbounds double, ptr [[P]], i64 3
+; AVX2-NEXT:    [[T0:%.*]] = load double, ptr [[P]], align 4
+; AVX2-NEXT:    [[T1:%.*]] = load double, ptr [[G1]], align 4
+; AVX2-NEXT:    [[T2:%.*]] = load double, ptr [[G2]], align 4
+; AVX2-NEXT:    [[T3:%.*]] = load double, ptr [[G3]], align 4
+; AVX2-NEXT:    [[M1:%.*]] = tail call double @llvm.minnum.f64(double [[T1]], double [[T0]])
+; AVX2-NEXT:    [[M2:%.*]] = tail call double @llvm.minnum.f64(double [[T2]], double [[M1]])
+; AVX2-NEXT:    [[M3:%.*]] = tail call double @llvm.minnum.f64(double [[T3]], double [[M2]])
+; AVX2-NEXT:    ret double [[M3]]
+;
+; AVX512-LABEL: @reduction_v4f64_not_fast(
+; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[P:%.*]], align 4
+; AVX512-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
+; AVX512-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
+; AVX512-NEXT:    [[M1:%.*]] = tail call double @llvm.minnum.f64(double [[TMP3]], double [[TMP2]])
+; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
+; AVX512-NEXT:    [[M2:%.*]] = tail call double @llvm.minnum.f64(double [[TMP4]], double [[M1]])
+; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
+; AVX512-NEXT:    [[M3:%.*]] = tail call double @llvm.minnum.f64(double [[TMP5]], double [[M2]])
+; AVX512-NEXT:    ret double [[M3]]
+;
+; AVX256-LABEL: @reduction_v4f64_not_fast(
+; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[P:%.*]], align 4
+; AVX256-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
+; AVX256-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
+; AVX256-NEXT:    [[M1:%.*]] = tail call double @llvm.minnum.f64(double [[TMP3]], double [[TMP2]])
+; AVX256-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
+; AVX256-NEXT:    [[M2:%.*]] = tail call double @llvm.minnum.f64(double [[TMP4]], double [[M1]])
+; AVX256-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
+; AVX256-NEXT:    [[M3:%.*]] = tail call double @llvm.minnum.f64(double [[TMP5]], double [[M2]])
+; AVX256-NEXT:    ret double [[M3]]
 ;
   %g1 = getelementptr inbounds double, ptr %p, i64 1
   %g2 = getelementptr inbounds double, ptr %p, i64 2
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
index 5cdbedb7c6dad..4e434a61e1f1c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
@@ -914,16 +914,14 @@ define float @extra_args_no_fast(ptr %x, float %a, float %b) {
 ; THRESHOLD-LABEL: @extra_args_no_fast(
 ; THRESHOLD-NEXT:    [[ADDC:%.*]] = fadd fast float [[B:%.*]], 3.000000e+00
 ; THRESHOLD-NEXT:    [[ADD:%.*]] = fadd fast float [[A:%.*]], [[ADDC]]
-; THRESHOLD-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1
-; THRESHOLD-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2
-; THRESHOLD-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 3
-; THRESHOLD-NEXT:    [[T0:%.*]] = load float, ptr [[X]], align 4
-; THRESHOLD-NEXT:    [[T1:%.*]] = load float, ptr [[ARRAYIDX3]], align 4
-; THRESHOLD-NEXT:    [[T2:%.*]] = load float, ptr [[ARRAYIDX3_1]], align 4
-; THRESHOLD-NEXT:    [[T3:%.*]] = load float, ptr [[ARRAYIDX3_2]], align 4
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[X:%.*]], align 4
+; THRESHOLD-NEXT:    [[T0:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
 ; THRESHOLD-NEXT:    [[ADD1:%.*]] = fadd fast float [[T0]], [[ADD]]
+; THRESHOLD-NEXT:    [[T1:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
 ; THRESHOLD-NEXT:    [[ADD4:%.*]] = fadd fast float [[T1]], [[ADD1]]
+; THRESHOLD-NEXT:    [[T2:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
 ; THRESHOLD-NEXT:    [[ADD4_1:%.*]] = fadd float [[T2]], [[ADD4]]
+; THRESHOLD-NEXT:    [[T3:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
 ; THRESHOLD-NEXT:    [[ADD4_2:%.*]] = fadd fast float [[T3]], [[ADD4_1]]
 ; THRESHOLD-NEXT:    [[ADD5:%.*]] = fadd fast float [[ADD4_2]], [[A]]
 ; THRESHOLD-NEXT:    ret float [[ADD5]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
index 17ae33652b6d8..caae1e3dc7da8 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
@@ -136,45 +136,47 @@ for.end:                                          ; preds = %for.body
 define float @foo3(ptr nocapture readonly %A) #0 {
 ; CHECK-LABEL: @foo3(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX1:%.*]], align 4
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[R_052:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP15:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi <2 x float> [ [[TMP0]], [[ENTRY]] ], [ [[TMP7:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
-; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[TMP5]], 7.000000e+00
-; CHECK-NEXT:    [[ADD6]] = fadd float [[R_052]], [[MUL]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]]
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr [[ARRAYIDX14]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3
-; CHECK-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV_NEXT]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX14]], align 4
+; CHECK-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add nsw i64 [[INDVARS_IV]], 4
+; CHECK-NEXT:    [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr [[ARRAYIDX24]], align 4
 ; CHECK-NEXT:    [[TMP7]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> <i32 5, i32 1, i32 2, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> poison, float [[TMP9]], i32 2
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 5>
-; CHECK-NEXT:    [[TMP14:%.*]] = fmul <4 x float> [[TMP13]], <float 8.000000e+00, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01>
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> [[TMP12]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 4>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <4 x float> [[TMP21]], <4 x float> [[TMP13]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP14:%.*]] = fmul <4 x float> [[TMP22]], <float 7.000000e+00, float 8.000000e+00, float 9.000000e+00, float 1.000000e+01>
 ; CHECK-NEXT:    [[TMP15]] = fadd <4 x float> [[TMP3]], [[TMP14]]
+; CHECK-NEXT:    [[MUL25:%.*]] = fmul float [[TMP8]], 1.100000e+01
+; CHECK-NEXT:    [[ADD6]] = fadd float [[R_052]], [[MUL25]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP16]], 121
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x float> [[TMP15]], i32 0
-; CHECK-NEXT:    [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP17]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x float> [[TMP15]], i32 1
-; CHECK-NEXT:    [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP18]]
+; CHECK-NEXT:    [[ADD29:%.*]] = fadd float [[TMP17]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x float> [[TMP15]], i32 2
 ; CHECK-NEXT:    [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP19]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x float> [[TMP15]], i32 3
 ; CHECK-NEXT:    [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP20]]
-; CHECK-NEXT:    ret float [[ADD31]]
+; CHECK-NEXT:    [[ADD32:%.*]] = fadd float [[ADD31]], [[ADD6]]
+; CHECK-NEXT:    ret float [[ADD32]]
 ;
 entry:
   %0 = load float, ptr %A, align 4
@@ -237,18 +239,18 @@ define float @sort_phi_type(ptr nocapture readonly %A) {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi <4 x float> [ splat (float 1.000000e+01), [[ENTRY]] ], [ [[TMP2:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
-; CHECK-NEXT:    [[TMP2]] = fmul <4 x float> [[TMP1]], <float 8.000000e+00, float 9.000000e+00, float 1.000000e+02, float 1.110000e+02>
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[TMP0]], <float 8.000000e+00, float 9.000000e+00, float 1.000000e+02, float 1.110000e+02>
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 4
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], 128
+; CHECK-NEXT:    [[TMP2]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
 ; CHECK-NEXT:    [[ADD29:%.*]] = fadd float [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
 ; CHECK-NEXT:    [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
 ; CHECK-NEXT:    [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP6]]
 ; CHECK-NEXT:    ret float [[ADD31]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder-non-empty.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder-non-empty.ll
index 94172cffb0295..80bd8ae07e2e2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder-non-empty.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder-non-empty.ll
@@ -7,13 +7,16 @@ define double @test01() {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr null, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr double, <2 x ptr> zeroinitializer, <2 x i32> [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 8 [[TMP2]], <2 x i1> splat (i1 true), <2 x double> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> <double 0.000000e+00, double poison>, <2 x i32> <i32 2, i32 0>
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP4]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fadd <2 x double> [[TMP3]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = fadd double [[TMP7]], [[TMP8]]
-; CHECK-NEXT:    ret double [[TMP9]]
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd double [[TMP5]], [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x double> [[TMP4]], i32 3
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd double [[TMP8]], [[TMP7]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd double 0.000000e+00, 0.000000e+00
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd double [[TMP5]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fadd double [[TMP11]], [[TMP9]]
+; CHECK-NEXT:    ret double [[TMP12]]
 ;
   %1 = load i32, ptr null, align 8
   %2 = load i32, ptr getelementptr inbounds (i32, ptr null, i32 1), align 4

>From c3ddc3f74593c6cd31043c976583f7557edba661 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Fri, 20 Feb 2026 17:51:09 -0800
Subject: [PATCH 2/4] Fix formatting

Created using spr 1.3.7
---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b1df83e021cb7..044d0d4579c04 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -25349,7 +25349,7 @@ class HorizontalReduction {
   }
 
   /// Checks if instruction is associative and can be vectorized.
-  enum class ReductionKind {Unordered, Ordered, None};
+  enum class ReductionKind { Unordered, Ordered, None };
   ReductionKind RK = ReductionKind::None;
   static ReductionKind isVectorizable(RecurKind Kind, Instruction *I,
                                       bool TwoElementReduction = false) {
@@ -25733,9 +25733,9 @@ class HorizontalReduction {
         // Also, do not try to reduce const values, if the operation is not
         // foldable.
         bool IsReducedVal = !EdgeInst || Level > RecursionMaxDepth ||
-            getRdxKind(EdgeInst) != RdxKind ||
-            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
-            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst);
+                            getRdxKind(EdgeInst) != RdxKind ||
+                            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
+                            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst);
         ReductionKind CurrentRK = IsReducedVal
                                       ? ReductionKind::None
                                       : isVectorizable(RdxKind, EdgeInst);

>From 563fabb3d279d71c6dd97d76699360bd9a8dabbe Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Sat, 21 Feb 2026 14:51:54 -0800
Subject: [PATCH 3/4] Allow multiuses for ordered reductions, fix cost modeling

Created using spr 1.3.7
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    |  81 ++++++--
 .../SLPVectorizer/RISCV/complex-loads.ll      | 190 ++++++++---------
 .../SLPVectorizer/X86/dot-product.ll          |  68 ++----
 .../Transforms/SLPVectorizer/X86/fmaxnum.ll   | 162 +++------------
 .../Transforms/SLPVectorizer/X86/fminnum.ll   | 162 +++------------
 .../SLPVectorizer/X86/horizontal-minmax.ll    |  10 +-
 .../Transforms/SLPVectorizer/X86/lookahead.ll |  17 +-
 .../X86/parent-node-split-non-schedulable.ll  |   1 -
 llvm/test/Transforms/SLPVectorizer/X86/phi.ll |  51 +++--
 .../scatter-vectorize-reorder-non-empty.ll    |  15 +-
 .../SLPVectorizer/X86/used-reduced-op.ll      | 195 +++++-------------
 .../buildvector-nodes-dependency.ll           | 110 ++++++----
 ...nsert-element-build-vector-inseltpoison.ll |   4 -
 .../insert-element-build-vector.ll            |   4 -
 14 files changed, 388 insertions(+), 682 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 044d0d4579c04..ce3005795668e 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -16375,6 +16375,11 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
     return true;
   }
 
+  if (VectorizableTree.size() == 1 && !ForReduction &&
+      VectorizableTree.front()->isGather() &&
+      VectorizableTree.front()->hasState() &&
+      VectorizableTree.front()->getOpcode() == Instruction::ExtractElement)
+    return true;
   // No need to vectorize inserts of gathered values.
   if (VectorizableTree.size() == 2 &&
       isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
@@ -25677,14 +25682,15 @@ class HorizontalReduction {
       ReducedValsToOps[V].push_back(I);
   }
 
-  bool matchReductionForOperands() const {
+  bool matchReductionForOperands() {
     // Analyze "regular" integer/FP types for reductions - no target-specific
     // types or pointers.
     assert(ReductionRoot && "Reduction root is not set!");
-    return isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
-                          all_of(ReducedVals, [](ArrayRef<Value *> Ops) {
-                            return Ops.size() == 2;
-                          })) != ReductionKind::None;
+    RK = isVectorizable(RdxKind, cast<Instruction>(ReductionRoot),
+                        all_of(ReducedVals, [](ArrayRef<Value *> Ops) {
+                          return Ops.size() == 2;
+                        }));
+    return RK != ReductionKind::None;
   }
 
   /// Try to find a reduction tree.
@@ -25706,7 +25712,7 @@ class HorizontalReduction {
     // have only single use.
     if (auto *Sel = dyn_cast<SelectInst>(Root))
       if (!Sel->getCondition()->hasOneUse())
-        return false;
+        RK = ReductionKind::Ordered;
 
     ReductionRoot = Root;
 
@@ -25716,6 +25722,7 @@ class HorizontalReduction {
     bool IsCmpSelMinMax = isCmpSelMinMax(Root);
     SmallVector<std::pair<Instruction *, unsigned>> Worklist(
         1, std::make_pair(Root, 0));
+    SmallVector<std::pair<Instruction *, unsigned>> PossibleOrderedReductionOps;
     // Checks if the operands of the \p TreeN instruction are also reduction
     // operations or should be treated as reduced values or an extra argument,
     // which is not part of the reduction.
@@ -25734,11 +25741,18 @@ class HorizontalReduction {
         // foldable.
         bool IsReducedVal = !EdgeInst || Level > RecursionMaxDepth ||
                             getRdxKind(EdgeInst) != RdxKind ||
-                            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
-                            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst);
+                            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst);
         ReductionKind CurrentRK = IsReducedVal
                                       ? ReductionKind::None
                                       : isVectorizable(RdxKind, EdgeInst);
+        if (!IsReducedVal && CurrentRK == ReductionKind::Unordered &&
+            RK == ReductionKind::Unordered &&
+            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst)) {
+          IsReducedVal = true;
+          CurrentRK = ReductionKind::None;
+          if (PossibleReducedVals.size() < ReductionLimit)
+            PossibleOrderedReductionOps.emplace_back(EdgeInst, Level);
+        }
         if (CurrentRK == ReductionKind::None ||
             (R.isAnalyzedReductionRoot(EdgeInst) &&
              all_of(EdgeInst->operands(), IsaPred<Constant>))) {
@@ -25795,22 +25809,43 @@ class HorizontalReduction {
       return hash_value(LI->getPointerOperand());
     };
 
+    SmallVector<Value *> ReducedValsCandidates;
     while (!Worklist.empty()) {
       auto [TreeN, Level] = Worklist.pop_back_val();
       SmallVector<Value *> PossibleRedVals;
       SmallVector<Instruction *> PossibleReductionOps;
       CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
       addReductionOps(TreeN);
-      // Add reduction values. The values are sorted for better vectorization
-      // results.
-      for (Value *V : PossibleRedVals) {
-        size_t Key, Idx;
-        std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
-                                               /*AllowAlternate=*/false);
-        ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
-      }
+      ReducedValsCandidates.append(PossibleRedVals.begin(),
+                                   PossibleRedVals.end());
       for (Instruction *I : reverse(PossibleReductionOps))
         Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
+      // If not enough elements for unordered vectorization, check if there are
+      // potential candidates for the ordered vectorization and try to add them
+      // to the worklist.
+      if (Worklist.empty() && ReducedValsCandidates.size() < ReductionLimit &&
+          !PossibleOrderedReductionOps.empty() &&
+          RK == ReductionKind::Unordered) {
+        RK = ReductionKind::Ordered;
+        SmallPtrSet<const Instruction *, 4> Ops;
+        for (const auto &P : PossibleOrderedReductionOps)
+          Ops.insert(P.first);
+        erase_if(ReducedValsCandidates, [&](Value *V) {
+          auto *I = dyn_cast<Instruction>(V);
+          return I && Ops.contains(I);
+        });
+        Worklist.append(PossibleOrderedReductionOps.begin(),
+                        PossibleOrderedReductionOps.end());
+        PossibleOrderedReductionOps.clear();
+      }
+    }
+    // Add reduction values. The values are sorted for better vectorization
+    // results.
+    for (Value *V : ReducedValsCandidates) {
+      size_t Key, Idx;
+      std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
+                                             /*AllowAlternate=*/false);
+      ++PossibleReducedVals[Key][Idx].try_emplace(V, 0).first->second;
     }
     auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
     // Sort values by the total number of values kinds to start the reduction
@@ -26177,7 +26212,8 @@ class HorizontalReduction {
       unsigned PrevReduxWidth = ReduxWidth;
       bool CheckForReusedReductionOpsLocal = false;
       auto AdjustReducedVals = [&](bool IgnoreVL = false) {
-        bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
+        bool IsAnyRedOpGathered = !IgnoreVL && (RK == ReductionKind::Ordered ||
+                                                V.isAnyGathered(IgnoreList));
         if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
           // Check if any of the reduction ops are gathered. If so, worth
           // trying again with less number of reduction ops.
@@ -26225,7 +26261,8 @@ class HorizontalReduction {
             }))
           break;
         V.buildTree(VL, IgnoreList);
-        if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
+        if (V.isTreeTinyAndNotFullyVectorizable(RK ==
+                                                ReductionKind::Unordered)) {
           if (!AdjustReducedVals())
             V.analyzedReductionVals(VL);
           continue;
@@ -26234,7 +26271,8 @@ class HorizontalReduction {
         // No need to reorder the root node at all for reassociative reduction.
         V.reorderBottomToTop(/*IgnoreReorder=*/RdxFMF.allowReassoc() ||
                              VL.front()->getType()->isIntOrIntVectorTy() ||
-                             ReductionLimit > 2);
+                             ReductionLimit > 2 ||
+                             RK == ReductionKind::Ordered);
         // Keep extracted other reduction values, if they are used in the
         // vectorization trees.
         BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
@@ -26299,7 +26337,8 @@ class HorizontalReduction {
 
         // Estimate cost.
         InstructionCost ReductionCost;
-        if (V.isReducedBitcastRoot() || V.isReducedCmpBitcastRoot())
+        if (RK == ReductionKind::Ordered || V.isReducedBitcastRoot() ||
+            V.isReducedCmpBitcastRoot())
           ReductionCost = 0;
         else
           ReductionCost =
@@ -26362,7 +26401,7 @@ class HorizontalReduction {
         Value *VectorizedRoot = V.vectorizeTree(
             LocalExternallyUsedValues, InsertPt, VectorValuesAndScales);
         if (RK == ReductionKind::Ordered) {
-          // No need to generate reduction here, eit extractelements instead in
+          // No need to generate reduction here, emit extractelements instead in
           // the tree vectorizer.
           assert(VectorizedRoot && "Expected vectorized tree");
           // Count vectorized reduced values to exclude them from final
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index de9ec44bcf3d2..c41697cd3d114 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -339,7 +339,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX34]], align 1
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX22]], align 1
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX10]], align 1
-; UNALIGNED_VEC_MEM-NEXT:    [[TMP5:%.*]] = load i8, ptr [[PIX2]], align 1
+; UNALIGNED_VEC_MEM-NEXT:    [[TMP60:%.*]] = load i8, ptr [[PIX2]], align 1
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP6:%.*]] = load i8, ptr [[ARRAYIDX37]], align 1
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX25]], align 1
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX13]], align 1
@@ -349,9 +349,9 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX15]], align 1
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX5]], align 1
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX32_1]], align 1
-; UNALIGNED_VEC_MEM-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX20_1]], align 1
-; UNALIGNED_VEC_MEM-NEXT:    [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX8_1]], align 1
-; UNALIGNED_VEC_MEM-NEXT:    [[TMP17:%.*]] = load i8, ptr [[ADD_PTR3]], align 1
+; UNALIGNED_VEC_MEM-NEXT:    [[TMP54:%.*]] = load i8, ptr [[ARRAYIDX20_1]], align 1
+; UNALIGNED_VEC_MEM-NEXT:    [[TMP55:%.*]] = load i8, ptr [[ARRAYIDX8_1]], align 1
+; UNALIGNED_VEC_MEM-NEXT:    [[TMP56:%.*]] = load i8, ptr [[ADD_PTR3]], align 1
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX34_1]], align 1
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP19:%.*]] = load i8, ptr [[ARRAYIDX22_1]], align 1
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP20:%.*]] = load i8, ptr [[ARRAYIDX10_1]], align 1
@@ -360,7 +360,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP23:%.*]] = load i8, ptr [[ARRAYIDX25_1]], align 1
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP24:%.*]] = load i8, ptr [[ARRAYIDX13_1]], align 1
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP25:%.*]] = load i8, ptr [[ARRAYIDX3_1]], align 1
-; UNALIGNED_VEC_MEM-NEXT:    [[TMP26:%.*]] = load i8, ptr [[ARRAYIDX39_1]], align 1
+; UNALIGNED_VEC_MEM-NEXT:    [[TMP49:%.*]] = load i8, ptr [[ARRAYIDX39_1]], align 1
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP27:%.*]] = load i8, ptr [[ARRAYIDX27_1]], align 1
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX15_1]], align 1
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX5_1]], align 1
@@ -383,18 +383,18 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP46:%.*]] = load i8, ptr [[ARRAYIDX32_3]], align 1
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP47:%.*]] = load i8, ptr [[ARRAYIDX20_3]], align 1
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP48:%.*]] = load i8, ptr [[ARRAYIDX8_3]], align 1
-; UNALIGNED_VEC_MEM-NEXT:    [[TMP49:%.*]] = load i8, ptr null, align 1
-; UNALIGNED_VEC_MEM-NEXT:    [[CONV_1:%.*]] = zext i8 [[TMP17]] to i32
-; UNALIGNED_VEC_MEM-NEXT:    [[CONV_3:%.*]] = zext i8 [[TMP49]] to i32
+; UNALIGNED_VEC_MEM-NEXT:    [[TMP57:%.*]] = load i8, ptr null, align 1
+; UNALIGNED_VEC_MEM-NEXT:    [[CONV11_3:%.*]] = zext i8 [[TMP56]] to i32
+; UNALIGNED_VEC_MEM-NEXT:    [[CONV_3:%.*]] = zext i8 [[TMP57]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV_2:%.*]] = zext i8 [[TMP33]] to i32
-; UNALIGNED_VEC_MEM-NEXT:    [[CONV9_1:%.*]] = zext i8 [[TMP16]] to i32
+; UNALIGNED_VEC_MEM-NEXT:    [[CONV23_3:%.*]] = zext i8 [[TMP55]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV9_3:%.*]] = zext i8 [[TMP48]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV9_2:%.*]] = zext i8 [[TMP32]] to i32
-; UNALIGNED_VEC_MEM-NEXT:    [[CONV21_1:%.*]] = zext i8 [[TMP15]] to i32
+; UNALIGNED_VEC_MEM-NEXT:    [[CONV35_3:%.*]] = zext i8 [[TMP54]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV21_3:%.*]] = zext i8 [[TMP47]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV21_2:%.*]] = zext i8 [[TMP31]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP50:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1
-; UNALIGNED_VEC_MEM-NEXT:    [[TMP51:%.*]] = load <4 x i8>, ptr [[PIX1]], align 1
+; UNALIGNED_VEC_MEM-NEXT:    [[TMP26:%.*]] = load <4 x i8>, ptr [[PIX1]], align 1
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP52:%.*]] = load i8, ptr [[ARRAYIDX20]], align 1
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP53:%.*]] = load i8, ptr [[ARRAYIDX8]], align 1
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV33:%.*]] = zext i8 [[TMP50]] to i32
@@ -403,49 +403,49 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV33_1:%.*]] = zext i8 [[TMP14]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV33_3:%.*]] = zext i8 [[TMP46]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV33_2:%.*]] = zext i8 [[TMP30]] to i32
-; UNALIGNED_VEC_MEM-NEXT:    [[TMP54:%.*]] = load i8, ptr [[ARRAYIDX34_3]], align 1
-; UNALIGNED_VEC_MEM-NEXT:    [[TMP55:%.*]] = load i8, ptr [[ARRAYIDX22_3]], align 1
-; UNALIGNED_VEC_MEM-NEXT:    [[TMP56:%.*]] = load i8, ptr [[ARRAYIDX10_3]], align 1
-; UNALIGNED_VEC_MEM-NEXT:    [[TMP57:%.*]] = load i8, ptr null, align 1
-; UNALIGNED_VEC_MEM-NEXT:    [[CONV2:%.*]] = zext i8 [[TMP5]] to i32
+; UNALIGNED_VEC_MEM-NEXT:    [[TMP62:%.*]] = load i8, ptr [[ARRAYIDX34_3]], align 1
+; UNALIGNED_VEC_MEM-NEXT:    [[TMP103:%.*]] = load i8, ptr [[ARRAYIDX22_3]], align 1
+; UNALIGNED_VEC_MEM-NEXT:    [[TMP104:%.*]] = load i8, ptr [[ARRAYIDX10_3]], align 1
+; UNALIGNED_VEC_MEM-NEXT:    [[TMP61:%.*]] = load i8, ptr null, align 1
+; UNALIGNED_VEC_MEM-NEXT:    [[CONV4_3:%.*]] = zext i8 [[TMP60]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV2_1:%.*]] = zext i8 [[TMP21]] to i32
-; UNALIGNED_VEC_MEM-NEXT:    [[CONV2_3:%.*]] = zext i8 [[TMP57]] to i32
+; UNALIGNED_VEC_MEM-NEXT:    [[CONV2_3:%.*]] = zext i8 [[TMP61]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV2_2:%.*]] = zext i8 [[TMP37]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV11:%.*]] = zext i8 [[TMP4]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV11_1:%.*]] = zext i8 [[TMP20]] to i32
-; UNALIGNED_VEC_MEM-NEXT:    [[CONV11_3:%.*]] = zext i8 [[TMP56]] to i32
+; UNALIGNED_VEC_MEM-NEXT:    [[CONV11_4:%.*]] = zext i8 [[TMP104]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV11_2:%.*]] = zext i8 [[TMP36]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV23:%.*]] = zext i8 [[TMP3]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV23_1:%.*]] = zext i8 [[TMP19]] to i32
-; UNALIGNED_VEC_MEM-NEXT:    [[CONV23_3:%.*]] = zext i8 [[TMP55]] to i32
+; UNALIGNED_VEC_MEM-NEXT:    [[CONV23_4:%.*]] = zext i8 [[TMP103]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV23_2:%.*]] = zext i8 [[TMP35]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV35:%.*]] = zext i8 [[TMP2]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV35_1:%.*]] = zext i8 [[TMP18]] to i32
-; UNALIGNED_VEC_MEM-NEXT:    [[CONV35_3:%.*]] = zext i8 [[TMP54]] to i32
+; UNALIGNED_VEC_MEM-NEXT:    [[CONV35_4:%.*]] = zext i8 [[TMP62]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV35_2:%.*]] = zext i8 [[TMP34]] to i32
-; UNALIGNED_VEC_MEM-NEXT:    [[TMP58:%.*]] = extractelement <4 x i8> [[TMP51]], i32 0
+; UNALIGNED_VEC_MEM-NEXT:    [[TMP58:%.*]] = extractelement <4 x i8> [[TMP26]], i32 0
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP59:%.*]] = zext i8 [[TMP58]] to i32
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB36_3:%.*]] = sub i32 [[TMP59]], [[CONV2]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB_1:%.*]] = sub i32 [[CONV_1]], [[CONV2_1]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB_3:%.*]] = sub i32 [[CONV_3]], [[CONV2_3]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB_2:%.*]] = sub i32 [[CONV_2]], [[CONV2_2]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB12:%.*]] = sub i32 [[CONV9]], [[CONV11]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB12_1:%.*]] = sub i32 [[CONV9_1]], [[CONV11_1]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB12_3:%.*]] = sub i32 [[CONV9_3]], [[CONV11_3]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB12_2:%.*]] = sub i32 [[CONV9_2]], [[CONV11_2]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB_3:%.*]] = sub i32 [[TMP59]], [[CONV4_3]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB_2:%.*]] = sub i32 [[CONV11_3]], [[CONV2_1]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB12_3:%.*]] = sub i32 [[CONV_3]], [[CONV2_3]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB12_2:%.*]] = sub i32 [[CONV_2]], [[CONV2_2]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB24_3:%.*]] = sub i32 [[CONV9]], [[CONV11]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB24_2:%.*]] = sub i32 [[CONV23_3]], [[CONV11_1]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB36_4:%.*]] = sub i32 [[CONV9_3]], [[CONV11_4]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB36_2:%.*]] = sub i32 [[CONV9_2]], [[CONV11_2]]
 ; UNALIGNED_VEC_MEM-NEXT:    [[SUB24:%.*]] = sub i32 [[CONV21]], [[CONV23]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB24_1:%.*]] = sub i32 [[CONV21_1]], [[CONV23_1]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB24_3:%.*]] = sub i32 [[CONV21_3]], [[CONV23_3]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB24_2:%.*]] = sub i32 [[CONV21_2]], [[CONV23_2]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB24_1:%.*]] = sub i32 [[CONV35_3]], [[CONV23_1]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB24_4:%.*]] = sub i32 [[CONV21_3]], [[CONV23_4]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB24_5:%.*]] = sub i32 [[CONV21_2]], [[CONV23_2]]
 ; UNALIGNED_VEC_MEM-NEXT:    [[SUB36:%.*]] = sub i32 [[CONV33]], [[CONV35]]
 ; UNALIGNED_VEC_MEM-NEXT:    [[SUB36_1:%.*]] = sub i32 [[CONV33_1]], [[CONV35_1]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB36_4:%.*]] = sub i32 [[CONV33_3]], [[CONV35_3]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB36_2:%.*]] = sub i32 [[CONV33_2]], [[CONV35_2]]
-; UNALIGNED_VEC_MEM-NEXT:    [[TMP60:%.*]] = load i8, ptr [[ARRAYIDX5_3]], align 1
-; UNALIGNED_VEC_MEM-NEXT:    [[TMP61:%.*]] = load i8, ptr null, align 1
-; UNALIGNED_VEC_MEM-NEXT:    [[CONV38_3:%.*]] = zext i8 [[TMP9]] to i32
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB36_3:%.*]] = sub i32 [[CONV33_3]], [[CONV35_4]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB36_5:%.*]] = sub i32 [[CONV33_2]], [[CONV35_2]]
+; UNALIGNED_VEC_MEM-NEXT:    [[TMP105:%.*]] = load i8, ptr [[ARRAYIDX5_3]], align 1
+; UNALIGNED_VEC_MEM-NEXT:    [[TMP106:%.*]] = load i8, ptr null, align 1
+; UNALIGNED_VEC_MEM-NEXT:    [[CONV4:%.*]] = zext i8 [[TMP9]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV4_1:%.*]] = zext i8 [[TMP25]] to i32
-; UNALIGNED_VEC_MEM-NEXT:    [[CONV4_3:%.*]] = zext i8 [[TMP60]] to i32
+; UNALIGNED_VEC_MEM-NEXT:    [[CONV4_4:%.*]] = zext i8 [[TMP105]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV4_2:%.*]] = zext i8 [[TMP41]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV14:%.*]] = zext i8 [[TMP8]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV14_1:%.*]] = zext i8 [[TMP24]] to i32
@@ -457,13 +457,13 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV26_2:%.*]] = zext i8 [[TMP39]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV38:%.*]] = zext i8 [[TMP6]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV38_1:%.*]] = zext i8 [[TMP22]] to i32
-; UNALIGNED_VEC_MEM-NEXT:    [[CONV38_4:%.*]] = zext i8 [[TMP61]] to i32
+; UNALIGNED_VEC_MEM-NEXT:    [[CONV38_3:%.*]] = zext i8 [[TMP106]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV38_2:%.*]] = zext i8 [[TMP38]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP63:%.*]] = load i8, ptr [[ARRAYIDX39_3]], align 1
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP102:%.*]] = load i8, ptr [[ARRAYIDX27_3]], align 1
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP64:%.*]] = load i8, ptr [[ARRAYIDX15_3]], align 1
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP65:%.*]] = load i8, ptr [[ARRAYIDX5_4]], align 1
-; UNALIGNED_VEC_MEM-NEXT:    [[CONV40_3:%.*]] = zext i8 [[TMP13]] to i32
+; UNALIGNED_VEC_MEM-NEXT:    [[CONV6:%.*]] = zext i8 [[TMP13]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV6_1:%.*]] = zext i8 [[TMP29]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV6_3:%.*]] = zext i8 [[TMP65]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV6_2:%.*]] = zext i8 [[TMP45]] to i32
@@ -476,33 +476,25 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV28_3:%.*]] = zext i8 [[TMP102]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV28_2:%.*]] = zext i8 [[TMP43]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV40:%.*]] = zext i8 [[TMP10]] to i32
-; UNALIGNED_VEC_MEM-NEXT:    [[CONV40_1:%.*]] = zext i8 [[TMP26]] to i32
+; UNALIGNED_VEC_MEM-NEXT:    [[CONV40_1:%.*]] = zext i8 [[TMP49]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV40_4:%.*]] = zext i8 [[TMP63]] to i32
 ; UNALIGNED_VEC_MEM-NEXT:    [[CONV40_2:%.*]] = zext i8 [[TMP42]] to i32
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB41_3:%.*]] = sub i32 [[CONV38_3]], [[CONV40_3]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB7_1:%.*]] = sub i32 [[CONV4_1]], [[CONV6_1]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB7_3:%.*]] = sub i32 [[CONV4_3]], [[CONV6_3]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB7_2:%.*]] = sub i32 [[CONV4_2]], [[CONV6_2]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB17:%.*]] = sub i32 [[CONV14]], [[CONV16]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB17_1:%.*]] = sub i32 [[CONV14_1]], [[CONV16_1]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB17_3:%.*]] = sub i32 [[CONV14_3]], [[CONV16_3]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB17_2:%.*]] = sub i32 [[CONV14_2]], [[CONV16_2]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB29:%.*]] = sub i32 [[CONV26]], [[CONV28]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB29_1:%.*]] = sub i32 [[CONV26_1]], [[CONV28_1]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB29_3:%.*]] = sub i32 [[CONV26_3]], [[CONV28_3]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB29_2:%.*]] = sub i32 [[CONV26_2]], [[CONV28_2]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB41:%.*]] = sub i32 [[CONV38]], [[CONV40]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB41_1:%.*]] = sub i32 [[CONV38_1]], [[CONV40_1]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB41_4:%.*]] = sub i32 [[CONV38_4]], [[CONV40_4]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB41_2:%.*]] = sub i32 [[CONV38_2]], [[CONV40_2]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SHL42_3:%.*]] = shl i32 [[SUB41_3]], 16
-; UNALIGNED_VEC_MEM-NEXT:    [[SHL_1:%.*]] = shl i32 [[SUB7_1]], 16
-; UNALIGNED_VEC_MEM-NEXT:    [[SHL_3:%.*]] = shl i32 [[SUB7_3]], 16
-; UNALIGNED_VEC_MEM-NEXT:    [[SHL_2:%.*]] = shl i32 [[SUB7_2]], 16
-; UNALIGNED_VEC_MEM-NEXT:    [[SHL18:%.*]] = shl i32 [[SUB17]], 16
-; UNALIGNED_VEC_MEM-NEXT:    [[SHL18_1:%.*]] = shl i32 [[SUB17_1]], 16
-; UNALIGNED_VEC_MEM-NEXT:    [[SHL18_3:%.*]] = shl i32 [[SUB17_3]], 16
-; UNALIGNED_VEC_MEM-NEXT:    [[SHL18_2:%.*]] = shl i32 [[SUB17_2]], 16
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB29:%.*]] = sub i32 [[CONV4]], [[CONV6]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB29_1:%.*]] = sub i32 [[CONV4_1]], [[CONV6_1]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB29_3:%.*]] = sub i32 [[CONV4_4]], [[CONV6_3]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB29_2:%.*]] = sub i32 [[CONV4_2]], [[CONV6_2]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB41:%.*]] = sub i32 [[CONV14]], [[CONV16]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB41_1:%.*]] = sub i32 [[CONV14_1]], [[CONV16_1]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB41_4:%.*]] = sub i32 [[CONV14_3]], [[CONV16_3]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB41_2:%.*]] = sub i32 [[CONV14_2]], [[CONV16_2]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB30:%.*]] = sub i32 [[CONV26]], [[CONV28]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB29_4:%.*]] = sub i32 [[CONV26_1]], [[CONV28_1]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB29_5:%.*]] = sub i32 [[CONV26_3]], [[CONV28_3]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB29_6:%.*]] = sub i32 [[CONV26_2]], [[CONV28_2]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB42:%.*]] = sub i32 [[CONV38]], [[CONV40]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB41_5:%.*]] = sub i32 [[CONV38_1]], [[CONV40_1]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB41_3:%.*]] = sub i32 [[CONV38_3]], [[CONV40_4]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB41_6:%.*]] = sub i32 [[CONV38_2]], [[CONV40_2]]
 ; UNALIGNED_VEC_MEM-NEXT:    [[SHL30:%.*]] = shl i32 [[SUB29]], 16
 ; UNALIGNED_VEC_MEM-NEXT:    [[SHL30_1:%.*]] = shl i32 [[SUB29_1]], 16
 ; UNALIGNED_VEC_MEM-NEXT:    [[SHL30_3:%.*]] = shl i32 [[SUB29_3]], 16
@@ -511,38 +503,46 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; UNALIGNED_VEC_MEM-NEXT:    [[SHL42_1:%.*]] = shl i32 [[SUB41_1]], 16
 ; UNALIGNED_VEC_MEM-NEXT:    [[SHL42_4:%.*]] = shl i32 [[SUB41_4]], 16
 ; UNALIGNED_VEC_MEM-NEXT:    [[SHL42_2:%.*]] = shl i32 [[SUB41_2]], 16
-; UNALIGNED_VEC_MEM-NEXT:    [[ADD43_3:%.*]] = add i32 [[SHL42_3]], [[SUB36_3]]
-; UNALIGNED_VEC_MEM-NEXT:    [[ADD_1:%.*]] = add i32 [[SHL_1]], [[SUB_1]]
-; UNALIGNED_VEC_MEM-NEXT:    [[ADD_3:%.*]] = add i32 [[SHL_3]], [[SUB_3]]
-; UNALIGNED_VEC_MEM-NEXT:    [[ADD_2:%.*]] = add i32 [[SHL_2]], [[SUB_2]]
-; UNALIGNED_VEC_MEM-NEXT:    [[ADD19:%.*]] = add i32 [[SHL18]], [[SUB12]]
-; UNALIGNED_VEC_MEM-NEXT:    [[ADD19_1:%.*]] = add i32 [[SHL18_1]], [[SUB12_1]]
-; UNALIGNED_VEC_MEM-NEXT:    [[ADD19_3:%.*]] = add i32 [[SHL18_3]], [[SUB12_3]]
-; UNALIGNED_VEC_MEM-NEXT:    [[ADD19_2:%.*]] = add i32 [[SHL18_2]], [[SUB12_2]]
-; UNALIGNED_VEC_MEM-NEXT:    [[ADD31:%.*]] = add i32 [[SHL30]], [[SUB24]]
-; UNALIGNED_VEC_MEM-NEXT:    [[ADD31_1:%.*]] = add i32 [[SHL30_1]], [[SUB24_1]]
-; UNALIGNED_VEC_MEM-NEXT:    [[ADD31_3:%.*]] = add i32 [[SHL30_3]], [[SUB24_3]]
-; UNALIGNED_VEC_MEM-NEXT:    [[ADD31_2:%.*]] = add i32 [[SHL30_2]], [[SUB24_2]]
-; UNALIGNED_VEC_MEM-NEXT:    [[ADD43:%.*]] = add i32 [[SHL42]], [[SUB36]]
-; UNALIGNED_VEC_MEM-NEXT:    [[ADD43_1:%.*]] = add i32 [[SHL42_1]], [[SUB36_1]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SHL31:%.*]] = shl i32 [[SUB30]], 16
+; UNALIGNED_VEC_MEM-NEXT:    [[SHL30_4:%.*]] = shl i32 [[SUB29_4]], 16
+; UNALIGNED_VEC_MEM-NEXT:    [[SHL30_5:%.*]] = shl i32 [[SUB29_5]], 16
+; UNALIGNED_VEC_MEM-NEXT:    [[SHL30_6:%.*]] = shl i32 [[SUB29_6]], 16
+; UNALIGNED_VEC_MEM-NEXT:    [[SHL43:%.*]] = shl i32 [[SUB42]], 16
+; UNALIGNED_VEC_MEM-NEXT:    [[SHL42_5:%.*]] = shl i32 [[SUB41_5]], 16
+; UNALIGNED_VEC_MEM-NEXT:    [[SHL42_3:%.*]] = shl i32 [[SUB41_3]], 16
+; UNALIGNED_VEC_MEM-NEXT:    [[SHL42_6:%.*]] = shl i32 [[SUB41_6]], 16
+; UNALIGNED_VEC_MEM-NEXT:    [[ADD_3:%.*]] = add i32 [[SHL30]], [[SUB_3]]
+; UNALIGNED_VEC_MEM-NEXT:    [[ADD_2:%.*]] = add i32 [[SHL30_1]], [[SUB_2]]
+; UNALIGNED_VEC_MEM-NEXT:    [[ADD19_3:%.*]] = add i32 [[SHL30_3]], [[SUB12_3]]
+; UNALIGNED_VEC_MEM-NEXT:    [[ADD19_2:%.*]] = add i32 [[SHL30_2]], [[SUB12_2]]
+; UNALIGNED_VEC_MEM-NEXT:    [[ADD31_3:%.*]] = add i32 [[SHL42]], [[SUB24_3]]
+; UNALIGNED_VEC_MEM-NEXT:    [[ADD31_2:%.*]] = add i32 [[SHL42_1]], [[SUB24_2]]
 ; UNALIGNED_VEC_MEM-NEXT:    [[ADD43_4:%.*]] = add i32 [[SHL42_4]], [[SUB36_4]]
 ; UNALIGNED_VEC_MEM-NEXT:    [[ADD43_2:%.*]] = add i32 [[SHL42_2]], [[SUB36_2]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB45:%.*]] = sub i32 [[ADD43_3]], [[ADD19]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB45_1:%.*]] = sub i32 [[ADD_1]], [[ADD19_1]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB45_3:%.*]] = sub i32 [[ADD_3]], [[ADD19_3]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB45_2:%.*]] = sub i32 [[ADD_2]], [[ADD19_2]]
-; UNALIGNED_VEC_MEM-NEXT:    [[ADD44:%.*]] = add i32 [[ADD19]], [[ADD43_3]]
-; UNALIGNED_VEC_MEM-NEXT:    [[ADD44_1:%.*]] = add i32 [[ADD19_1]], [[ADD_1]]
-; UNALIGNED_VEC_MEM-NEXT:    [[ADD44_3:%.*]] = add i32 [[ADD19_3]], [[ADD_3]]
-; UNALIGNED_VEC_MEM-NEXT:    [[ADD44_2:%.*]] = add i32 [[ADD19_2]], [[ADD_2]]
+; UNALIGNED_VEC_MEM-NEXT:    [[ADD31:%.*]] = add i32 [[SHL31]], [[SUB24]]
+; UNALIGNED_VEC_MEM-NEXT:    [[ADD31_1:%.*]] = add i32 [[SHL30_4]], [[SUB24_1]]
+; UNALIGNED_VEC_MEM-NEXT:    [[ADD31_4:%.*]] = add i32 [[SHL30_5]], [[SUB24_4]]
+; UNALIGNED_VEC_MEM-NEXT:    [[ADD31_5:%.*]] = add i32 [[SHL30_6]], [[SUB24_5]]
+; UNALIGNED_VEC_MEM-NEXT:    [[ADD43:%.*]] = add i32 [[SHL43]], [[SUB36]]
+; UNALIGNED_VEC_MEM-NEXT:    [[ADD43_1:%.*]] = add i32 [[SHL42_5]], [[SUB36_1]]
+; UNALIGNED_VEC_MEM-NEXT:    [[ADD43_3:%.*]] = add i32 [[SHL42_3]], [[SUB36_3]]
+; UNALIGNED_VEC_MEM-NEXT:    [[ADD43_5:%.*]] = add i32 [[SHL42_6]], [[SUB36_5]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB45:%.*]] = sub i32 [[ADD_3]], [[ADD31_3]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB45_1:%.*]] = sub i32 [[ADD_2]], [[ADD31_2]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB45_3:%.*]] = sub i32 [[ADD19_3]], [[ADD43_4]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB45_2:%.*]] = sub i32 [[ADD19_2]], [[ADD43_2]]
+; UNALIGNED_VEC_MEM-NEXT:    [[ADD44:%.*]] = add i32 [[ADD31_3]], [[ADD_3]]
+; UNALIGNED_VEC_MEM-NEXT:    [[ADD44_1:%.*]] = add i32 [[ADD31_2]], [[ADD_2]]
+; UNALIGNED_VEC_MEM-NEXT:    [[ADD44_3:%.*]] = add i32 [[ADD43_4]], [[ADD19_3]]
+; UNALIGNED_VEC_MEM-NEXT:    [[ADD44_2:%.*]] = add i32 [[ADD43_2]], [[ADD19_2]]
 ; UNALIGNED_VEC_MEM-NEXT:    [[SUB47:%.*]] = sub i32 [[ADD31]], [[ADD43]]
 ; UNALIGNED_VEC_MEM-NEXT:    [[SUB47_1:%.*]] = sub i32 [[ADD31_1]], [[ADD43_1]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB47_3:%.*]] = sub i32 [[ADD31_3]], [[ADD43_4]]
-; UNALIGNED_VEC_MEM-NEXT:    [[SUB47_2:%.*]] = sub i32 [[ADD31_2]], [[ADD43_2]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB47_3:%.*]] = sub i32 [[ADD31_4]], [[ADD43_3]]
+; UNALIGNED_VEC_MEM-NEXT:    [[SUB47_2:%.*]] = sub i32 [[ADD31_5]], [[ADD43_5]]
 ; UNALIGNED_VEC_MEM-NEXT:    [[ADD46:%.*]] = add i32 [[ADD43]], [[ADD31]]
 ; UNALIGNED_VEC_MEM-NEXT:    [[ADD46_1:%.*]] = add i32 [[ADD43_1]], [[ADD31_1]]
-; UNALIGNED_VEC_MEM-NEXT:    [[ADD46_3:%.*]] = add i32 [[ADD43_4]], [[ADD31_3]]
-; UNALIGNED_VEC_MEM-NEXT:    [[ADD46_2:%.*]] = add i32 [[ADD43_2]], [[ADD31_2]]
+; UNALIGNED_VEC_MEM-NEXT:    [[ADD46_3:%.*]] = add i32 [[ADD43_3]], [[ADD31_4]]
+; UNALIGNED_VEC_MEM-NEXT:    [[ADD46_2:%.*]] = add i32 [[ADD43_5]], [[ADD31_5]]
 ; UNALIGNED_VEC_MEM-NEXT:    [[SUB59:%.*]] = sub i32 [[SUB45]], [[SUB47]]
 ; UNALIGNED_VEC_MEM-NEXT:    [[SUB59_1:%.*]] = sub i32 [[SUB45_1]], [[SUB47_1]]
 ; UNALIGNED_VEC_MEM-NEXT:    [[SUB59_3:%.*]] = sub i32 [[SUB45_3]], [[SUB47_3]]
@@ -599,11 +599,11 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP71:%.*]] = insertelement <16 x i32> [[TMP70]], i32 [[CONV9_2]], i32 5
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP72:%.*]] = insertelement <16 x i32> [[TMP71]], i32 [[SUB47_1]], i32 6
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP73:%.*]] = insertelement <16 x i32> [[TMP72]], i32 [[SUB47]], i32 7
-; UNALIGNED_VEC_MEM-NEXT:    [[TMP74:%.*]] = insertelement <16 x i32> [[TMP73]], i32 [[CONV_1]], i32 8
-; UNALIGNED_VEC_MEM-NEXT:    [[TMP75:%.*]] = insertelement <16 x i32> [[TMP74]], i32 [[CONV9_1]], i32 9
-; UNALIGNED_VEC_MEM-NEXT:    [[TMP76:%.*]] = insertelement <16 x i32> [[TMP75]], i32 [[CONV21_1]], i32 10
+; UNALIGNED_VEC_MEM-NEXT:    [[TMP74:%.*]] = insertelement <16 x i32> [[TMP73]], i32 [[CONV11_3]], i32 8
+; UNALIGNED_VEC_MEM-NEXT:    [[TMP75:%.*]] = insertelement <16 x i32> [[TMP74]], i32 [[CONV23_3]], i32 9
+; UNALIGNED_VEC_MEM-NEXT:    [[TMP76:%.*]] = insertelement <16 x i32> [[TMP75]], i32 [[CONV35_3]], i32 10
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP77:%.*]] = insertelement <16 x i32> [[TMP76]], i32 [[ADD44]], i32 11
-; UNALIGNED_VEC_MEM-NEXT:    [[TMP78:%.*]] = zext <4 x i8> [[TMP51]] to <4 x i32>
+; UNALIGNED_VEC_MEM-NEXT:    [[TMP78:%.*]] = zext <4 x i8> [[TMP26]] to <4 x i32>
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP79:%.*]] = shufflevector <4 x i32> [[TMP78]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP80:%.*]] = shufflevector <16 x i32> [[TMP77]], <16 x i32> [[TMP79]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
 ; UNALIGNED_VEC_MEM-NEXT:    [[TMP81:%.*]] = lshr <16 x i32> [[TMP80]], splat (i32 15)
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
index 1b9dbaca0a34d..68ffc15b063ba 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
@@ -9,62 +9,18 @@
 ;
 
 define double @dot4f64(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) {
-; SSE2-LABEL: @dot4f64(
-; SSE2-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4
-; SSE2-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4
-; SSE2-NEXT:    [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]]
-; SSE2-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP3]], i32 0
-; SSE2-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP3]], i32 1
-; SSE2-NEXT:    [[DOT01:%.*]] = fadd double [[TMP4]], [[TMP5]]
-; SSE2-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP3]], i32 2
-; SSE2-NEXT:    [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP6]]
-; SSE2-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP3]], i32 3
-; SSE2-NEXT:    [[DOT0123:%.*]] = fadd double [[DOT012]], [[TMP7]]
-; SSE2-NEXT:    ret double [[DOT0123]]
-;
-; SSE4-LABEL: @dot4f64(
-; SSE4-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4
-; SSE4-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4
-; SSE4-NEXT:    [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]]
-; SSE4-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP3]], i32 0
-; SSE4-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP3]], i32 1
-; SSE4-NEXT:    [[DOT01:%.*]] = fadd double [[TMP4]], [[TMP5]]
-; SSE4-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP3]], i32 2
-; SSE4-NEXT:    [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP6]]
-; SSE4-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP3]], i32 3
-; SSE4-NEXT:    [[DOT0123:%.*]] = fadd double [[DOT012]], [[TMP7]]
-; SSE4-NEXT:    ret double [[DOT0123]]
-;
-; AVX-LABEL: @dot4f64(
-; AVX-NEXT:    [[PTRX2:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 2
-; AVX-NEXT:    [[PTRY2:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 2
-; AVX-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX]], align 4
-; AVX-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY]], align 4
-; AVX-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
-; AVX-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[PTRX2]], align 4
-; AVX-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[PTRY2]], align 4
-; AVX-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[TMP4]], [[TMP5]]
-; AVX-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
-; AVX-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
-; AVX-NEXT:    [[DOT01:%.*]] = fadd double [[TMP7]], [[TMP8]]
-; AVX-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
-; AVX-NEXT:    [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP9]]
-; AVX-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
-; AVX-NEXT:    [[DOT0123:%.*]] = fadd double [[DOT012]], [[TMP10]]
-; AVX-NEXT:    ret double [[DOT0123]]
-;
-; AVX2-LABEL: @dot4f64(
-; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4
-; AVX2-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4
-; AVX2-NEXT:    [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]]
-; AVX2-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP3]], i32 0
-; AVX2-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP3]], i32 1
-; AVX2-NEXT:    [[DOT01:%.*]] = fadd double [[TMP4]], [[TMP5]]
-; AVX2-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP3]], i32 2
-; AVX2-NEXT:    [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP6]]
-; AVX2-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP3]], i32 3
-; AVX2-NEXT:    [[DOT0123:%.*]] = fadd double [[DOT012]], [[TMP7]]
-; AVX2-NEXT:    ret double [[DOT0123]]
+; CHECK-LABEL: @dot4f64(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP3]], i32 1
+; CHECK-NEXT:    [[DOT01:%.*]] = fadd double [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP3]], i32 2
+; CHECK-NEXT:    [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP3]], i32 3
+; CHECK-NEXT:    [[DOT0123:%.*]] = fadd double [[DOT012]], [[TMP7]]
+; CHECK-NEXT:    ret double [[DOT0123]]
 ;
   %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
   %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll b/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll
index 822f1051d45d6..1f617cf106b8b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll
@@ -445,73 +445,18 @@ define float @reduction_v4f32_nnan(ptr %p) {
 ; Negative test - must have nnan.
 
 define float @reduction_v4f32_not_fast(ptr %p) {
-; SSE-LABEL: @reduction_v4f32_not_fast(
-; SSE-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
-; SSE-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; SSE-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; SSE-NEXT:    [[M1:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP3]], float [[TMP2]])
-; SSE-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; SSE-NEXT:    [[M2:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP4]], float [[M1]])
-; SSE-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; SSE-NEXT:    [[M3:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP5]], float [[M2]])
-; SSE-NEXT:    ret float [[M3]]
-;
-; COREI7-LABEL: @reduction_v4f32_not_fast(
-; COREI7-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
-; COREI7-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; COREI7-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; COREI7-NEXT:    [[M1:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP3]], float [[TMP2]])
-; COREI7-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; COREI7-NEXT:    [[M2:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP4]], float [[M1]])
-; COREI7-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; COREI7-NEXT:    [[M3:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP5]], float [[M2]])
-; COREI7-NEXT:    ret float [[M3]]
-;
-; BDVER1-LABEL: @reduction_v4f32_not_fast(
-; BDVER1-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
-; BDVER1-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; BDVER1-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; BDVER1-NEXT:    [[M1:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP3]], float [[TMP2]])
-; BDVER1-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; BDVER1-NEXT:    [[M2:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP4]], float [[M1]])
-; BDVER1-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; BDVER1-NEXT:    [[M3:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP5]], float [[M2]])
-; BDVER1-NEXT:    ret float [[M3]]
-;
-; AVX2-LABEL: @reduction_v4f32_not_fast(
-; AVX2-NEXT:    [[G1:%.*]] = getelementptr inbounds float, ptr [[P:%.*]], i64 1
-; AVX2-NEXT:    [[G2:%.*]] = getelementptr inbounds float, ptr [[P]], i64 2
-; AVX2-NEXT:    [[G3:%.*]] = getelementptr inbounds float, ptr [[P]], i64 3
-; AVX2-NEXT:    [[T0:%.*]] = load float, ptr [[P]], align 4
-; AVX2-NEXT:    [[T1:%.*]] = load float, ptr [[G1]], align 4
-; AVX2-NEXT:    [[T2:%.*]] = load float, ptr [[G2]], align 4
-; AVX2-NEXT:    [[T3:%.*]] = load float, ptr [[G3]], align 4
-; AVX2-NEXT:    [[M1:%.*]] = tail call float @llvm.maxnum.f32(float [[T1]], float [[T0]])
-; AVX2-NEXT:    [[M2:%.*]] = tail call float @llvm.maxnum.f32(float [[T2]], float [[M1]])
-; AVX2-NEXT:    [[M3:%.*]] = tail call float @llvm.maxnum.f32(float [[T3]], float [[M2]])
-; AVX2-NEXT:    ret float [[M3]]
-;
-; AVX512-LABEL: @reduction_v4f32_not_fast(
-; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
-; AVX512-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; AVX512-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; AVX512-NEXT:    [[M1:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP3]], float [[TMP2]])
-; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; AVX512-NEXT:    [[M2:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP4]], float [[M1]])
-; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; AVX512-NEXT:    [[M3:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP5]], float [[M2]])
-; AVX512-NEXT:    ret float [[M3]]
-;
-; AVX256-LABEL: @reduction_v4f32_not_fast(
-; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
-; AVX256-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; AVX256-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; AVX256-NEXT:    [[M1:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP3]], float [[TMP2]])
-; AVX256-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; AVX256-NEXT:    [[M2:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP4]], float [[M1]])
-; AVX256-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; AVX256-NEXT:    [[M3:%.*]] = tail call float @llvm.maxnum.f32(float [[TMP5]], float [[M2]])
-; AVX256-NEXT:    ret float [[M3]]
+; CHECK-LABEL: @reduction_v4f32_not_fast(
+; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds float, ptr [[P:%.*]], i64 1
+; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds float, ptr [[P]], i64 2
+; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds float, ptr [[P]], i64 3
+; CHECK-NEXT:    [[T0:%.*]] = load float, ptr [[P]], align 4
+; CHECK-NEXT:    [[T1:%.*]] = load float, ptr [[G1]], align 4
+; CHECK-NEXT:    [[T2:%.*]] = load float, ptr [[G2]], align 4
+; CHECK-NEXT:    [[T3:%.*]] = load float, ptr [[G3]], align 4
+; CHECK-NEXT:    [[M1:%.*]] = tail call float @llvm.maxnum.f32(float [[T1]], float [[T0]])
+; CHECK-NEXT:    [[M2:%.*]] = tail call float @llvm.maxnum.f32(float [[T2]], float [[M1]])
+; CHECK-NEXT:    [[M3:%.*]] = tail call float @llvm.maxnum.f32(float [[T3]], float [[M2]])
+; CHECK-NEXT:    ret float [[M3]]
 ;
 ; PREF-AVX256-LABEL: @reduction_v4f32_not_fast(
 ; PREF-AVX256-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
@@ -604,77 +549,18 @@ define double @reduction_v4f64_fast(ptr %p) {
 ; Negative test - must have nnan.
 
 define double @reduction_v4f64_wrong_fmf(ptr %p) {
-; SSE-LABEL: @reduction_v4f64_wrong_fmf(
-; SSE-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[P:%.*]], align 4
-; SSE-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
-; SSE-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
-; SSE-NEXT:    [[M1:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[TMP3]], double [[TMP2]])
-; SSE-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
-; SSE-NEXT:    [[M2:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[TMP4]], double [[M1]])
-; SSE-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
-; SSE-NEXT:    [[M3:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[TMP5]], double [[M2]])
-; SSE-NEXT:    ret double [[M3]]
-;
-; COREI7-LABEL: @reduction_v4f64_wrong_fmf(
-; COREI7-NEXT:    [[G1:%.*]] = getelementptr inbounds double, ptr [[P:%.*]], i64 1
-; COREI7-NEXT:    [[G2:%.*]] = getelementptr inbounds double, ptr [[P]], i64 2
-; COREI7-NEXT:    [[G3:%.*]] = getelementptr inbounds double, ptr [[P]], i64 3
-; COREI7-NEXT:    [[T0:%.*]] = load double, ptr [[P]], align 4
-; COREI7-NEXT:    [[T1:%.*]] = load double, ptr [[G1]], align 4
-; COREI7-NEXT:    [[T2:%.*]] = load double, ptr [[G2]], align 4
-; COREI7-NEXT:    [[T3:%.*]] = load double, ptr [[G3]], align 4
-; COREI7-NEXT:    [[M1:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[T1]], double [[T0]])
-; COREI7-NEXT:    [[M2:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[T2]], double [[M1]])
-; COREI7-NEXT:    [[M3:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[T3]], double [[M2]])
-; COREI7-NEXT:    ret double [[M3]]
-;
-; BDVER1-LABEL: @reduction_v4f64_wrong_fmf(
-; BDVER1-NEXT:    [[G1:%.*]] = getelementptr inbounds double, ptr [[P:%.*]], i64 1
-; BDVER1-NEXT:    [[G2:%.*]] = getelementptr inbounds double, ptr [[P]], i64 2
-; BDVER1-NEXT:    [[G3:%.*]] = getelementptr inbounds double, ptr [[P]], i64 3
-; BDVER1-NEXT:    [[T0:%.*]] = load double, ptr [[P]], align 4
-; BDVER1-NEXT:    [[T1:%.*]] = load double, ptr [[G1]], align 4
-; BDVER1-NEXT:    [[T2:%.*]] = load double, ptr [[G2]], align 4
-; BDVER1-NEXT:    [[T3:%.*]] = load double, ptr [[G3]], align 4
-; BDVER1-NEXT:    [[M1:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[T1]], double [[T0]])
-; BDVER1-NEXT:    [[M2:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[T2]], double [[M1]])
-; BDVER1-NEXT:    [[M3:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[T3]], double [[M2]])
-; BDVER1-NEXT:    ret double [[M3]]
-;
-; AVX2-LABEL: @reduction_v4f64_wrong_fmf(
-; AVX2-NEXT:    [[G1:%.*]] = getelementptr inbounds double, ptr [[P:%.*]], i64 1
-; AVX2-NEXT:    [[G2:%.*]] = getelementptr inbounds double, ptr [[P]], i64 2
-; AVX2-NEXT:    [[G3:%.*]] = getelementptr inbounds double, ptr [[P]], i64 3
-; AVX2-NEXT:    [[T0:%.*]] = load double, ptr [[P]], align 4
-; AVX2-NEXT:    [[T1:%.*]] = load double, ptr [[G1]], align 4
-; AVX2-NEXT:    [[T2:%.*]] = load double, ptr [[G2]], align 4
-; AVX2-NEXT:    [[T3:%.*]] = load double, ptr [[G3]], align 4
-; AVX2-NEXT:    [[M1:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[T1]], double [[T0]])
-; AVX2-NEXT:    [[M2:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[T2]], double [[M1]])
-; AVX2-NEXT:    [[M3:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[T3]], double [[M2]])
-; AVX2-NEXT:    ret double [[M3]]
-;
-; AVX512-LABEL: @reduction_v4f64_wrong_fmf(
-; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[P:%.*]], align 4
-; AVX512-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
-; AVX512-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
-; AVX512-NEXT:    [[M1:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[TMP3]], double [[TMP2]])
-; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
-; AVX512-NEXT:    [[M2:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[TMP4]], double [[M1]])
-; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
-; AVX512-NEXT:    [[M3:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[TMP5]], double [[M2]])
-; AVX512-NEXT:    ret double [[M3]]
-;
-; AVX256-LABEL: @reduction_v4f64_wrong_fmf(
-; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[P:%.*]], align 4
-; AVX256-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
-; AVX256-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
-; AVX256-NEXT:    [[M1:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[TMP3]], double [[TMP2]])
-; AVX256-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
-; AVX256-NEXT:    [[M2:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[TMP4]], double [[M1]])
-; AVX256-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
-; AVX256-NEXT:    [[M3:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[TMP5]], double [[M2]])
-; AVX256-NEXT:    ret double [[M3]]
+; CHECK-LABEL: @reduction_v4f64_wrong_fmf(
+; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds double, ptr [[P:%.*]], i64 1
+; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds double, ptr [[P]], i64 2
+; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds double, ptr [[P]], i64 3
+; CHECK-NEXT:    [[T0:%.*]] = load double, ptr [[P]], align 4
+; CHECK-NEXT:    [[T1:%.*]] = load double, ptr [[G1]], align 4
+; CHECK-NEXT:    [[T2:%.*]] = load double, ptr [[G2]], align 4
+; CHECK-NEXT:    [[T3:%.*]] = load double, ptr [[G3]], align 4
+; CHECK-NEXT:    [[M1:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[T1]], double [[T0]])
+; CHECK-NEXT:    [[M2:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[T2]], double [[M1]])
+; CHECK-NEXT:    [[M3:%.*]] = tail call ninf nsz double @llvm.maxnum.f64(double [[T3]], double [[M2]])
+; CHECK-NEXT:    ret double [[M3]]
 ;
 ; PREF-AVX256-LABEL: @reduction_v4f64_wrong_fmf(
 ; PREF-AVX256-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[P:%.*]], align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll b/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll
index 5eb2328a3dc8f..d0580bdfcfe9d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll
@@ -445,73 +445,18 @@ define float @reduction_v4f32_nnan(ptr %p) {
 ; Negative test - must have nnan.
 
 define float @reduction_v4f32_wrong_fmf(ptr %p) {
-; SSE-LABEL: @reduction_v4f32_wrong_fmf(
-; SSE-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
-; SSE-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; SSE-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; SSE-NEXT:    [[M1:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP3]], float [[TMP2]])
-; SSE-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; SSE-NEXT:    [[M2:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP4]], float [[M1]])
-; SSE-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; SSE-NEXT:    [[M3:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP5]], float [[M2]])
-; SSE-NEXT:    ret float [[M3]]
-;
-; COREI7-LABEL: @reduction_v4f32_wrong_fmf(
-; COREI7-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
-; COREI7-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; COREI7-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; COREI7-NEXT:    [[M1:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP3]], float [[TMP2]])
-; COREI7-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; COREI7-NEXT:    [[M2:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP4]], float [[M1]])
-; COREI7-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; COREI7-NEXT:    [[M3:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP5]], float [[M2]])
-; COREI7-NEXT:    ret float [[M3]]
-;
-; BDVER1-LABEL: @reduction_v4f32_wrong_fmf(
-; BDVER1-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
-; BDVER1-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; BDVER1-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; BDVER1-NEXT:    [[M1:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP3]], float [[TMP2]])
-; BDVER1-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; BDVER1-NEXT:    [[M2:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP4]], float [[M1]])
-; BDVER1-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; BDVER1-NEXT:    [[M3:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP5]], float [[M2]])
-; BDVER1-NEXT:    ret float [[M3]]
-;
-; AVX2-LABEL: @reduction_v4f32_wrong_fmf(
-; AVX2-NEXT:    [[G1:%.*]] = getelementptr inbounds float, ptr [[P:%.*]], i64 1
-; AVX2-NEXT:    [[G2:%.*]] = getelementptr inbounds float, ptr [[P]], i64 2
-; AVX2-NEXT:    [[G3:%.*]] = getelementptr inbounds float, ptr [[P]], i64 3
-; AVX2-NEXT:    [[T0:%.*]] = load float, ptr [[P]], align 4
-; AVX2-NEXT:    [[T1:%.*]] = load float, ptr [[G1]], align 4
-; AVX2-NEXT:    [[T2:%.*]] = load float, ptr [[G2]], align 4
-; AVX2-NEXT:    [[T3:%.*]] = load float, ptr [[G3]], align 4
-; AVX2-NEXT:    [[M1:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[T1]], float [[T0]])
-; AVX2-NEXT:    [[M2:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[T2]], float [[M1]])
-; AVX2-NEXT:    [[M3:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[T3]], float [[M2]])
-; AVX2-NEXT:    ret float [[M3]]
-;
-; AVX512-LABEL: @reduction_v4f32_wrong_fmf(
-; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
-; AVX512-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; AVX512-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; AVX512-NEXT:    [[M1:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP3]], float [[TMP2]])
-; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; AVX512-NEXT:    [[M2:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP4]], float [[M1]])
-; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; AVX512-NEXT:    [[M3:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP5]], float [[M2]])
-; AVX512-NEXT:    ret float [[M3]]
-;
-; AVX256-LABEL: @reduction_v4f32_wrong_fmf(
-; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
-; AVX256-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
-; AVX256-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
-; AVX256-NEXT:    [[M1:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP3]], float [[TMP2]])
-; AVX256-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; AVX256-NEXT:    [[M2:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP4]], float [[M1]])
-; AVX256-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
-; AVX256-NEXT:    [[M3:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[TMP5]], float [[M2]])
-; AVX256-NEXT:    ret float [[M3]]
+; CHECK-LABEL: @reduction_v4f32_wrong_fmf(
+; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds float, ptr [[P:%.*]], i64 1
+; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds float, ptr [[P]], i64 2
+; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds float, ptr [[P]], i64 3
+; CHECK-NEXT:    [[T0:%.*]] = load float, ptr [[P]], align 4
+; CHECK-NEXT:    [[T1:%.*]] = load float, ptr [[G1]], align 4
+; CHECK-NEXT:    [[T2:%.*]] = load float, ptr [[G2]], align 4
+; CHECK-NEXT:    [[T3:%.*]] = load float, ptr [[G3]], align 4
+; CHECK-NEXT:    [[M1:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[T1]], float [[T0]])
+; CHECK-NEXT:    [[M2:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[T2]], float [[M1]])
+; CHECK-NEXT:    [[M3:%.*]] = tail call reassoc nsz float @llvm.minnum.f32(float [[T3]], float [[M2]])
+; CHECK-NEXT:    ret float [[M3]]
 ;
   %g1 = getelementptr inbounds float, ptr %p, i64 1
   %g2 = getelementptr inbounds float, ptr %p, i64 2
@@ -594,77 +539,18 @@ define double @reduction_v4f64_fast(ptr %p) {
 ; Negative test - must have nnan.
 
 define double @reduction_v4f64_not_fast(ptr %p) {
-; SSE-LABEL: @reduction_v4f64_not_fast(
-; SSE-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[P:%.*]], align 4
-; SSE-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
-; SSE-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
-; SSE-NEXT:    [[M1:%.*]] = tail call double @llvm.minnum.f64(double [[TMP3]], double [[TMP2]])
-; SSE-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
-; SSE-NEXT:    [[M2:%.*]] = tail call double @llvm.minnum.f64(double [[TMP4]], double [[M1]])
-; SSE-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
-; SSE-NEXT:    [[M3:%.*]] = tail call double @llvm.minnum.f64(double [[TMP5]], double [[M2]])
-; SSE-NEXT:    ret double [[M3]]
-;
-; COREI7-LABEL: @reduction_v4f64_not_fast(
-; COREI7-NEXT:    [[G1:%.*]] = getelementptr inbounds double, ptr [[P:%.*]], i64 1
-; COREI7-NEXT:    [[G2:%.*]] = getelementptr inbounds double, ptr [[P]], i64 2
-; COREI7-NEXT:    [[G3:%.*]] = getelementptr inbounds double, ptr [[P]], i64 3
-; COREI7-NEXT:    [[T0:%.*]] = load double, ptr [[P]], align 4
-; COREI7-NEXT:    [[T1:%.*]] = load double, ptr [[G1]], align 4
-; COREI7-NEXT:    [[T2:%.*]] = load double, ptr [[G2]], align 4
-; COREI7-NEXT:    [[T3:%.*]] = load double, ptr [[G3]], align 4
-; COREI7-NEXT:    [[M1:%.*]] = tail call double @llvm.minnum.f64(double [[T1]], double [[T0]])
-; COREI7-NEXT:    [[M2:%.*]] = tail call double @llvm.minnum.f64(double [[T2]], double [[M1]])
-; COREI7-NEXT:    [[M3:%.*]] = tail call double @llvm.minnum.f64(double [[T3]], double [[M2]])
-; COREI7-NEXT:    ret double [[M3]]
-;
-; BDVER1-LABEL: @reduction_v4f64_not_fast(
-; BDVER1-NEXT:    [[G1:%.*]] = getelementptr inbounds double, ptr [[P:%.*]], i64 1
-; BDVER1-NEXT:    [[G2:%.*]] = getelementptr inbounds double, ptr [[P]], i64 2
-; BDVER1-NEXT:    [[G3:%.*]] = getelementptr inbounds double, ptr [[P]], i64 3
-; BDVER1-NEXT:    [[T0:%.*]] = load double, ptr [[P]], align 4
-; BDVER1-NEXT:    [[T1:%.*]] = load double, ptr [[G1]], align 4
-; BDVER1-NEXT:    [[T2:%.*]] = load double, ptr [[G2]], align 4
-; BDVER1-NEXT:    [[T3:%.*]] = load double, ptr [[G3]], align 4
-; BDVER1-NEXT:    [[M1:%.*]] = tail call double @llvm.minnum.f64(double [[T1]], double [[T0]])
-; BDVER1-NEXT:    [[M2:%.*]] = tail call double @llvm.minnum.f64(double [[T2]], double [[M1]])
-; BDVER1-NEXT:    [[M3:%.*]] = tail call double @llvm.minnum.f64(double [[T3]], double [[M2]])
-; BDVER1-NEXT:    ret double [[M3]]
-;
-; AVX2-LABEL: @reduction_v4f64_not_fast(
-; AVX2-NEXT:    [[G1:%.*]] = getelementptr inbounds double, ptr [[P:%.*]], i64 1
-; AVX2-NEXT:    [[G2:%.*]] = getelementptr inbounds double, ptr [[P]], i64 2
-; AVX2-NEXT:    [[G3:%.*]] = getelementptr inbounds double, ptr [[P]], i64 3
-; AVX2-NEXT:    [[T0:%.*]] = load double, ptr [[P]], align 4
-; AVX2-NEXT:    [[T1:%.*]] = load double, ptr [[G1]], align 4
-; AVX2-NEXT:    [[T2:%.*]] = load double, ptr [[G2]], align 4
-; AVX2-NEXT:    [[T3:%.*]] = load double, ptr [[G3]], align 4
-; AVX2-NEXT:    [[M1:%.*]] = tail call double @llvm.minnum.f64(double [[T1]], double [[T0]])
-; AVX2-NEXT:    [[M2:%.*]] = tail call double @llvm.minnum.f64(double [[T2]], double [[M1]])
-; AVX2-NEXT:    [[M3:%.*]] = tail call double @llvm.minnum.f64(double [[T3]], double [[M2]])
-; AVX2-NEXT:    ret double [[M3]]
-;
-; AVX512-LABEL: @reduction_v4f64_not_fast(
-; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[P:%.*]], align 4
-; AVX512-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
-; AVX512-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
-; AVX512-NEXT:    [[M1:%.*]] = tail call double @llvm.minnum.f64(double [[TMP3]], double [[TMP2]])
-; AVX512-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
-; AVX512-NEXT:    [[M2:%.*]] = tail call double @llvm.minnum.f64(double [[TMP4]], double [[M1]])
-; AVX512-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
-; AVX512-NEXT:    [[M3:%.*]] = tail call double @llvm.minnum.f64(double [[TMP5]], double [[M2]])
-; AVX512-NEXT:    ret double [[M3]]
-;
-; AVX256-LABEL: @reduction_v4f64_not_fast(
-; AVX256-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[P:%.*]], align 4
-; AVX256-NEXT:    [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
-; AVX256-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
-; AVX256-NEXT:    [[M1:%.*]] = tail call double @llvm.minnum.f64(double [[TMP3]], double [[TMP2]])
-; AVX256-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
-; AVX256-NEXT:    [[M2:%.*]] = tail call double @llvm.minnum.f64(double [[TMP4]], double [[M1]])
-; AVX256-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
-; AVX256-NEXT:    [[M3:%.*]] = tail call double @llvm.minnum.f64(double [[TMP5]], double [[M2]])
-; AVX256-NEXT:    ret double [[M3]]
+; CHECK-LABEL: @reduction_v4f64_not_fast(
+; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds double, ptr [[P:%.*]], i64 1
+; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds double, ptr [[P]], i64 2
+; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds double, ptr [[P]], i64 3
+; CHECK-NEXT:    [[T0:%.*]] = load double, ptr [[P]], align 4
+; CHECK-NEXT:    [[T1:%.*]] = load double, ptr [[G1]], align 4
+; CHECK-NEXT:    [[T2:%.*]] = load double, ptr [[G2]], align 4
+; CHECK-NEXT:    [[T3:%.*]] = load double, ptr [[G3]], align 4
+; CHECK-NEXT:    [[M1:%.*]] = tail call double @llvm.minnum.f64(double [[T1]], double [[T0]])
+; CHECK-NEXT:    [[M2:%.*]] = tail call double @llvm.minnum.f64(double [[T2]], double [[M1]])
+; CHECK-NEXT:    [[M3:%.*]] = tail call double @llvm.minnum.f64(double [[T3]], double [[M2]])
+; CHECK-NEXT:    ret double [[M3]]
 ;
   %g1 = getelementptr inbounds double, ptr %p, i64 1
   %g2 = getelementptr inbounds double, ptr %p, i64 2
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
index b7bd3e41b0d29..b6f1659c1bc59 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
@@ -901,15 +901,15 @@ define i32 @maxi8_mutiple_uses2(i32) {
 ; DEFAULT-NEXT:    ret i32 [[TMP17]]
 ;
 ; THRESH-LABEL: @maxi8_mutiple_uses2(
-; THRESH-NEXT:    [[TMP2:%.*]] = load <2 x i32>, ptr @arr, align 16
-; THRESH-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
-; THRESH-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; THRESH-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr @arr, align 16
+; THRESH-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; THRESH-NEXT:    [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
 ; THRESH-NEXT:    [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]]
 ; THRESH-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP3]], i32 [[TMP4]]
-; THRESH-NEXT:    [[TMP7:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
+; THRESH-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
 ; THRESH-NEXT:    [[TMP8:%.*]] = icmp sgt i32 [[TMP6]], [[TMP7]]
 ; THRESH-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP6]], i32 [[TMP7]]
-; THRESH-NEXT:    [[TMP10:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4
+; THRESH-NEXT:    [[TMP10:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
 ; THRESH-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP9]], [[TMP10]]
 ; THRESH-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[TMP9]], i32 [[TMP10]]
 ; THRESH-NEXT:    [[TMP13:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
index d2e3d82cbe10a..a192808490511 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
@@ -635,15 +635,18 @@ define double @splat_loads(ptr %array1, ptr %array2, ptr %ptrA, ptr %ptrB) {
 ; SSE-LABEL: @splat_loads(
 ; SSE-NEXT:  entry:
 ; SSE-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAY1:%.*]], align 8
+; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ; SSE-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAY2:%.*]], align 8
-; SSE-NEXT:    [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; SSE-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP0]], [[TMP2]]
-; SSE-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]]
-; SSE-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP3]], [[TMP4]]
-; SSE-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
-; SSE-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
+; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
+; SSE-NEXT:    [[TMP4:%.*]] = fmul <4 x double> [[TMP2]], [[TMP3]]
+; SSE-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP4]], i32 0
+; SSE-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP4]], i32 2
 ; SSE-NEXT:    [[ADD3:%.*]] = fadd double [[TMP6]], [[TMP7]]
-; SSE-NEXT:    ret double [[ADD3]]
+; SSE-NEXT:    [[TMP9:%.*]] = extractelement <4 x double> [[TMP4]], i32 1
+; SSE-NEXT:    [[TMP8:%.*]] = extractelement <4 x double> [[TMP4]], i32 3
+; SSE-NEXT:    [[ADD2:%.*]] = fadd double [[TMP9]], [[TMP8]]
+; SSE-NEXT:    [[ADD4:%.*]] = fadd double [[ADD3]], [[ADD2]]
+; SSE-NEXT:    ret double [[ADD4]]
 ;
 ; AVX-LABEL: @splat_loads(
 ; AVX-NEXT:  entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/parent-node-split-non-schedulable.ll b/llvm/test/Transforms/SLPVectorizer/X86/parent-node-split-non-schedulable.ll
index df57d02d53fc2..22e9d97055f69 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/parent-node-split-non-schedulable.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/parent-node-split-non-schedulable.ll
@@ -44,7 +44,6 @@ define i32 @main(ptr %c, i32 %0, i1 %tobool4.not, i16 %1) {
 ; CHECK-NEXT:    [[TMP23:%.*]] = phi <8 x i32> [ [[TMP22]], %[[AH]] ], [ [[TMP20]], %[[IF_END14]] ]
 ; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <8 x i32> [[TMP23]], i32 5
 ; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <8 x i32> [[TMP23]], i32 7
-; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <8 x i32> [[TMP23]], <8 x i32> poison, <2 x i32> <i32 5, i32 7>
 ; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP25]], [[TMP26]]
 ; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <8 x i32> [[TMP23]], i32 4
 ; CHECK-NEXT:    [[TMP29:%.*]] = or i32 [[ADD]], [[TMP28]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
index caae1e3dc7da8..5fe325030966e 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
@@ -136,46 +136,55 @@ for.end:                                          ; preds = %for.body
 define float @foo3(ptr nocapture readonly %A) #0 {
 ; CHECK-LABEL: @foo3(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX1:%.*]], align 4
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[ARRAYIDX1:%.*]], align 4
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 3
+; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x float>, ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <4 x float> [[TMP14]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 0>
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP24]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP26]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[R_052:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP15:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[R_052:%.*]] = phi float [ [[TMP13]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi <2 x float> [ [[TMP0]], [[ENTRY]] ], [ [[TMP7:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = phi <4 x float> [ [[TMP10]], [[ENTRY]] ], [ [[TMP15:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 2
 ; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr [[ARRAYIDX14]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3
 ; CHECK-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 [[INDVARS_IV_NEXT]]
-; CHECK-NEXT:    [[TMP11:%.*]] = add nsw i64 [[INDVARS_IV]], 4
-; CHECK-NEXT:    [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 [[TMP11]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr [[ARRAYIDX24]], align 4
 ; CHECK-NEXT:    [[TMP7]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> poison, float [[TMP9]], i32 2
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> [[TMP12]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 4>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <4 x float> [[TMP21]], <4 x float> [[TMP13]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP14:%.*]] = fmul <4 x float> [[TMP22]], <float 7.000000e+00, float 8.000000e+00, float 9.000000e+00, float 1.000000e+01>
-; CHECK-NEXT:    [[TMP15]] = fadd <4 x float> [[TMP3]], [[TMP14]]
-; CHECK-NEXT:    [[MUL25:%.*]] = fmul float [[TMP8]], 1.100000e+01
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <8 x float> [[TMP27]], float [[TMP9]], i32 2
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <8 x float> [[TMP17]], <8 x float> [[TMP18]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 10, i32 6, i32 11>
+; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <8 x float> [[TMP28]], float [[R_052]], i32 6
+; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <8 x float> [[TMP29]], <8 x float> [[TMP21]], <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP23:%.*]] = fmul <8 x float> [[TMP22]], <float 7.000000e+00, float 8.000000e+00, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[MUL25:%.*]] = extractelement <8 x float> [[TMP23]], i32 1
 ; CHECK-NEXT:    [[ADD6]] = fadd float [[R_052]], [[MUL25]]
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <8 x float> [[TMP23]], <8 x float> poison, <4 x i32> <i32 4, i32 3, i32 0, i32 2>
+; CHECK-NEXT:    [[TMP15]] = fadd <4 x float> [[TMP12]], [[TMP25]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP16]], 121
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x float> [[TMP15]], i32 0
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x float> [[TMP15]], i32 1
-; CHECK-NEXT:    [[ADD29:%.*]] = fadd float [[TMP17]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x float> [[TMP15]], i32 2
-; CHECK-NEXT:    [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP19]]
+; CHECK-NEXT:    [[ADD30:%.*]] = fadd float [[TMP19]], [[ADD6]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x float> [[TMP15]], i32 3
 ; CHECK-NEXT:    [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP20]]
-; CHECK-NEXT:    [[ADD32:%.*]] = fadd float [[ADD31]], [[ADD6]]
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x float> [[TMP15]], i32 1
+; CHECK-NEXT:    [[ADD33:%.*]] = fadd float [[ADD31]], [[TMP30]]
+; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x float> [[TMP15]], i32 0
+; CHECK-NEXT:    [[ADD32:%.*]] = fadd float [[ADD33]], [[TMP31]]
 ; CHECK-NEXT:    ret float [[ADD32]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder-non-empty.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder-non-empty.ll
index 80bd8ae07e2e2..6ac77bd77fa02 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder-non-empty.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder-non-empty.ll
@@ -9,14 +9,13 @@ define double @test01() {
 ; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> align 8 [[TMP2]], <2 x i1> splat (i1 true), <2 x double> poison)
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x double> [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd double [[TMP5]], [[TMP5]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x double> [[TMP4]], i32 3
-; CHECK-NEXT:    [[TMP9:%.*]] = fadd double [[TMP8]], [[TMP7]]
-; CHECK-NEXT:    [[TMP10:%.*]] = fadd double 0.000000e+00, 0.000000e+00
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd double [[TMP5]], [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fadd double [[TMP11]], [[TMP9]]
-; CHECK-NEXT:    ret double [[TMP12]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fadd double [[TMP5]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP4]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd double [[TMP7]], [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd double 0.000000e+00, 0.000000e+00
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd double [[TMP5]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd double [[TMP10]], [[TMP8]]
+; CHECK-NEXT:    ret double [[TMP11]]
 ;
   %1 = load i32, ptr null, align 8
   %2 = load i32, ptr getelementptr inbounds (i32, ptr null, i32 1), align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll b/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll
index 92760b43e273b..c3dfaf81b97bb 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/used-reduced-op.ll
@@ -8,242 +8,139 @@
 define void @n() local_unnamed_addr #0 {
 ; CHECK-LABEL: @n(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr @k, align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr getelementptr inbounds ([8 x [4 x i32]], ptr @k, i64 0, i64 1, i64 0), align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr getelementptr inbounds ([8 x [4 x i32]], ptr @k, i64 0, i64 1, i64 1), align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([8 x [4 x i32]], ptr @k, i64 0, i64 1, i64 2), align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr getelementptr inbounds ([8 x [4 x i32]], ptr @k, i64 0, i64 1, i64 3), align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr getelementptr inbounds ([8 x [4 x i32]], ptr @k, i64 0, i64 2, i64 0), align 16
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr getelementptr inbounds ([8 x [4 x i32]], ptr @k, i64 0, i64 2, i64 1), align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr getelementptr inbounds ([8 x [4 x i32]], ptr @k, i64 0, i64 2, i64 2), align 8
-; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr getelementptr inbounds ([8 x [4 x i32]], ptr @k, i64 0, i64 2, i64 3), align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr getelementptr inbounds ([8 x [4 x i32]], ptr @k, i64 0, i64 3, i64 0), align 16
-; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr getelementptr inbounds ([8 x [4 x i32]], ptr @k, i64 0, i64 3, i64 1), align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr getelementptr inbounds ([8 x [4 x i32]], ptr @k, i64 0, i64 3, i64 2), align 8
-; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr getelementptr inbounds ([8 x [4 x i32]], ptr @k, i64 0, i64 3, i64 3), align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr getelementptr inbounds ([8 x [4 x i32]], ptr @k, i64 0, i64 4, i64 0), align 16
-; CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr getelementptr inbounds ([8 x [4 x i32]], ptr @k, i64 0, i64 4, i64 1), align 4
-; CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr getelementptr inbounds ([8 x [4 x i32]], ptr @k, i64 0, i64 4, i64 2), align 8
-; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr getelementptr inbounds ([8 x [4 x i32]], ptr @k, i64 0, i64 4, i64 3), align 4
-; CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr getelementptr inbounds ([8 x [4 x i32]], ptr @k, i64 0, i64 5, i64 0), align 16
-; CHECK-NEXT:    [[TMP18:%.*]] = load i32, ptr getelementptr inbounds ([8 x [4 x i32]], ptr @k, i64 0, i64 5, i64 1), align 4
-; CHECK-NEXT:    [[TMP19:%.*]] = load i32, ptr getelementptr inbounds ([8 x [4 x i32]], ptr @k, i64 0, i64 5, i64 2), align 8
-; CHECK-NEXT:    [[TMP20:%.*]] = load i32, ptr getelementptr inbounds ([8 x [4 x i32]], ptr @k, i64 0, i64 5, i64 3), align 4
-; CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr getelementptr inbounds ([8 x [4 x i32]], ptr @k, i64 0, i64 6, i64 0), align 16
-; CHECK-NEXT:    [[TMP22:%.*]] = load i32, ptr getelementptr inbounds ([8 x [4 x i32]], ptr @k, i64 0, i64 6, i64 1), align 4
-; CHECK-NEXT:    [[TMP23:%.*]] = load i32, ptr getelementptr inbounds ([8 x [4 x i32]], ptr @k, i64 0, i64 6, i64 2), align 8
-; CHECK-NEXT:    [[TMP24:%.*]] = load i32, ptr getelementptr inbounds ([8 x [4 x i32]], ptr @k, i64 0, i64 6, i64 3), align 4
-; CHECK-NEXT:    [[TMP25:%.*]] = load i32, ptr getelementptr inbounds ([8 x [4 x i32]], ptr @k, i64 0, i64 7, i64 0), align 16
-; CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr getelementptr inbounds ([8 x [4 x i32]], ptr @k, i64 0, i64 7, i64 1), align 4
-; CHECK-NEXT:    [[TMP27:%.*]] = load i32, ptr getelementptr inbounds ([8 x [4 x i32]], ptr @k, i64 0, i64 7, i64 2), align 8
-; CHECK-NEXT:    [[TMP28:%.*]] = load i32, ptr getelementptr inbounds ([8 x [4 x i32]], ptr @k, i64 0, i64 7, i64 3), align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load <32 x i32>, ptr @k, align 16
 ; CHECK-NEXT:    br label [[FOR_COND:%.*]]
 ; CHECK:       for.cond:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_COND]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[B_0:%.*]] = phi i32 [ [[SPEC_SELECT8_3_7:%.*]], [[FOR_COND]] ], [ undef, [[ENTRY]] ]
 ; CHECK-NEXT:    [[TMP29:%.*]] = trunc i64 [[INDVARS_IV]] to i32
 ; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP29]], -183
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <4 x i32> poison, i32 [[TMP30]], i32 0
-; CHECK-NEXT:    [[TMP32:%.*]] = shufflevector <4 x i32> [[TMP31]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP33:%.*]] = sub <4 x i32> [[TMP32]], [[TMP0]]
-; CHECK-NEXT:    [[TMP34:%.*]] = icmp slt <4 x i32> [[TMP33]], zeroinitializer
-; CHECK-NEXT:    [[TMP35:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP33]]
-; CHECK-NEXT:    [[TMP36:%.*]] = select <4 x i1> [[TMP34]], <4 x i32> [[TMP35]], <4 x i32> [[TMP33]]
-; CHECK-NEXT:    [[TMP37:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP36]])
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <32 x i32> poison, i32 [[TMP30]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i32> [[TMP3]], <32 x i32> poison, <32 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = sub <32 x i32> [[TMP4]], [[TMP0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp slt <32 x i32> [[TMP5]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = sub nsw <32 x i32> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = select <32 x i1> [[TMP6]], <32 x i32> [[TMP7]], <32 x i32> [[TMP5]]
+; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <32 x i32> [[TMP8]], i32 0
 ; CHECK-NEXT:    [[OP_RDX:%.*]] = icmp slt i32 [[TMP37]], [[B_0]]
 ; CHECK-NEXT:    [[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP37]], i32 [[B_0]]
-; CHECK-NEXT:    [[SUB_116:%.*]] = sub i32 [[TMP30]], [[TMP1]]
-; CHECK-NEXT:    [[TMP38:%.*]] = icmp slt i32 [[SUB_116]], 0
-; CHECK-NEXT:    [[NEG_117:%.*]] = sub nsw i32 0, [[SUB_116]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <32 x i32> [[TMP8]], i32 1
+; CHECK-NEXT:    [[CMP12_1:%.*]] = icmp slt i32 [[TMP10]], [[OP_RDX1]]
+; CHECK-NEXT:    [[SUB_116:%.*]] = select i1 [[CMP12_1]], i32 [[TMP10]], i32 [[OP_RDX1]]
+; CHECK-NEXT:    [[NEG_117:%.*]] = extractelement <32 x i32> [[TMP8]], i32 2
+; CHECK-NEXT:    [[TMP38:%.*]] = icmp slt i32 [[NEG_117]], [[SUB_116]]
 ; CHECK-NEXT:    [[TMP39:%.*]] = select i1 [[TMP38]], i32 [[NEG_117]], i32 [[SUB_116]]
-; CHECK-NEXT:    [[CMP12_118:%.*]] = icmp slt i32 [[TMP39]], [[OP_RDX1]]
-; CHECK-NEXT:    [[SPEC_SELECT8_120:%.*]] = select i1 [[CMP12_118]], i32 [[TMP39]], i32 [[OP_RDX1]]
-; CHECK-NEXT:    [[SUB_1_1:%.*]] = sub i32 [[TMP30]], [[TMP2]]
-; CHECK-NEXT:    [[TMP40:%.*]] = icmp slt i32 [[SUB_1_1]], 0
-; CHECK-NEXT:    [[NEG_1_1:%.*]] = sub nsw i32 0, [[SUB_1_1]]
-; CHECK-NEXT:    [[TMP41:%.*]] = select i1 [[TMP40]], i32 [[NEG_1_1]], i32 [[SUB_1_1]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <32 x i32> [[TMP8]], i32 3
+; CHECK-NEXT:    [[CMP12_3:%.*]] = icmp slt i32 [[TMP12]], [[TMP39]]
+; CHECK-NEXT:    [[SPEC_SELECT8_3:%.*]] = select i1 [[CMP12_3]], i32 [[TMP12]], i32 [[TMP39]]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <32 x i32> [[TMP8]], i32 4
+; CHECK-NEXT:    [[CMP12_118:%.*]] = icmp slt i32 [[TMP13]], [[SPEC_SELECT8_3]]
+; CHECK-NEXT:    [[SPEC_SELECT8_120:%.*]] = select i1 [[CMP12_118]], i32 [[TMP13]], i32 [[SPEC_SELECT8_3]]
+; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <32 x i32> [[TMP8]], i32 5
 ; CHECK-NEXT:    [[CMP12_1_1:%.*]] = icmp slt i32 [[TMP41]], [[SPEC_SELECT8_120]]
 ; CHECK-NEXT:    [[NARROW:%.*]] = or i1 [[CMP12_1_1]], [[CMP12_118]]
 ; CHECK-NEXT:    [[SPEC_SELECT8_1_1:%.*]] = select i1 [[CMP12_1_1]], i32 [[TMP41]], i32 [[SPEC_SELECT8_120]]
-; CHECK-NEXT:    [[SUB_2_1:%.*]] = sub i32 [[TMP30]], [[TMP3]]
-; CHECK-NEXT:    [[TMP42:%.*]] = icmp slt i32 [[SUB_2_1]], 0
-; CHECK-NEXT:    [[NEG_2_1:%.*]] = sub nsw i32 0, [[SUB_2_1]]
-; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[NEG_2_1]], i32 [[SUB_2_1]]
+; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <32 x i32> [[TMP8]], i32 6
 ; CHECK-NEXT:    [[CMP12_2_1:%.*]] = icmp slt i32 [[TMP43]], [[SPEC_SELECT8_1_1]]
 ; CHECK-NEXT:    [[NARROW34:%.*]] = or i1 [[CMP12_2_1]], [[NARROW]]
 ; CHECK-NEXT:    [[SPEC_SELECT8_2_1:%.*]] = select i1 [[CMP12_2_1]], i32 [[TMP43]], i32 [[SPEC_SELECT8_1_1]]
-; CHECK-NEXT:    [[SUB_3_1:%.*]] = sub i32 [[TMP30]], [[TMP4]]
-; CHECK-NEXT:    [[TMP44:%.*]] = icmp slt i32 [[SUB_3_1]], 0
-; CHECK-NEXT:    [[NEG_3_1:%.*]] = sub nsw i32 0, [[SUB_3_1]]
-; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[NEG_3_1]], i32 [[SUB_3_1]]
+; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <32 x i32> [[TMP8]], i32 7
 ; CHECK-NEXT:    [[CMP12_3_1:%.*]] = icmp slt i32 [[TMP45]], [[SPEC_SELECT8_2_1]]
 ; CHECK-NEXT:    [[NARROW35:%.*]] = or i1 [[CMP12_3_1]], [[NARROW34]]
 ; CHECK-NEXT:    [[SPEC_SELECT_3_1:%.*]] = zext i1 [[NARROW35]] to i32
 ; CHECK-NEXT:    [[SPEC_SELECT8_3_1:%.*]] = select i1 [[CMP12_3_1]], i32 [[TMP45]], i32 [[SPEC_SELECT8_2_1]]
-; CHECK-NEXT:    [[SUB_222:%.*]] = sub i32 [[TMP30]], [[TMP5]]
-; CHECK-NEXT:    [[TMP46:%.*]] = icmp slt i32 [[SUB_222]], 0
-; CHECK-NEXT:    [[NEG_223:%.*]] = sub nsw i32 0, [[SUB_222]]
-; CHECK-NEXT:    [[TMP47:%.*]] = select i1 [[TMP46]], i32 [[NEG_223]], i32 [[SUB_222]]
+; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <32 x i32> [[TMP8]], i32 8
 ; CHECK-NEXT:    [[CMP12_224:%.*]] = icmp slt i32 [[TMP47]], [[SPEC_SELECT8_3_1]]
 ; CHECK-NEXT:    [[SPEC_SELECT8_226:%.*]] = select i1 [[CMP12_224]], i32 [[TMP47]], i32 [[SPEC_SELECT8_3_1]]
-; CHECK-NEXT:    [[SUB_1_2:%.*]] = sub i32 [[TMP30]], [[TMP6]]
-; CHECK-NEXT:    [[TMP48:%.*]] = icmp slt i32 [[SUB_1_2]], 0
-; CHECK-NEXT:    [[NEG_1_2:%.*]] = sub nsw i32 0, [[SUB_1_2]]
-; CHECK-NEXT:    [[TMP49:%.*]] = select i1 [[TMP48]], i32 [[NEG_1_2]], i32 [[SUB_1_2]]
+; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <32 x i32> [[TMP8]], i32 9
 ; CHECK-NEXT:    [[CMP12_1_2:%.*]] = icmp slt i32 [[TMP49]], [[SPEC_SELECT8_226]]
 ; CHECK-NEXT:    [[TMP50:%.*]] = or i1 [[CMP12_1_2]], [[CMP12_224]]
 ; CHECK-NEXT:    [[SPEC_SELECT8_1_2:%.*]] = select i1 [[CMP12_1_2]], i32 [[TMP49]], i32 [[SPEC_SELECT8_226]]
-; CHECK-NEXT:    [[SUB_2_2:%.*]] = sub i32 [[TMP30]], [[TMP7]]
-; CHECK-NEXT:    [[TMP51:%.*]] = icmp slt i32 [[SUB_2_2]], 0
-; CHECK-NEXT:    [[NEG_2_2:%.*]] = sub nsw i32 0, [[SUB_2_2]]
-; CHECK-NEXT:    [[TMP52:%.*]] = select i1 [[TMP51]], i32 [[NEG_2_2]], i32 [[SUB_2_2]]
+; CHECK-NEXT:    [[TMP52:%.*]] = extractelement <32 x i32> [[TMP8]], i32 10
 ; CHECK-NEXT:    [[CMP12_2_2:%.*]] = icmp slt i32 [[TMP52]], [[SPEC_SELECT8_1_2]]
 ; CHECK-NEXT:    [[TMP53:%.*]] = or i1 [[CMP12_2_2]], [[TMP50]]
 ; CHECK-NEXT:    [[SPEC_SELECT8_2_2:%.*]] = select i1 [[CMP12_2_2]], i32 [[TMP52]], i32 [[SPEC_SELECT8_1_2]]
-; CHECK-NEXT:    [[SUB_3_2:%.*]] = sub i32 [[TMP30]], [[TMP8]]
-; CHECK-NEXT:    [[TMP54:%.*]] = icmp slt i32 [[SUB_3_2]], 0
-; CHECK-NEXT:    [[NEG_3_2:%.*]] = sub nsw i32 0, [[SUB_3_2]]
-; CHECK-NEXT:    [[TMP55:%.*]] = select i1 [[TMP54]], i32 [[NEG_3_2]], i32 [[SUB_3_2]]
+; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <32 x i32> [[TMP8]], i32 11
 ; CHECK-NEXT:    [[CMP12_3_2:%.*]] = icmp slt i32 [[TMP55]], [[SPEC_SELECT8_2_2]]
 ; CHECK-NEXT:    [[TMP56:%.*]] = or i1 [[CMP12_3_2]], [[TMP53]]
 ; CHECK-NEXT:    [[SPEC_SELECT_3_2:%.*]] = select i1 [[TMP56]], i32 2, i32 [[SPEC_SELECT_3_1]]
 ; CHECK-NEXT:    [[SPEC_SELECT8_3_2:%.*]] = select i1 [[CMP12_3_2]], i32 [[TMP55]], i32 [[SPEC_SELECT8_2_2]]
-; CHECK-NEXT:    [[SUB_328:%.*]] = sub i32 [[TMP30]], [[TMP9]]
-; CHECK-NEXT:    [[TMP57:%.*]] = icmp slt i32 [[SUB_328]], 0
-; CHECK-NEXT:    [[NEG_329:%.*]] = sub nsw i32 0, [[SUB_328]]
-; CHECK-NEXT:    [[TMP58:%.*]] = select i1 [[TMP57]], i32 [[NEG_329]], i32 [[SUB_328]]
+; CHECK-NEXT:    [[TMP58:%.*]] = extractelement <32 x i32> [[TMP8]], i32 12
 ; CHECK-NEXT:    [[CMP12_330:%.*]] = icmp slt i32 [[TMP58]], [[SPEC_SELECT8_3_2]]
 ; CHECK-NEXT:    [[SPEC_SELECT8_332:%.*]] = select i1 [[CMP12_330]], i32 [[TMP58]], i32 [[SPEC_SELECT8_3_2]]
-; CHECK-NEXT:    [[SUB_1_3:%.*]] = sub i32 [[TMP30]], [[TMP10]]
-; CHECK-NEXT:    [[TMP59:%.*]] = icmp slt i32 [[SUB_1_3]], 0
-; CHECK-NEXT:    [[NEG_1_3:%.*]] = sub nsw i32 0, [[SUB_1_3]]
-; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 [[NEG_1_3]], i32 [[SUB_1_3]]
+; CHECK-NEXT:    [[TMP60:%.*]] = extractelement <32 x i32> [[TMP8]], i32 13
 ; CHECK-NEXT:    [[CMP12_1_3:%.*]] = icmp slt i32 [[TMP60]], [[SPEC_SELECT8_332]]
 ; CHECK-NEXT:    [[TMP61:%.*]] = or i1 [[CMP12_1_3]], [[CMP12_330]]
 ; CHECK-NEXT:    [[SPEC_SELECT8_1_3:%.*]] = select i1 [[CMP12_1_3]], i32 [[TMP60]], i32 [[SPEC_SELECT8_332]]
-; CHECK-NEXT:    [[SUB_2_3:%.*]] = sub i32 [[TMP30]], [[TMP11]]
-; CHECK-NEXT:    [[TMP62:%.*]] = icmp slt i32 [[SUB_2_3]], 0
-; CHECK-NEXT:    [[NEG_2_3:%.*]] = sub nsw i32 0, [[SUB_2_3]]
-; CHECK-NEXT:    [[TMP63:%.*]] = select i1 [[TMP62]], i32 [[NEG_2_3]], i32 [[SUB_2_3]]
+; CHECK-NEXT:    [[TMP63:%.*]] = extractelement <32 x i32> [[TMP8]], i32 14
 ; CHECK-NEXT:    [[CMP12_2_3:%.*]] = icmp slt i32 [[TMP63]], [[SPEC_SELECT8_1_3]]
 ; CHECK-NEXT:    [[TMP64:%.*]] = or i1 [[CMP12_2_3]], [[TMP61]]
 ; CHECK-NEXT:    [[SPEC_SELECT8_2_3:%.*]] = select i1 [[CMP12_2_3]], i32 [[TMP63]], i32 [[SPEC_SELECT8_1_3]]
-; CHECK-NEXT:    [[SUB_3_3:%.*]] = sub i32 [[TMP30]], [[TMP12]]
-; CHECK-NEXT:    [[TMP65:%.*]] = icmp slt i32 [[SUB_3_3]], 0
-; CHECK-NEXT:    [[NEG_3_3:%.*]] = sub nsw i32 0, [[SUB_3_3]]
-; CHECK-NEXT:    [[TMP66:%.*]] = select i1 [[TMP65]], i32 [[NEG_3_3]], i32 [[SUB_3_3]]
+; CHECK-NEXT:    [[TMP66:%.*]] = extractelement <32 x i32> [[TMP8]], i32 15
 ; CHECK-NEXT:    [[CMP12_3_3:%.*]] = icmp slt i32 [[TMP66]], [[SPEC_SELECT8_2_3]]
 ; CHECK-NEXT:    [[TMP67:%.*]] = or i1 [[CMP12_3_3]], [[TMP64]]
 ; CHECK-NEXT:    [[SPEC_SELECT_3_3:%.*]] = select i1 [[TMP67]], i32 3, i32 [[SPEC_SELECT_3_2]]
 ; CHECK-NEXT:    [[SPEC_SELECT8_3_3:%.*]] = select i1 [[CMP12_3_3]], i32 [[TMP66]], i32 [[SPEC_SELECT8_2_3]]
-; CHECK-NEXT:    [[SUB_4:%.*]] = sub i32 [[TMP30]], [[TMP13]]
-; CHECK-NEXT:    [[TMP68:%.*]] = icmp slt i32 [[SUB_4]], 0
-; CHECK-NEXT:    [[NEG_4:%.*]] = sub nsw i32 0, [[SUB_4]]
-; CHECK-NEXT:    [[TMP69:%.*]] = select i1 [[TMP68]], i32 [[NEG_4]], i32 [[SUB_4]]
+; CHECK-NEXT:    [[TMP69:%.*]] = extractelement <32 x i32> [[TMP8]], i32 16
 ; CHECK-NEXT:    [[CMP12_4:%.*]] = icmp slt i32 [[TMP69]], [[SPEC_SELECT8_3_3]]
 ; CHECK-NEXT:    [[SPEC_SELECT8_4:%.*]] = select i1 [[CMP12_4]], i32 [[TMP69]], i32 [[SPEC_SELECT8_3_3]]
-; CHECK-NEXT:    [[SUB_1_4:%.*]] = sub i32 [[TMP30]], [[TMP14]]
-; CHECK-NEXT:    [[TMP70:%.*]] = icmp slt i32 [[SUB_1_4]], 0
-; CHECK-NEXT:    [[NEG_1_4:%.*]] = sub nsw i32 0, [[SUB_1_4]]
-; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[NEG_1_4]], i32 [[SUB_1_4]]
+; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <32 x i32> [[TMP8]], i32 17
 ; CHECK-NEXT:    [[CMP12_1_4:%.*]] = icmp slt i32 [[TMP71]], [[SPEC_SELECT8_4]]
 ; CHECK-NEXT:    [[TMP72:%.*]] = or i1 [[CMP12_1_4]], [[CMP12_4]]
 ; CHECK-NEXT:    [[SPEC_SELECT8_1_4:%.*]] = select i1 [[CMP12_1_4]], i32 [[TMP71]], i32 [[SPEC_SELECT8_4]]
-; CHECK-NEXT:    [[SUB_2_4:%.*]] = sub i32 [[TMP30]], [[TMP15]]
-; CHECK-NEXT:    [[TMP73:%.*]] = icmp slt i32 [[SUB_2_4]], 0
-; CHECK-NEXT:    [[NEG_2_4:%.*]] = sub nsw i32 0, [[SUB_2_4]]
-; CHECK-NEXT:    [[TMP74:%.*]] = select i1 [[TMP73]], i32 [[NEG_2_4]], i32 [[SUB_2_4]]
+; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <32 x i32> [[TMP8]], i32 18
 ; CHECK-NEXT:    [[CMP12_2_4:%.*]] = icmp slt i32 [[TMP74]], [[SPEC_SELECT8_1_4]]
 ; CHECK-NEXT:    [[TMP75:%.*]] = or i1 [[CMP12_2_4]], [[TMP72]]
 ; CHECK-NEXT:    [[SPEC_SELECT8_2_4:%.*]] = select i1 [[CMP12_2_4]], i32 [[TMP74]], i32 [[SPEC_SELECT8_1_4]]
-; CHECK-NEXT:    [[SUB_3_4:%.*]] = sub i32 [[TMP30]], [[TMP16]]
-; CHECK-NEXT:    [[TMP76:%.*]] = icmp slt i32 [[SUB_3_4]], 0
-; CHECK-NEXT:    [[NEG_3_4:%.*]] = sub nsw i32 0, [[SUB_3_4]]
-; CHECK-NEXT:    [[TMP77:%.*]] = select i1 [[TMP76]], i32 [[NEG_3_4]], i32 [[SUB_3_4]]
+; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <32 x i32> [[TMP8]], i32 19
 ; CHECK-NEXT:    [[CMP12_3_4:%.*]] = icmp slt i32 [[TMP77]], [[SPEC_SELECT8_2_4]]
 ; CHECK-NEXT:    [[TMP78:%.*]] = or i1 [[CMP12_3_4]], [[TMP75]]
 ; CHECK-NEXT:    [[SPEC_SELECT_3_4:%.*]] = select i1 [[TMP78]], i32 4, i32 [[SPEC_SELECT_3_3]]
 ; CHECK-NEXT:    [[SPEC_SELECT8_3_4:%.*]] = select i1 [[CMP12_3_4]], i32 [[TMP77]], i32 [[SPEC_SELECT8_2_4]]
-; CHECK-NEXT:    [[SUB_5:%.*]] = sub i32 [[TMP30]], [[TMP17]]
-; CHECK-NEXT:    [[TMP79:%.*]] = icmp slt i32 [[SUB_5]], 0
-; CHECK-NEXT:    [[NEG_5:%.*]] = sub nsw i32 0, [[SUB_5]]
-; CHECK-NEXT:    [[TMP80:%.*]] = select i1 [[TMP79]], i32 [[NEG_5]], i32 [[SUB_5]]
+; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <32 x i32> [[TMP8]], i32 20
 ; CHECK-NEXT:    [[CMP12_5:%.*]] = icmp slt i32 [[TMP80]], [[SPEC_SELECT8_3_4]]
 ; CHECK-NEXT:    [[SPEC_SELECT8_5:%.*]] = select i1 [[CMP12_5]], i32 [[TMP80]], i32 [[SPEC_SELECT8_3_4]]
-; CHECK-NEXT:    [[SUB_1_5:%.*]] = sub i32 [[TMP30]], [[TMP18]]
-; CHECK-NEXT:    [[TMP81:%.*]] = icmp slt i32 [[SUB_1_5]], 0
-; CHECK-NEXT:    [[NEG_1_5:%.*]] = sub nsw i32 0, [[SUB_1_5]]
-; CHECK-NEXT:    [[TMP82:%.*]] = select i1 [[TMP81]], i32 [[NEG_1_5]], i32 [[SUB_1_5]]
+; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <32 x i32> [[TMP8]], i32 21
 ; CHECK-NEXT:    [[CMP12_1_5:%.*]] = icmp slt i32 [[TMP82]], [[SPEC_SELECT8_5]]
 ; CHECK-NEXT:    [[TMP83:%.*]] = or i1 [[CMP12_1_5]], [[CMP12_5]]
 ; CHECK-NEXT:    [[SPEC_SELECT8_1_5:%.*]] = select i1 [[CMP12_1_5]], i32 [[TMP82]], i32 [[SPEC_SELECT8_5]]
-; CHECK-NEXT:    [[SUB_2_5:%.*]] = sub i32 [[TMP30]], [[TMP19]]
-; CHECK-NEXT:    [[TMP84:%.*]] = icmp slt i32 [[SUB_2_5]], 0
-; CHECK-NEXT:    [[NEG_2_5:%.*]] = sub nsw i32 0, [[SUB_2_5]]
-; CHECK-NEXT:    [[TMP85:%.*]] = select i1 [[TMP84]], i32 [[NEG_2_5]], i32 [[SUB_2_5]]
+; CHECK-NEXT:    [[TMP85:%.*]] = extractelement <32 x i32> [[TMP8]], i32 22
 ; CHECK-NEXT:    [[CMP12_2_5:%.*]] = icmp slt i32 [[TMP85]], [[SPEC_SELECT8_1_5]]
 ; CHECK-NEXT:    [[TMP86:%.*]] = or i1 [[CMP12_2_5]], [[TMP83]]
 ; CHECK-NEXT:    [[SPEC_SELECT8_2_5:%.*]] = select i1 [[CMP12_2_5]], i32 [[TMP85]], i32 [[SPEC_SELECT8_1_5]]
-; CHECK-NEXT:    [[SUB_3_5:%.*]] = sub i32 [[TMP30]], [[TMP20]]
-; CHECK-NEXT:    [[TMP87:%.*]] = icmp slt i32 [[SUB_3_5]], 0
-; CHECK-NEXT:    [[NEG_3_5:%.*]] = sub nsw i32 0, [[SUB_3_5]]
-; CHECK-NEXT:    [[TMP88:%.*]] = select i1 [[TMP87]], i32 [[NEG_3_5]], i32 [[SUB_3_5]]
+; CHECK-NEXT:    [[TMP88:%.*]] = extractelement <32 x i32> [[TMP8]], i32 23
 ; CHECK-NEXT:    [[CMP12_3_5:%.*]] = icmp slt i32 [[TMP88]], [[SPEC_SELECT8_2_5]]
 ; CHECK-NEXT:    [[TMP89:%.*]] = or i1 [[CMP12_3_5]], [[TMP86]]
 ; CHECK-NEXT:    [[SPEC_SELECT_3_5:%.*]] = select i1 [[TMP89]], i32 5, i32 [[SPEC_SELECT_3_4]]
 ; CHECK-NEXT:    [[SPEC_SELECT8_3_5:%.*]] = select i1 [[CMP12_3_5]], i32 [[TMP88]], i32 [[SPEC_SELECT8_2_5]]
-; CHECK-NEXT:    [[SUB_6:%.*]] = sub i32 [[TMP30]], [[TMP21]]
-; CHECK-NEXT:    [[TMP90:%.*]] = icmp slt i32 [[SUB_6]], 0
-; CHECK-NEXT:    [[NEG_6:%.*]] = sub nsw i32 0, [[SUB_6]]
-; CHECK-NEXT:    [[TMP91:%.*]] = select i1 [[TMP90]], i32 [[NEG_6]], i32 [[SUB_6]]
+; CHECK-NEXT:    [[TMP91:%.*]] = extractelement <32 x i32> [[TMP8]], i32 24
 ; CHECK-NEXT:    [[CMP12_6:%.*]] = icmp slt i32 [[TMP91]], [[SPEC_SELECT8_3_5]]
 ; CHECK-NEXT:    [[SPEC_SELECT8_6:%.*]] = select i1 [[CMP12_6]], i32 [[TMP91]], i32 [[SPEC_SELECT8_3_5]]
-; CHECK-NEXT:    [[SUB_1_6:%.*]] = sub i32 [[TMP30]], [[TMP22]]
-; CHECK-NEXT:    [[TMP92:%.*]] = icmp slt i32 [[SUB_1_6]], 0
-; CHECK-NEXT:    [[NEG_1_6:%.*]] = sub nsw i32 0, [[SUB_1_6]]
-; CHECK-NEXT:    [[TMP93:%.*]] = select i1 [[TMP92]], i32 [[NEG_1_6]], i32 [[SUB_1_6]]
+; CHECK-NEXT:    [[TMP93:%.*]] = extractelement <32 x i32> [[TMP8]], i32 25
 ; CHECK-NEXT:    [[CMP12_1_6:%.*]] = icmp slt i32 [[TMP93]], [[SPEC_SELECT8_6]]
 ; CHECK-NEXT:    [[TMP94:%.*]] = or i1 [[CMP12_1_6]], [[CMP12_6]]
 ; CHECK-NEXT:    [[SPEC_SELECT8_1_6:%.*]] = select i1 [[CMP12_1_6]], i32 [[TMP93]], i32 [[SPEC_SELECT8_6]]
-; CHECK-NEXT:    [[SUB_2_6:%.*]] = sub i32 [[TMP30]], [[TMP23]]
-; CHECK-NEXT:    [[TMP95:%.*]] = icmp slt i32 [[SUB_2_6]], 0
-; CHECK-NEXT:    [[NEG_2_6:%.*]] = sub nsw i32 0, [[SUB_2_6]]
-; CHECK-NEXT:    [[TMP96:%.*]] = select i1 [[TMP95]], i32 [[NEG_2_6]], i32 [[SUB_2_6]]
+; CHECK-NEXT:    [[TMP96:%.*]] = extractelement <32 x i32> [[TMP8]], i32 26
 ; CHECK-NEXT:    [[CMP12_2_6:%.*]] = icmp slt i32 [[TMP96]], [[SPEC_SELECT8_1_6]]
 ; CHECK-NEXT:    [[TMP97:%.*]] = or i1 [[CMP12_2_6]], [[TMP94]]
 ; CHECK-NEXT:    [[SPEC_SELECT8_2_6:%.*]] = select i1 [[CMP12_2_6]], i32 [[TMP96]], i32 [[SPEC_SELECT8_1_6]]
-; CHECK-NEXT:    [[SUB_3_6:%.*]] = sub i32 [[TMP30]], [[TMP24]]
-; CHECK-NEXT:    [[TMP98:%.*]] = icmp slt i32 [[SUB_3_6]], 0
-; CHECK-NEXT:    [[NEG_3_6:%.*]] = sub nsw i32 0, [[SUB_3_6]]
-; CHECK-NEXT:    [[TMP99:%.*]] = select i1 [[TMP98]], i32 [[NEG_3_6]], i32 [[SUB_3_6]]
+; CHECK-NEXT:    [[TMP99:%.*]] = extractelement <32 x i32> [[TMP8]], i32 27
 ; CHECK-NEXT:    [[CMP12_3_6:%.*]] = icmp slt i32 [[TMP99]], [[SPEC_SELECT8_2_6]]
 ; CHECK-NEXT:    [[TMP100:%.*]] = or i1 [[CMP12_3_6]], [[TMP97]]
 ; CHECK-NEXT:    [[SPEC_SELECT_3_6:%.*]] = select i1 [[TMP100]], i32 6, i32 [[SPEC_SELECT_3_5]]
 ; CHECK-NEXT:    [[SPEC_SELECT8_3_6:%.*]] = select i1 [[CMP12_3_6]], i32 [[TMP99]], i32 [[SPEC_SELECT8_2_6]]
-; CHECK-NEXT:    [[SUB_7:%.*]] = sub i32 [[TMP30]], [[TMP25]]
-; CHECK-NEXT:    [[TMP101:%.*]] = icmp slt i32 [[SUB_7]], 0
-; CHECK-NEXT:    [[NEG_7:%.*]] = sub nsw i32 0, [[SUB_7]]
-; CHECK-NEXT:    [[TMP102:%.*]] = select i1 [[TMP101]], i32 [[NEG_7]], i32 [[SUB_7]]
+; CHECK-NEXT:    [[TMP102:%.*]] = extractelement <32 x i32> [[TMP8]], i32 28
 ; CHECK-NEXT:    [[CMP12_7:%.*]] = icmp slt i32 [[TMP102]], [[SPEC_SELECT8_3_6]]
 ; CHECK-NEXT:    [[SPEC_SELECT8_7:%.*]] = select i1 [[CMP12_7]], i32 [[TMP102]], i32 [[SPEC_SELECT8_3_6]]
-; CHECK-NEXT:    [[SUB_1_7:%.*]] = sub i32 [[TMP30]], [[TMP26]]
-; CHECK-NEXT:    [[TMP103:%.*]] = icmp slt i32 [[SUB_1_7]], 0
-; CHECK-NEXT:    [[NEG_1_7:%.*]] = sub nsw i32 0, [[SUB_1_7]]
-; CHECK-NEXT:    [[TMP104:%.*]] = select i1 [[TMP103]], i32 [[NEG_1_7]], i32 [[SUB_1_7]]
+; CHECK-NEXT:    [[TMP104:%.*]] = extractelement <32 x i32> [[TMP8]], i32 29
 ; CHECK-NEXT:    [[CMP12_1_7:%.*]] = icmp slt i32 [[TMP104]], [[SPEC_SELECT8_7]]
 ; CHECK-NEXT:    [[TMP105:%.*]] = or i1 [[CMP12_1_7]], [[CMP12_7]]
 ; CHECK-NEXT:    [[SPEC_SELECT8_1_7:%.*]] = select i1 [[CMP12_1_7]], i32 [[TMP104]], i32 [[SPEC_SELECT8_7]]
-; CHECK-NEXT:    [[SUB_2_7:%.*]] = sub i32 [[TMP30]], [[TMP27]]
-; CHECK-NEXT:    [[TMP106:%.*]] = icmp slt i32 [[SUB_2_7]], 0
-; CHECK-NEXT:    [[NEG_2_7:%.*]] = sub nsw i32 0, [[SUB_2_7]]
-; CHECK-NEXT:    [[TMP107:%.*]] = select i1 [[TMP106]], i32 [[NEG_2_7]], i32 [[SUB_2_7]]
+; CHECK-NEXT:    [[TMP107:%.*]] = extractelement <32 x i32> [[TMP8]], i32 30
 ; CHECK-NEXT:    [[CMP12_2_7:%.*]] = icmp slt i32 [[TMP107]], [[SPEC_SELECT8_1_7]]
 ; CHECK-NEXT:    [[TMP108:%.*]] = or i1 [[CMP12_2_7]], [[TMP105]]
 ; CHECK-NEXT:    [[SPEC_SELECT8_2_7:%.*]] = select i1 [[CMP12_2_7]], i32 [[TMP107]], i32 [[SPEC_SELECT8_1_7]]
-; CHECK-NEXT:    [[SUB_3_7:%.*]] = sub i32 [[TMP30]], [[TMP28]]
-; CHECK-NEXT:    [[TMP109:%.*]] = icmp slt i32 [[SUB_3_7]], 0
-; CHECK-NEXT:    [[NEG_3_7:%.*]] = sub nsw i32 0, [[SUB_3_7]]
-; CHECK-NEXT:    [[TMP110:%.*]] = select i1 [[TMP109]], i32 [[NEG_3_7]], i32 [[SUB_3_7]]
+; CHECK-NEXT:    [[TMP110:%.*]] = extractelement <32 x i32> [[TMP8]], i32 31
 ; CHECK-NEXT:    [[CMP12_3_7:%.*]] = icmp slt i32 [[TMP110]], [[SPEC_SELECT8_2_7]]
 ; CHECK-NEXT:    [[TMP111:%.*]] = or i1 [[CMP12_3_7]], [[TMP108]]
 ; CHECK-NEXT:    [[SPEC_SELECT_3_7:%.*]] = select i1 [[TMP111]], i32 7, i32 [[SPEC_SELECT_3_6]]
diff --git a/llvm/test/Transforms/SLPVectorizer/buildvector-nodes-dependency.ll b/llvm/test/Transforms/SLPVectorizer/buildvector-nodes-dependency.ll
index 36abe96567bb2..7d014bcdfdf16 100644
--- a/llvm/test/Transforms/SLPVectorizer/buildvector-nodes-dependency.ll
+++ b/llvm/test/Transforms/SLPVectorizer/buildvector-nodes-dependency.ll
@@ -1,41 +1,81 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
-; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=x86_64 < %s | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %}
+; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=x86_64 < %s | FileCheck %s --check-prefix=X86 %}
+; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefix=AARCH64 %}
 
 define double @test() {
-; CHECK-LABEL: define double @test() {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load double, ptr null, align 8
-; CHECK-NEXT:    br label [[COND_TRUE:%.*]]
-; CHECK:       cond.true:
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> <double 0.000000e+00, double poison>, double [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x double> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP3]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x double> [[TMP6]], zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = fsub <2 x double> [[TMP7]], zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = fmul <2 x double> [[TMP7]], zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> zeroinitializer, [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fmul <2 x double> zeroinitializer, [[TMP10]]
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP14:%.*]] = fsub <2 x double> [[TMP13]], [[TMP2]]
-; CHECK-NEXT:    [[TMP15:%.*]] = fadd <2 x double> [[TMP13]], [[TMP2]]
-; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <2 x double> [[TMP14]], <2 x double> [[TMP15]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    [[TMP17:%.*]] = fsub <2 x double> [[TMP16]], zeroinitializer
-; CHECK-NEXT:    [[TMP18:%.*]] = fmul <2 x double> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP19:%.*]] = fmul <2 x double> zeroinitializer, [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = fadd <2 x double> [[TMP19]], [[TMP17]]
-; CHECK-NEXT:    [[TMP21:%.*]] = fsub <2 x double> [[TMP20]], zeroinitializer
-; CHECK-NEXT:    [[TMP22:%.*]] = fmul <2 x double> [[TMP5]], zeroinitializer
-; CHECK-NEXT:    [[TMP23:%.*]] = fmul <2 x double> zeroinitializer, [[TMP22]]
-; CHECK-NEXT:    [[TMP24:%.*]] = fadd <2 x double> [[TMP23]], [[TMP21]]
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <2 x double> [[TMP24]], i32 0
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <2 x double> [[TMP24]], i32 1
-; CHECK-NEXT:    [[ADD29:%.*]] = fadd double [[TMP25]], [[TMP26]]
-; CHECK-NEXT:    ret double [[ADD29]]
+; X86-LABEL: define double @test() {
+; X86-NEXT:  entry:
+; X86-NEXT:    [[TMP0:%.*]] = load double, ptr null, align 8
+; X86-NEXT:    br label [[COND_TRUE:%.*]]
+; X86:       cond.true:
+; X86-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> <double 0.000000e+00, double poison>, double [[TMP0]], i32 1
+; X86-NEXT:    [[TMP2:%.*]] = fmul <2 x double> zeroinitializer, [[TMP1]]
+; X86-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
+; X86-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], zeroinitializer
+; X86-NEXT:    [[TMP5:%.*]] = fsub <2 x double> [[TMP4]], zeroinitializer
+; X86-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[TMP4]], zeroinitializer
+; X86-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <2 x i32> <i32 0, i32 3>
+; X86-NEXT:    [[TMP8:%.*]] = fadd <2 x double> zeroinitializer, [[TMP7]]
+; X86-NEXT:    [[TMP9:%.*]] = fmul <2 x double> zeroinitializer, [[TMP7]]
+; X86-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> <i32 0, i32 3>
+; X86-NEXT:    [[TMP11:%.*]] = fsub <2 x double> [[TMP10]], [[TMP2]]
+; X86-NEXT:    [[TMP12:%.*]] = fadd <2 x double> [[TMP10]], [[TMP2]]
+; X86-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 3>
+; X86-NEXT:    [[TMP14:%.*]] = fsub <2 x double> [[TMP13]], zeroinitializer
+; X86-NEXT:    [[TMP15:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; X86-NEXT:    [[TMP16:%.*]] = fmul <4 x double> [[TMP15]], <double 0.000000e+00, double 0.000000e+00, double 1.000000e+00, double 1.000000e+00>
+; X86-NEXT:    [[TMP17:%.*]] = fmul <4 x double> [[TMP16]], <double 0.000000e+00, double 0.000000e+00, double 1.000000e+00, double 1.000000e+00>
+; X86-NEXT:    [[TMP18:%.*]] = fmul <4 x double> zeroinitializer, [[TMP17]]
+; X86-NEXT:    [[TMP19:%.*]] = shufflevector <2 x double> [[TMP14]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; X86-NEXT:    [[TMP20:%.*]] = shufflevector <4 x double> <double poison, double poison, double 0.000000e+00, double 0.000000e+00>, <4 x double> [[TMP19]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; X86-NEXT:    [[TMP21:%.*]] = fadd <4 x double> [[TMP18]], [[TMP20]]
+; X86-NEXT:    [[TMP22:%.*]] = fmul <4 x double> [[TMP18]], [[TMP20]]
+; X86-NEXT:    [[TMP23:%.*]] = shufflevector <4 x double> [[TMP21]], <4 x double> [[TMP22]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; X86-NEXT:    [[TMP24:%.*]] = fsub <4 x double> [[TMP23]], zeroinitializer
+; X86-NEXT:    [[TMP25:%.*]] = fmul <4 x double> [[TMP23]], zeroinitializer
+; X86-NEXT:    [[TMP26:%.*]] = shufflevector <4 x double> [[TMP24]], <4 x double> [[TMP25]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; X86-NEXT:    [[TMP27:%.*]] = extractelement <4 x double> [[TMP26]], i32 1
+; X86-NEXT:    [[TMP28:%.*]] = extractelement <4 x double> [[TMP26]], i32 3
+; X86-NEXT:    [[ADD27:%.*]] = fadd double [[TMP28]], [[TMP27]]
+; X86-NEXT:    [[TMP29:%.*]] = extractelement <4 x double> [[TMP26]], i32 0
+; X86-NEXT:    [[TMP30:%.*]] = extractelement <4 x double> [[TMP26]], i32 2
+; X86-NEXT:    [[ADD12:%.*]] = fadd double [[TMP30]], [[TMP29]]
+; X86-NEXT:    [[ADD29:%.*]] = fadd double [[ADD12]], [[ADD27]]
+; X86-NEXT:    ret double [[ADD29]]
+;
+; AARCH64-LABEL: define double @test() {
+; AARCH64-NEXT:  entry:
+; AARCH64-NEXT:    [[TMP0:%.*]] = load double, ptr null, align 8
+; AARCH64-NEXT:    br label [[COND_TRUE:%.*]]
+; AARCH64:       cond.true:
+; AARCH64-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> <double 0.000000e+00, double poison>, double [[TMP0]], i32 1
+; AARCH64-NEXT:    [[TMP2:%.*]] = fmul <2 x double> zeroinitializer, [[TMP1]]
+; AARCH64-NEXT:    [[TMP3:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; AARCH64-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], zeroinitializer
+; AARCH64-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP3]], zeroinitializer
+; AARCH64-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
+; AARCH64-NEXT:    [[TMP7:%.*]] = fmul <2 x double> [[TMP6]], zeroinitializer
+; AARCH64-NEXT:    [[TMP8:%.*]] = fsub <2 x double> [[TMP7]], zeroinitializer
+; AARCH64-NEXT:    [[TMP9:%.*]] = fmul <2 x double> [[TMP7]], zeroinitializer
+; AARCH64-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> <i32 0, i32 3>
+; AARCH64-NEXT:    [[TMP11:%.*]] = fadd <2 x double> zeroinitializer, [[TMP10]]
+; AARCH64-NEXT:    [[TMP12:%.*]] = fmul <2 x double> zeroinitializer, [[TMP10]]
+; AARCH64-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 3>
+; AARCH64-NEXT:    [[TMP14:%.*]] = fsub <2 x double> [[TMP13]], [[TMP2]]
+; AARCH64-NEXT:    [[TMP15:%.*]] = fadd <2 x double> [[TMP13]], [[TMP2]]
+; AARCH64-NEXT:    [[TMP16:%.*]] = shufflevector <2 x double> [[TMP14]], <2 x double> [[TMP15]], <2 x i32> <i32 0, i32 3>
+; AARCH64-NEXT:    [[TMP17:%.*]] = fsub <2 x double> [[TMP16]], zeroinitializer
+; AARCH64-NEXT:    [[TMP18:%.*]] = fmul <2 x double> [[TMP4]], zeroinitializer
+; AARCH64-NEXT:    [[TMP19:%.*]] = fmul <2 x double> zeroinitializer, [[TMP18]]
+; AARCH64-NEXT:    [[TMP20:%.*]] = fadd <2 x double> [[TMP19]], [[TMP17]]
+; AARCH64-NEXT:    [[TMP21:%.*]] = fsub <2 x double> [[TMP20]], zeroinitializer
+; AARCH64-NEXT:    [[TMP22:%.*]] = fmul <2 x double> [[TMP5]], zeroinitializer
+; AARCH64-NEXT:    [[TMP23:%.*]] = fmul <2 x double> zeroinitializer, [[TMP22]]
+; AARCH64-NEXT:    [[TMP24:%.*]] = fadd <2 x double> [[TMP23]], [[TMP21]]
+; AARCH64-NEXT:    [[TMP25:%.*]] = extractelement <2 x double> [[TMP24]], i32 0
+; AARCH64-NEXT:    [[TMP26:%.*]] = extractelement <2 x double> [[TMP24]], i32 1
+; AARCH64-NEXT:    [[ADD29:%.*]] = fadd double [[TMP25]], [[TMP26]]
+; AARCH64-NEXT:    ret double [[ADD29]]
 ;
 entry:
   %0 = load double, ptr null, align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector-inseltpoison.ll
index 385df87478e53..78fd8450601e0 100644
--- a/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector-inseltpoison.ll
@@ -70,10 +70,8 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32>
 ; THRESHOLD-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3
 ; THRESHOLD-NEXT:    [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0
 ; THRESHOLD-NEXT:    [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1
-; THRESHOLD-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
 ; THRESHOLD-NEXT:    [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2
 ; THRESHOLD-NEXT:    [[Q3:%.*]] = extractelement <4 x float> [[RD]], i32 3
-; THRESHOLD-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
 ; THRESHOLD-NEXT:    [[Q4:%.*]] = fadd float [[Q0]], [[Q1]]
 ; THRESHOLD-NEXT:    [[Q5:%.*]] = fadd float [[Q2]], [[Q3]]
 ; THRESHOLD-NEXT:    [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
@@ -148,10 +146,8 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32>
 ; MINTREESIZE-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3
 ; MINTREESIZE-NEXT:    [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0
 ; MINTREESIZE-NEXT:    [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1
-; MINTREESIZE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
 ; MINTREESIZE-NEXT:    [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2
 ; MINTREESIZE-NEXT:    [[Q3:%.*]] = extractelement <4 x float> [[RD]], i32 3
-; MINTREESIZE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
 ; MINTREESIZE-NEXT:    [[Q4:%.*]] = fadd float [[Q0]], [[Q1]]
 ; MINTREESIZE-NEXT:    [[Q5:%.*]] = fadd float [[Q2]], [[Q3]]
 ; MINTREESIZE-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector.ll
index 37c02d61d6516..1cf9f4cf1058f 100644
--- a/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/insert-element-build-vector.ll
@@ -105,10 +105,8 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32>
 ; THRESHOLD-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3
 ; THRESHOLD-NEXT:    [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0
 ; THRESHOLD-NEXT:    [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1
-; THRESHOLD-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
 ; THRESHOLD-NEXT:    [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2
 ; THRESHOLD-NEXT:    [[Q3:%.*]] = extractelement <4 x float> [[RD]], i32 3
-; THRESHOLD-NEXT:    [[TMP2:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
 ; THRESHOLD-NEXT:    [[Q4:%.*]] = fadd float [[Q0]], [[Q1]]
 ; THRESHOLD-NEXT:    [[Q5:%.*]] = fadd float [[Q2]], [[Q3]]
 ; THRESHOLD-NEXT:    [[Q6:%.*]] = fadd float [[Q4]], [[Q5]]
@@ -183,10 +181,8 @@ define <4 x float> @simple_select_eph(<4 x float> %a, <4 x float> %b, <4 x i32>
 ; MINTREESIZE-NEXT:    [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[S3]], i32 3
 ; MINTREESIZE-NEXT:    [[Q0:%.*]] = extractelement <4 x float> [[RD]], i32 0
 ; MINTREESIZE-NEXT:    [[Q1:%.*]] = extractelement <4 x float> [[RD]], i32 1
-; MINTREESIZE-NEXT:    [[TMP5:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
 ; MINTREESIZE-NEXT:    [[Q2:%.*]] = extractelement <4 x float> [[RD]], i32 2
 ; MINTREESIZE-NEXT:    [[Q3:%.*]] = extractelement <4 x float> [[RD]], i32 3
-; MINTREESIZE-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[RD]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
 ; MINTREESIZE-NEXT:    [[Q4:%.*]] = fadd float [[Q0]], [[Q1]]
 ; MINTREESIZE-NEXT:    [[Q5:%.*]] = fadd float [[Q2]], [[Q3]]
 ; MINTREESIZE-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> poison, float [[Q4]], i32 0

>From d04f3807e0dde508633c26ef12f6ae32cf358eb6 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Sat, 21 Feb 2026 18:41:30 -0800
Subject: [PATCH 4/4] Fix crashes

Created using spr 1.3.7
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    |  5 +-
 llvm/test/Transforms/SLPVectorizer/X86/phi.ll | 52 +++++++++----------
 2 files changed, 30 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ce3005795668e..787c18535d2b2 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -26260,7 +26260,10 @@ class HorizontalReduction {
               return RedValI && V.isDeleted(RedValI);
             }))
           break;
-        V.buildTree(VL, IgnoreList);
+        if (RK == ReductionKind::Ordered)
+          V.buildTree(VL);
+        else
+          V.buildTree(VL, IgnoreList);
         if (V.isTreeTinyAndNotFullyVectorizable(RK ==
                                                 ReductionKind::Unordered)) {
           if (!AdjustReducedVals())
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
index 5fe325030966e..14bff0c3478b5 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
@@ -136,25 +136,22 @@ for.end:                                          ; preds = %for.body
 define float @foo3(ptr nocapture readonly %A) #0 {
 ; CHECK-LABEL: @foo3(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[ARRAYIDX1:%.*]], align 4
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 3
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1:%.*]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[ARRAYIDX1]], align 4
 ; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x float>, ptr [[ARRAYIDX3]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <4 x float> [[TMP14]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 0>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP24]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 7>
-; CHECK-NEXT:    [[TMP26:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP26]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 4
+; CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> [[TMP10]], float [[TMP2]], i32 0
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[R_052:%.*]] = phi float [ [[TMP13]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi <2 x float> [ [[TMP0]], [[ENTRY]] ], [ [[TMP7:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP12:%.*]] = phi <4 x float> [ [[TMP10]], [[ENTRY]] ], [ [[TMP15:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = phi <2 x float> [ [[TMP11]], [[ENTRY]] ], [ [[TMP25:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = phi <2 x float> [ [[TMP5]], [[ENTRY]] ], [ [[TMP24:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 2
 ; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr [[ARRAYIDX14]], align 4
@@ -163,28 +160,31 @@ define float @foo3(ptr nocapture readonly %A) #0 {
 ; CHECK-NEXT:    [[TMP7]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4
 ; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <8 x float> [[TMP27]], float [[TMP9]], i32 2
-; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <8 x float> [[TMP17]], <8 x float> [[TMP18]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 10, i32 6, i32 11>
-; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <8 x float> [[TMP28]], float [[R_052]], i32 6
+; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP29:%.*]] = shufflevector <8 x float> [[TMP17]], <8 x float> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 9, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
 ; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <8 x float> [[TMP29]], <8 x float> [[TMP21]], <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP23:%.*]] = fmul <8 x float> [[TMP22]], <float 7.000000e+00, float 8.000000e+00, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT:    [[MUL25:%.*]] = extractelement <8 x float> [[TMP23]], i32 1
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <8 x float> [[TMP22]], <8 x float> [[TMP18]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; CHECK-NEXT:    [[TMP26:%.*]] = fmul <8 x float> [[TMP19]], <float 7.000000e+00, float 8.000000e+00, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <8 x float> [[TMP26]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT:    [[TMP25]] = fadd <2 x float> [[TMP12]], [[TMP31]]
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <8 x float> [[TMP26]], <8 x float> poison, <2 x i32> <i32 3, i32 0>
+; CHECK-NEXT:    [[TMP24]] = fadd <2 x float> [[TMP8]], [[TMP23]]
+; CHECK-NEXT:    [[MUL25:%.*]] = extractelement <8 x float> [[TMP26]], i32 4
 ; CHECK-NEXT:    [[ADD6]] = fadd float [[R_052]], [[MUL25]]
-; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <8 x float> [[TMP23]], <8 x float> poison, <4 x i32> <i32 4, i32 3, i32 0, i32 2>
-; CHECK-NEXT:    [[TMP15]] = fadd <4 x float> [[TMP12]], [[TMP25]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP16]], 121
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x float> [[TMP15]], i32 2
-; CHECK-NEXT:    [[ADD30:%.*]] = fadd float [[TMP19]], [[ADD6]]
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x float> [[TMP15]], i32 3
+; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <2 x float> [[TMP25]], i32 0
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <2 x float> [[TMP24]], i32 1
+; CHECK-NEXT:    [[ADD30:%.*]] = fadd float [[TMP28]], [[TMP32]]
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x float> [[TMP25]], i32 1
 ; CHECK-NEXT:    [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP20]]
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <4 x float> [[TMP15]], i32 1
+; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <2 x float> [[TMP24]], i32 0
 ; CHECK-NEXT:    [[ADD33:%.*]] = fadd float [[ADD31]], [[TMP30]]
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x float> [[TMP15]], i32 0
-; CHECK-NEXT:    [[ADD32:%.*]] = fadd float [[ADD33]], [[TMP31]]
+; CHECK-NEXT:    [[ADD32:%.*]] = fadd float [[ADD33]], [[ADD6]]
 ; CHECK-NEXT:    ret float [[ADD32]]
 ;
 entry:



More information about the llvm-commits mailing list