[llvm] [SLP] Support ordered FAdd reductions in SLPVectorizer (PR #146570)

via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 9 09:08:34 PDT 2025


https://github.com/sc-clulzze updated https://github.com/llvm/llvm-project/pull/146570

From dc56eb5cc2a3b75b6f7c06068112a0124dddd7bf Mon Sep 17 00:00:00 2001
From: sc-cluzze <d.marakulin at syntacore.com>
Date: Tue, 1 Jul 2025 16:19:06 +0000
Subject: [PATCH 1/2] [SLP] Support ordered FAdd reductions in slp-vectorizer

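Teach the SLP vectorizer to recognize strictly ordered (non-reassociative)
fadd reduction chains and emit llvm.vector.reduce.fadd for them, which
preserves the sequential evaluation order required by strict FP semantics.
The feature is guarded by a new hidden flag, -slp-ordered-fp-reds
(enabled by default).

As a rough sketch of the intended transformation (taken from the new
fadd-vectorize.ll test below; value names are illustrative), a scalar chain
such as

  %0 = load float, ptr %a, align 4
  %1 = load float, ptr %arrayidx1, align 4
  %add = fadd float %0, %1
  %2 = load float, ptr %arrayidx2, align 4
  %add3 = fadd float %add, %2
  %3 = load float, ptr %arrayidx4, align 4
  %add5 = fadd float %add3, %3

becomes

  %0 = load <4 x float>, ptr %a, align 4
  %1 = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %0)
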
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 189 ++++++++--
 .../SLPVectorizer/X86/dot-product.ll          |  38 +--
 llvm/test/Transforms/SLPVectorizer/X86/phi.ll |  53 ++-
 .../SLPVectorizer/fadd-scalar-remainder.ll    |  93 +++++
 .../SLPVectorizer/fadd-vectorize.ll           | 323 ++++++++++++++++++
 5 files changed, 605 insertions(+), 91 deletions(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/fadd-scalar-remainder.ll
 create mode 100644 llvm/test/Transforms/SLPVectorizer/fadd-vectorize.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0941bf61953f1..2c7929d91121f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -110,11 +110,16 @@ using namespace std::placeholders;
 #define SV_NAME "slp-vectorizer"
 #define DEBUG_TYPE "SLP"
 
+STATISTIC(NumFaddVectorized, "Number of vectorized fadd reductions");
 STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
 
 DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
               "Controls which SLP graphs should be vectorized.");
 
+static cl::opt<bool> SLPEnableOrderedFPReductions(
+    "slp-ordered-fp-reds", cl::init(true), cl::Hidden,
+    cl::desc("Enable vectorization of ordered floating point reductions"));
+
 static cl::opt<bool>
     RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
                         cl::desc("Run the SLP vectorization passes"));
@@ -1850,6 +1855,11 @@ class BoUpSLP {
     return VectorizableTree.front()->Scalars;
   }
 
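+  /// Returns true if no entry in the vectorizable tree requires reordering
+  /// (i.e. all entries use the identity order).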
+  bool areAllEntriesIdentityOrdered() const {
+    return all_of(VectorizableTree,
+                  [&](auto &Entry) { return Entry->ReorderIndices.empty(); });
+  }
+
   /// Returns the type/is-signed info for the root node in the graph without
   /// casting.
   std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
@@ -21774,6 +21784,8 @@ class HorizontalReduction {
   /// signedness.
   SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
 
+  SmallVector<Value *, 2> InitialFAddValues;
+
   static bool isCmpSelMinMax(Instruction *I) {
     return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
            RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
@@ -21787,6 +21799,14 @@ class HorizontalReduction {
            (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
   }
 
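+  /// Returns true if the reduction root is an fadd that does not allow
+  /// reassociation and therefore must be reduced strictly in order.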
+  bool isOrderedFaddReduction() const {
+    if (!isa<Instruction>(ReductionRoot))
+      return false;
+    auto *I = cast<Instruction>(ReductionRoot);
+    return (RdxKind == RecurKind::FAdd) &&
+           !I->getFastMathFlags().allowReassoc();
+  }
+
   /// Checks if instruction is associative and can be vectorized.
   static bool isVectorizable(RecurKind Kind, Instruction *I) {
     if (Kind == RecurKind::None)
@@ -21807,6 +21827,9 @@ class HorizontalReduction {
     if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
       return true;
 
+    if (Kind == RecurKind::FAdd && SLPEnableOrderedFPReductions)
+      return true;
+
     return I->isAssociative();
   }
 
@@ -22066,6 +22089,37 @@ class HorizontalReduction {
            (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
   }
 
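+  /// Checks that the ordered fadd reduction operations form a single
+  /// sequential chain, i.e. each operation's first operand is the previous
+  /// operation in the chain.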
+  bool checkOperandsOrder() const {
+    auto OpsVec = reverse(ReductionOps[0]);
+    if (!isOrderedFaddReduction() || empty(OpsVec))
+      return false;
+    Value *PrevOperand = *OpsVec.begin();
+    for (auto *I : drop_begin(OpsVec)) {
+      Value *Op1 = cast<BinaryOperator>(I)->getOperand(0);
+      if (Op1 != PrevOperand)
+        return false;
+      PrevOperand = I;
+    }
+    return true;
+  }
+
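+  /// Checks that all floating-point reduction operations in each group have
+  /// consistent reassoc fast-math flags.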
+  bool checkFastMathFlags() const {
+    for (const auto &OpsVec : ReductionOps) {
+      if (OpsVec.size() <= 1)
+        continue;
+      Value *V = *OpsVec.begin();
+      if (!isa<FPMathOperator>(V))
+        continue;
+      bool Flag = cast<Instruction>(V)->getFastMathFlags().allowReassoc();
+      auto It = find_if(drop_begin(OpsVec), [&](Value *I) {
+        auto CurFlag = cast<Instruction>(I)->getFastMathFlags().allowReassoc();
+        return (Flag != CurFlag);
+      });
+      if (It != OpsVec.end())
+        return false;
+    }
+    return true;
+  }
+
 public:
   HorizontalReduction() = default;
 
@@ -22180,9 +22234,10 @@ class HorizontalReduction {
       // Add reduction values. The values are sorted for better vectorization
       // results.
       for (Value *V : PossibleRedVals) {
-        size_t Key, Idx;
-        std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
-                                               /*AllowAlternate=*/false);
+        size_t Key = 0, Idx = 0;
+        if (!isOrderedFaddReduction())
+          std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
+                                                /*AllowAlternate=*/false);
         ++PossibleReducedVals[Key][Idx]
               .insert(std::make_pair(V, 0))
               .first->second;
@@ -22200,13 +22255,15 @@ class HorizontalReduction {
            It != E; ++It) {
         PossibleRedValsVect.emplace_back();
         auto RedValsVect = It->second.takeVector();
-        stable_sort(RedValsVect, llvm::less_second());
+        if (!isOrderedFaddReduction())
+          stable_sort(RedValsVect, llvm::less_second());
         for (const std::pair<Value *, unsigned> &Data : RedValsVect)
           PossibleRedValsVect.back().append(Data.second, Data.first);
       }
-      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
-        return P1.size() > P2.size();
-      });
+      if (!isOrderedFaddReduction())
+        stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
+          return P1.size() > P2.size();
+        });
       int NewIdx = -1;
       for (ArrayRef<Value *> Data : PossibleRedValsVect) {
         if (NewIdx < 0 ||
@@ -22226,9 +22283,19 @@ class HorizontalReduction {
     }
     // Sort the reduced values by number of same/alternate opcode and/or pointer
     // operand.
-    stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
-      return P1.size() > P2.size();
-    });
+    if (!isOrderedFaddReduction())
+      stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
+        return P1.size() > P2.size();
+      });
+
+    if (isOrderedFaddReduction() &&
+        (ReducedVals.size() != 1 || ReducedVals[0].size() == 2 ||
+         !checkOperandsOrder()))
+      return false;
+
+    if (!checkFastMathFlags())
+      return false;
+
     return true;
   }
 
@@ -22423,7 +22490,7 @@ class HorizontalReduction {
       // original scalar identity operations on matched horizontal reductions).
       IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
                                     RdxKind != RecurKind::FMul &&
-                                    RdxKind != RecurKind::FMulAdd;
+                                    RdxKind != RecurKind::FMulAdd &&
+                                    !isOrderedFaddReduction();
       // Gather same values.
       SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
       if (IsSupportedHorRdxIdentityOp)
@@ -22524,6 +22591,8 @@ class HorizontalReduction {
         return IsAnyRedOpGathered;
       };
       bool AnyVectorized = false;
+      Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
+      Instruction *InsertPt = RdxRootInst;
       SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
       while (Pos < NumReducedVals - ReduxWidth + 1 &&
              ReduxWidth >= ReductionLimit) {
@@ -22684,8 +22753,6 @@ class HorizontalReduction {
 
         // Emit a reduction. If the root is a select (min/max idiom), the insert
         // point is the compare condition of that select.
-        Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
-        Instruction *InsertPt = RdxRootInst;
         if (IsCmpSelMinMax)
           InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
 
@@ -22738,6 +22805,41 @@ class HorizontalReduction {
           if (!V.isVectorized(RdxVal))
             RequiredExtract.insert(RdxVal);
         }
+
+        auto FirstIt = find_if(ReducedVals[0], [&](Value *RdxVal) {
+          return VectorizedVals.lookup(RdxVal);
+        });
+        auto LastIt = find_if(reverse(ReducedVals[0]), [&](Value *RdxVal) {
+          return VectorizedVals.lookup(RdxVal);
+        });
+        if (isOrderedFaddReduction()) {
+          // [FirstIt, LastIt] - range of vectorized Vals; we need it to get the
+          // last non-vectorized Val at the beginning and its ReductionOp, and
+          // the first non-vectorized Val at the end and its ReductionOp.
+          // fadd - initial value for reduction
+          // fadd - v
+          // fadd - v
+          // fadd - v
+          // fadd - v
+          // fadd - scalar remainder
+          if (LastIt != ReducedVals[0].rend())
+            ReductionRoot =
+                cast<Instruction>(ReducedValsToOps.find(*LastIt)->second[0]);
+
+          if (InitialFAddValues.empty()) {
+            auto *FAddBinOp = cast<BinaryOperator>(
+                ReducedValsToOps.find(*FirstIt)->second[0]);
+            Value *InitialFAddValue = ConstantExpr::getBinOpIdentity(
+                FAddBinOp->getOpcode(), FAddBinOp->getType());
+            if (FirstIt != ReducedVals[0].end()) {
+              auto *Op1 = FAddBinOp->getOperand(0);
+              if (!isa<PoisonValue>(Op1))
+                InitialFAddValue = Op1;
+            }
+            InitialFAddValues.push_back(InitialFAddValue);
+          }
+        }
+
         Pos += ReduxWidth;
         Start = Pos;
         ReduxWidth = NumReducedVals - Pos;
@@ -22755,10 +22857,27 @@ class HorizontalReduction {
         continue;
       }
     }
-    if (!VectorValuesAndScales.empty())
-      VectorizedTree = GetNewVectorizedTree(
-          VectorizedTree,
-          emitReduction(Builder, *TTI, ReductionRoot->getType()));
+    if (!VectorValuesAndScales.empty()) {
+      if (!isOrderedFaddReduction()) {
+        VectorizedTree = GetNewVectorizedTree(
+            VectorizedTree,
+            emitReduction(Builder, *TTI, ReductionRoot->getType()));
+      } else {
+        for (auto V : VectorValuesAndScales) {
+          Value *InitialFAddValue = InitialFAddValues.back();
+          VectorizedTree =
+              Builder.CreateFAddReduce(InitialFAddValue, std::get<0>(V));
+          InitialFAddValues.push_back(VectorizedTree);
+        }
+        auto LastIt = find_if(reverse(ReducedVals[0]), [&](Value *RdxVal) {
+          return VectorizedVals.lookup(RdxVal);
+        });
+        for_each(reverse(make_range(LastIt.base(), ReducedVals[0].end())),
+                   [&](Value *V) {
+                     ReducedValsToOps.find(V)->second[0]->moveAfter(
+                         cast<Instruction>(VectorizedTree));
+                   });
+      }
+    }
     if (VectorizedTree) {
       // Reorder operands of bool logical op in the natural order to avoid
       // possible problem with poison propagation. If not possible to reorder
@@ -22846,15 +22965,18 @@ class HorizontalReduction {
             ExtraReductions.emplace_back(RedOp, RdxVal);
         }
       }
-      // Iterate through all not-vectorized reduction values/extra arguments.
-      bool InitStep = true;
-      while (ExtraReductions.size() > 1) {
-        SmallVector<std::pair<Instruction *, Value *>> NewReds =
-            FinalGen(ExtraReductions, InitStep);
-        ExtraReductions.swap(NewReds);
-        InitStep = false;
+
+      if (!isOrderedFaddReduction()) {
+        // Iterate through all not-vectorized reduction values/extra arguments.
+        bool InitStep = true;
+        while (ExtraReductions.size() > 1) {
+          SmallVector<std::pair<Instruction *, Value *>> NewReds =
+              FinalGen(ExtraReductions, InitStep);
+          ExtraReductions.swap(NewReds);
+          InitStep = false;
+        }
+        VectorizedTree = ExtraReductions.front().second;
       }
-      VectorizedTree = ExtraReductions.front().second;
 
       ReductionRoot->replaceAllUsesWith(VectorizedTree);
 
@@ -22868,21 +22990,28 @@ class HorizontalReduction {
         IgnoreSet.insert_range(RdxOps);
 #endif
       for (ArrayRef<Value *> RdxOps : ReductionOps) {
+        SmallVector<Value *, 4> RdxOpsForDeletion;
         for (Value *Ignore : RdxOps) {
-          if (!Ignore)
+          if (!Ignore || (isOrderedFaddReduction() && !Ignore->use_empty() &&
+                          !any_of(cast<Instruction>(Ignore)->operands(),
+                                  [](const Value *Val) {
+                                    return isa<PoisonValue>(Val);
+                                  })))
             continue;
 #ifndef NDEBUG
           for (auto *U : Ignore->users()) {
-            assert(IgnoreSet.count(U) &&
-                   "All users must be either in the reduction ops list.");
+            assert((IgnoreSet.count(U) || isOrderedFaddReduction()) &&
+                   "All users must be either in the reduction ops list.");
           }
 #endif
           if (!Ignore->use_empty()) {
             Value *P = PoisonValue::get(Ignore->getType());
             Ignore->replaceAllUsesWith(P);
           }
+          RdxOpsForDeletion.push_back(Ignore);
         }
-        V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
+        V.removeInstructionsAndOperands(ArrayRef(RdxOpsForDeletion),
+                                        VectorValuesAndScales);
       }
     } else if (!CheckForReusedReductionOps) {
       for (ReductionOpsType &RdxOps : ReductionOps)
@@ -22961,6 +23090,8 @@ class HorizontalReduction {
           continue;
         }
         InstructionCost ScalarCost = 0;
+        if (RdxVal->use_empty())
+          continue;
         for (User *U : RdxVal->users()) {
           auto *RdxOp = cast<Instruction>(U);
           if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
index f16c879c451c2..8f541a3dface3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
@@ -10,21 +10,10 @@
 
 define double @dot4f64(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) {
 ; CHECK-LABEL: @dot4f64(
-; CHECK-NEXT:    [[PTRX2:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 2
-; CHECK-NEXT:    [[PTRY2:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[PTRX2]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[PTRY2]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
-; CHECK-NEXT:    [[DOT01:%.*]] = fadd double [[TMP7]], [[TMP8]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
-; CHECK-NEXT:    [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
-; CHECK-NEXT:    [[DOT0123:%.*]] = fadd double [[DOT012]], [[TMP10]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[DOT0123:%.*]] = call double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[TMP3]])
 ; CHECK-NEXT:    ret double [[DOT0123]]
 ;
   %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
@@ -53,21 +42,10 @@ define double @dot4f64(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %p
 
 define float @dot4f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
 ; CHECK-LABEL: @dot4f32(
-; CHECK-NEXT:    [[PTRX2:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 2
-; CHECK-NEXT:    [[PTRY2:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, ptr [[PTRX2]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x float>, ptr [[PTRY2]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x float> [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
-; CHECK-NEXT:    [[DOT01:%.*]] = fadd float [[TMP7]], [[TMP8]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
-; CHECK-NEXT:    [[DOT012:%.*]] = fadd float [[DOT01]], [[TMP9]]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
-; CHECK-NEXT:    [[DOT0123:%.*]] = fadd float [[DOT012]], [[TMP10]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[DOT0123:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]])
 ; CHECK-NEXT:    ret float [[DOT0123]]
 ;
   %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
index 17ae33652b6d8..c1a0c293ef9b9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
@@ -136,44 +136,39 @@ for.end:                                          ; preds = %for.body
 define float @foo3(ptr nocapture readonly %A) #0 {
 ; CHECK-LABEL: @foo3(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX1:%.*]], align 4
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[R_052:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP15:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi <2 x float> [ [[TMP0]], [[ENTRY]] ], [ [[TMP7:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
-; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[TMP5]], 7.000000e+00
-; CHECK-NEXT:    [[ADD6]] = fadd float [[R_052]], [[MUL]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]]
+; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr [[ARRAYIDX14]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3
-; CHECK-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV_NEXT]]
-; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX14]], align 4
+; CHECK-NEXT:    [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add nsw i64 [[INDVARS_IV]], 4
+; CHECK-NEXT:    [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 [[TMP11]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr [[ARRAYIDX24]], align 4
 ; CHECK-NEXT:    [[TMP7]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> <i32 5, i32 1, i32 2, i32 poison>
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> poison, float [[TMP9]], i32 2
 ; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 5>
-; CHECK-NEXT:    [[TMP14:%.*]] = fmul <4 x float> [[TMP13]], <float 8.000000e+00, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01>
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> [[TMP12]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 4>
+; CHECK-NEXT:    [[TMP13:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP18]], <2 x float> [[TMP4]], i64 0)
+; CHECK-NEXT:    [[TMP14:%.*]] = fmul <4 x float> [[TMP13]], <float 7.000000e+00, float 8.000000e+00, float 9.000000e+00, float 1.000000e+01>
 ; CHECK-NEXT:    [[TMP15]] = fadd <4 x float> [[TMP3]], [[TMP14]]
+; CHECK-NEXT:    [[MUL25:%.*]] = fmul float [[TMP8]], 1.100000e+01
+; CHECK-NEXT:    [[ADD6]] = fadd float [[R_052]], [[MUL25]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP16]], 121
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x float> [[TMP15]], i32 0
-; CHECK-NEXT:    [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP17]]
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x float> [[TMP15]], i32 1
-; CHECK-NEXT:    [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP18]]
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x float> [[TMP15]], i32 2
-; CHECK-NEXT:    [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP19]]
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x float> [[TMP15]], i32 3
-; CHECK-NEXT:    [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP20]]
+; CHECK-NEXT:    [[TMP17:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP15]])
+; CHECK-NEXT:    [[ADD31:%.*]] = fadd float [[TMP17]], [[ADD6]]
 ; CHECK-NEXT:    ret float [[ADD31]]
 ;
 entry:
@@ -237,19 +232,13 @@ define float @sort_phi_type(ptr nocapture readonly %A) {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = phi <4 x float> [ splat (float 1.000000e+01), [[ENTRY]] ], [ [[TMP2:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
-; CHECK-NEXT:    [[TMP2]] = fmul <4 x float> [[TMP1]], <float 8.000000e+00, float 9.000000e+00, float 1.000000e+02, float 1.110000e+02>
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[TMP0]], <float 8.000000e+00, float 9.000000e+00, float 1.000000e+02, float 1.110000e+02>
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 4
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], 128
+; CHECK-NEXT:    [[TMP2]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
-; CHECK-NEXT:    [[ADD29:%.*]] = fadd float [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
-; CHECK-NEXT:    [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
-; CHECK-NEXT:    [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP6]]
+; CHECK-NEXT:    [[ADD31:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP1]])
 ; CHECK-NEXT:    ret float [[ADD31]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/fadd-scalar-remainder.ll b/llvm/test/Transforms/SLPVectorizer/fadd-scalar-remainder.ll
new file mode 100644
index 0000000000000..46aba65eb1b29
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/fadd-scalar-remainder.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer < %s | FileCheck %s
+
+define float @_Z3fooPi(ptr %a){
+; CHECK-LABEL: define float @_Z3fooPi(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = mul nsw <8 x i32> [[TMP0]], [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x double>
+; CHECK-NEXT:    [[TMP3:%.*]] = fdiv <8 x double> [[TMP2]], splat (double 1.000000e-01)
+; CHECK-NEXT:    [[ARRAYIDX_8:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 32
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_8]], align 4
+; CHECK-NEXT:    [[MUL_8:%.*]] = mul nsw i32 [[TMP4]], [[TMP4]]
+; CHECK-NEXT:    [[CONV_8:%.*]] = uitofp nneg i32 [[MUL_8]] to double
+; CHECK-NEXT:    [[DIV_8:%.*]] = fdiv double [[CONV_8]], 1.000000e-01
+; CHECK-NEXT:    [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 36
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_9]], align 4
+; CHECK-NEXT:    [[MUL_9:%.*]] = mul nsw i32 [[TMP5]], [[TMP5]]
+; CHECK-NEXT:    [[CONV_9:%.*]] = uitofp nneg i32 [[MUL_9]] to double
+; CHECK-NEXT:    [[OP_RDX:%.*]] = fdiv double [[CONV_9]], 1.000000e-01
+; CHECK-NEXT:    [[TMP7:%.*]] = call double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP3]])
+; CHECK-NEXT:    [[ADD_8:%.*]] = fadd double [[TMP7]], [[DIV_8]]
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd double [[ADD_8]], [[OP_RDX]]
+; CHECK-NEXT:    [[DIV4:%.*]] = fdiv double [[OP_RDX1]], 5.000000e+03
+; CHECK-NEXT:    [[SUB:%.*]] = fadd double [[DIV4]], -5.000000e+03
+; CHECK-NEXT:    [[CONV6:%.*]] = fptrunc double [[SUB]] to float
+; CHECK-NEXT:    ret float [[CONV6]]
+;
+entry:
+  %0 = load i32, ptr %a, align 4
+  %mul = mul nsw i32 %0, %0
+  %conv = uitofp nneg i32 %mul to double
+  %div = fdiv double %conv, 1.000000e-01
+  %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 4
+  %1 = load i32, ptr %arrayidx.1, align 4
+  %mul.1 = mul nsw i32 %1, %1
+  %conv.1 = uitofp nneg i32 %mul.1 to double
+  %div.1 = fdiv double %conv.1, 1.000000e-01
+  %add.1 = fadd double %div, %div.1
+  %arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 8
+  %2 = load i32, ptr %arrayidx.2, align 4
+  %mul.2 = mul nsw i32 %2, %2
+  %conv.2 = uitofp nneg i32 %mul.2 to double
+  %div.2 = fdiv double %conv.2, 1.000000e-01
+  %add.2 = fadd double %add.1, %div.2
+  %arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 12
+  %3 = load i32, ptr %arrayidx.3, align 4
+  %mul.3 = mul nsw i32 %3, %3
+  %conv.3 = uitofp nneg i32 %mul.3 to double
+  %div.3 = fdiv double %conv.3, 1.000000e-01
+  %add.3 = fadd double %add.2, %div.3
+  %arrayidx.4 = getelementptr inbounds i8, ptr %a, i64 16
+  %4 = load i32, ptr %arrayidx.4, align 4
+  %mul.4 = mul nsw i32 %4, %4
+  %conv.4 = uitofp nneg i32 %mul.4 to double
+  %div.4 = fdiv double %conv.4, 1.000000e-01
+  %add.4 = fadd double %add.3, %div.4
+  %arrayidx.5 = getelementptr inbounds i8, ptr %a, i64 20
+  %5 = load i32, ptr %arrayidx.5, align 4
+  %mul.5 = mul nsw i32 %5, %5
+  %conv.5 = uitofp nneg i32 %mul.5 to double
+  %div.5 = fdiv double %conv.5, 1.000000e-01
+  %add.5 = fadd double %add.4, %div.5
+  %arrayidx.6 = getelementptr inbounds i8, ptr %a, i64 24
+  %6 = load i32, ptr %arrayidx.6, align 4
+  %mul.6 = mul nsw i32 %6, %6
+  %conv.6 = uitofp nneg i32 %mul.6 to double
+  %div.6 = fdiv double %conv.6, 1.000000e-01
+  %add.6 = fadd double %add.5, %div.6
+  %arrayidx.7 = getelementptr inbounds i8, ptr %a, i64 28
+  %7 = load i32, ptr %arrayidx.7, align 4
+  %mul.7 = mul nsw i32 %7, %7
+  %conv.7 = uitofp nneg i32 %mul.7 to double
+  %div.7 = fdiv double %conv.7, 1.000000e-01
+  %add.7 = fadd double %add.6, %div.7
+  %arrayidx.8 = getelementptr inbounds i8, ptr %a, i64 32
+  %8 = load i32, ptr %arrayidx.8, align 4
+  %mul.8 = mul nsw i32 %8, %8
+  %conv.8 = uitofp nneg i32 %mul.8 to double
+  %div.8 = fdiv double %conv.8, 1.000000e-01
+  %add.8 = fadd double %add.7, %div.8
+  %arrayidx.9 = getelementptr inbounds i8, ptr %a, i64 36
+  %9 = load i32, ptr %arrayidx.9, align 4
+  %mul.9 = mul nsw i32 %9, %9
+  %conv.9 = uitofp nneg i32 %mul.9 to double
+  %div.9 = fdiv double %conv.9, 1.000000e-01
+  %add.9 = fadd double %add.8, %div.9
+  %div4 = fdiv double %add.9, 5.000000e+03
+  %sub = fadd double %div4, -5.000000e+03
+  %conv6 = fptrunc double %sub to float
+  ret float %conv6
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/fadd-vectorize.ll b/llvm/test/Transforms/SLPVectorizer/fadd-vectorize.ll
new file mode 100644
index 0000000000000..356480bc11591
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/fadd-vectorize.ll
@@ -0,0 +1,323 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer < %s | FileCheck %s
+
+define float @test_reduce(ptr %a) {
+; CHECK-LABEL: define float @test_reduce(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP0]])
+; CHECK-NEXT:    ret float [[TMP1]]
+;
+entry:
+  %0 = load float, ptr %a, align 4
+  %arrayidx1 = getelementptr inbounds i8, ptr %a, i64 4
+  %1 = load float, ptr %arrayidx1, align 4
+  %add = fadd float %0, %1
+  %arrayidx2 = getelementptr inbounds i8, ptr %a, i64 8
+  %2 = load float, ptr %arrayidx2, align 4
+  %add3 = fadd float %add, %2
+  %arrayidx4 = getelementptr inbounds i8, ptr %a, i64 12
+  %3 = load float, ptr %arrayidx4, align 4
+  %add5 = fadd float %add3, %3
+  ret float %add5
+}
+
+define float @test_no_reduce(ptr %a) {
+; CHECK-LABEL: define float @test_no_reduce(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[A]], align 4
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 12
+; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[ADD4:%.*]] = fadd float [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[ADD5:%.*]] = fadd float [[ADD]], [[ADD4]]
+; CHECK-NEXT:    ret float [[ADD5]]
+;
+entry:
+  %0 = load float, ptr %a, align 4
+  %arrayidx1 = getelementptr inbounds i8, ptr %a, i64 4
+  %1 = load float, ptr %arrayidx1, align 4
+  %add = fadd float %0, %1
+  %arrayidx2 = getelementptr inbounds i8, ptr %a, i64 8
+  %2 = load float, ptr %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds i8, ptr %a, i64 12
+  %3 = load float, ptr %arrayidx3, align 4
+  %add4 = fadd float %2, %3
+  %add5 = fadd float %add, %add4
+  ret float %add5
+}
+
+define float @test_reduce2(ptr %a, float %b) {
+; CHECK-LABEL: define float @test_reduce2(
+; CHECK-SAME: ptr [[A:%.*]], float [[B:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP0]])
+; CHECK-NEXT:    [[ADDB:%.*]] = fadd float [[TMP1]], [[B]]
+; CHECK-NEXT:    ret float [[TMP1]]
+;
+entry:
+  %0 = load float, ptr %a, align 4
+  %arrayidx1 = getelementptr inbounds i8, ptr %a, i64 4
+  %1 = load float, ptr %arrayidx1, align 4
+  %add = fadd float %0, %1
+  %arrayidx2 = getelementptr inbounds i8, ptr %a, i64 8
+  %2 = load float, ptr %arrayidx2, align 4
+  %add3 = fadd float %add, %2
+  %arrayidx4 = getelementptr inbounds i8, ptr %a, i64 12
+  %3 = load float, ptr %arrayidx4, align 4
+  %add5 = fadd float %add3, %3
+  %addb = fadd float %add5, %b
+  ret float %add5
+}
+
+define float @test_reduce_multiple_use(ptr %a, float %b) {
+; CHECK-LABEL: define float @test_reduce_multiple_use(
+; CHECK-SAME: ptr [[A:%.*]], float [[B:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[A]], align 4
+; CHECK-NEXT:    [[ADDC:%.*]] = fadd float [[B]], [[TMP1]]
+; CHECK-NEXT:    [[ADD6:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float [[ADDC]], <4 x float> [[TMP0]])
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = fadd float [[ADD6]], [[B]]
+; CHECK-NEXT:    ret float [[OP_RDX1]]
+;
+entry:
+  %0 = load float, ptr %a, align 4
+  %arrayidx1 = getelementptr inbounds i8, ptr %a, i64 4
+  %1 = load float, ptr %arrayidx1, align 4
+  %addc = fadd float %b, %0
+  %addb = fadd float %addc, %0
+  %add = fadd float %addb, %1
+  %arrayidx2 = getelementptr inbounds i8, ptr %a, i64 8
+  %2 = load float, ptr %arrayidx2, align 4
+  %add3 = fadd float %add, %2
+  %arrayidx4 = getelementptr inbounds i8, ptr %a, i64 12
+  %3 = load float, ptr %arrayidx4, align 4
+  %add5 = fadd float %add3, %3
+  %add6 = fadd float %add5, %b
+  ret float %add6
+}
+
+define double @test_reduce_multiple_reductions(ptr %freq, double %sum) {
+; CHECK-LABEL: define double @test_reduce_multiple_reductions(
+; CHECK-SAME: ptr [[FREQ:%.*]], double [[SUM:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <32 x double>, ptr [[FREQ]], align 8
+; CHECK-NEXT:    [[ARRAYIDX_32:%.*]] = getelementptr inbounds i8, ptr [[FREQ]], i64 256
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x double>, ptr [[ARRAYIDX_32]], align 8
+; CHECK-NEXT:    [[ARRAYIDX_48:%.*]] = getelementptr inbounds i8, ptr [[FREQ]], i64 384
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x double>, ptr [[ARRAYIDX_48]], align 8
+; CHECK-NEXT:    [[ARRAYIDX_56:%.*]] = getelementptr inbounds i8, ptr [[FREQ]], i64 448
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr [[ARRAYIDX_56]], align 8
+; CHECK-NEXT:    [[ARRAYIDX_60:%.*]] = getelementptr inbounds i8, ptr [[FREQ]], i64 480
+; CHECK-NEXT:    [[TMP4:%.*]] = load double, ptr [[ARRAYIDX_60]], align 8
+; CHECK-NEXT:    [[ARRAYIDX_61:%.*]] = getelementptr inbounds i8, ptr [[FREQ]], i64 488
+; CHECK-NEXT:    [[TMP5:%.*]] = load double, ptr [[ARRAYIDX_61]], align 8
+; CHECK-NEXT:    [[ARRAYIDX_62:%.*]] = getelementptr inbounds i8, ptr [[FREQ]], i64 496
+; CHECK-NEXT:    [[TMP6:%.*]] = load double, ptr [[ARRAYIDX_62]], align 8
+; CHECK-NEXT:    [[TMP7:%.*]] = call double @llvm.vector.reduce.fadd.v32f64(double -0.000000e+00, <32 x double> [[TMP0]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call double @llvm.vector.reduce.fadd.v16f64(double [[TMP7]], <16 x double> [[TMP1]])
+; CHECK-NEXT:    [[TMP9:%.*]] = call double @llvm.vector.reduce.fadd.v8f64(double [[TMP8]], <8 x double> [[TMP2]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call double @llvm.vector.reduce.fadd.v4f64(double [[TMP9]], <4 x double> [[TMP3]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd double [[TMP13]], [[TMP4]]
+; CHECK-NEXT:    [[ADD_61:%.*]] = fadd double [[OP_RDX]], [[TMP5]]
+; CHECK-NEXT:    [[ADD_62:%.*]] = fadd double [[ADD_61]], [[TMP6]]
+; CHECK-NEXT:    ret double [[ADD_62]]
+;
+entry:
+  %0 = load double, ptr %freq, align 8
+  %arrayidx.1 = getelementptr inbounds i8, ptr %freq, i64 8
+  %1 = load double, ptr %arrayidx.1, align 8
+  %add.1 = fadd double %0, %1
+  %arrayidx.2 = getelementptr inbounds i8, ptr %freq, i64 16
+  %2 = load double, ptr %arrayidx.2, align 8
+  %add.2 = fadd double %add.1, %2
+  %arrayidx.3 = getelementptr inbounds i8, ptr %freq, i64 24
+  %3 = load double, ptr %arrayidx.3, align 8
+  %add.3 = fadd double %add.2, %3
+  %arrayidx.4 = getelementptr inbounds i8, ptr %freq, i64 32
+  %4 = load double, ptr %arrayidx.4, align 8
+  %add.4 = fadd double %add.3, %4
+  %arrayidx.5 = getelementptr inbounds i8, ptr %freq, i64 40
+  %5 = load double, ptr %arrayidx.5, align 8
+  %add.5 = fadd double %add.4, %5
+  %arrayidx.6 = getelementptr inbounds i8, ptr %freq, i64 48
+  %6 = load double, ptr %arrayidx.6, align 8
+  %add.6 = fadd double %add.5, %6
+  %arrayidx.7 = getelementptr inbounds i8, ptr %freq, i64 56
+  %7 = load double, ptr %arrayidx.7, align 8
+  %add.7 = fadd double %add.6, %7
+  %arrayidx.8 = getelementptr inbounds i8, ptr %freq, i64 64
+  %8 = load double, ptr %arrayidx.8, align 8
+  %add.8 = fadd double %add.7, %8
+  %arrayidx.9 = getelementptr inbounds i8, ptr %freq, i64 72
+  %9 = load double, ptr %arrayidx.9, align 8
+  %add.9 = fadd double %add.8, %9
+  %arrayidx.10 = getelementptr inbounds i8, ptr %freq, i64 80
+  %10 = load double, ptr %arrayidx.10, align 8
+  %add.10 = fadd double %add.9, %10
+  %arrayidx.11 = getelementptr inbounds i8, ptr %freq, i64 88
+  %11 = load double, ptr %arrayidx.11, align 8
+  %add.11 = fadd double %add.10, %11
+  %arrayidx.12 = getelementptr inbounds i8, ptr %freq, i64 96
+  %12 = load double, ptr %arrayidx.12, align 8
+  %add.12 = fadd double %add.11, %12
+  %arrayidx.13 = getelementptr inbounds i8, ptr %freq, i64 104
+  %13 = load double, ptr %arrayidx.13, align 8
+  %add.13 = fadd double %add.12, %13
+  %arrayidx.14 = getelementptr inbounds i8, ptr %freq, i64 112
+  %14 = load double, ptr %arrayidx.14, align 8
+  %add.14 = fadd double %add.13, %14
+  %arrayidx.15 = getelementptr inbounds i8, ptr %freq, i64 120
+  %15 = load double, ptr %arrayidx.15, align 8
+  %add.15 = fadd double %add.14, %15
+  %arrayidx.16 = getelementptr inbounds i8, ptr %freq, i64 128
+  %16 = load double, ptr %arrayidx.16, align 8
+  %add.16 = fadd double %add.15, %16
+  %arrayidx.17 = getelementptr inbounds i8, ptr %freq, i64 136
+  %17 = load double, ptr %arrayidx.17, align 8
+  %add.17 = fadd double %add.16, %17
+  %arrayidx.18 = getelementptr inbounds i8, ptr %freq, i64 144
+  %18 = load double, ptr %arrayidx.18, align 8
+  %add.18 = fadd double %add.17, %18
+  %arrayidx.19 = getelementptr inbounds i8, ptr %freq, i64 152
+  %19 = load double, ptr %arrayidx.19, align 8
+  %add.19 = fadd double %add.18, %19
+  %arrayidx.20 = getelementptr inbounds i8, ptr %freq, i64 160
+  %20 = load double, ptr %arrayidx.20, align 8
+  %add.20 = fadd double %add.19, %20
+  %arrayidx.21 = getelementptr inbounds i8, ptr %freq, i64 168
+  %21 = load double, ptr %arrayidx.21, align 8
+  %add.21 = fadd double %add.20, %21
+  %arrayidx.22 = getelementptr inbounds i8, ptr %freq, i64 176
+  %22 = load double, ptr %arrayidx.22, align 8
+  %add.22 = fadd double %add.21, %22
+  %arrayidx.23 = getelementptr inbounds i8, ptr %freq, i64 184
+  %23 = load double, ptr %arrayidx.23, align 8
+  %add.23 = fadd double %add.22, %23
+  %arrayidx.24 = getelementptr inbounds i8, ptr %freq, i64 192
+  %24 = load double, ptr %arrayidx.24, align 8
+  %add.24 = fadd double %add.23, %24
+  %arrayidx.25 = getelementptr inbounds i8, ptr %freq, i64 200
+  %25 = load double, ptr %arrayidx.25, align 8
+  %add.25 = fadd double %add.24, %25
+  %arrayidx.26 = getelementptr inbounds i8, ptr %freq, i64 208
+  %26 = load double, ptr %arrayidx.26, align 8
+  %add.26 = fadd double %add.25, %26
+  %arrayidx.27 = getelementptr inbounds i8, ptr %freq, i64 216
+  %27 = load double, ptr %arrayidx.27, align 8
+  %add.27 = fadd double %add.26, %27
+  %arrayidx.28 = getelementptr inbounds i8, ptr %freq, i64 224
+  %28 = load double, ptr %arrayidx.28, align 8
+  %add.28 = fadd double %add.27, %28
+  %arrayidx.29 = getelementptr inbounds i8, ptr %freq, i64 232
+  %29 = load double, ptr %arrayidx.29, align 8
+  %add.29 = fadd double %add.28, %29
+  %arrayidx.30 = getelementptr inbounds i8, ptr %freq, i64 240
+  %30 = load double, ptr %arrayidx.30, align 8
+  %add.30 = fadd double %add.29, %30
+  %arrayidx.31 = getelementptr inbounds i8, ptr %freq, i64 248
+  %31 = load double, ptr %arrayidx.31, align 8
+  %add.31 = fadd double %add.30, %31
+  %arrayidx.32 = getelementptr inbounds i8, ptr %freq, i64 256
+  %32 = load double, ptr %arrayidx.32, align 8
+  %add.32 = fadd double %add.31, %32
+  %arrayidx.33 = getelementptr inbounds i8, ptr %freq, i64 264
+  %33 = load double, ptr %arrayidx.33, align 8
+  %add.33 = fadd double %add.32, %33
+  %arrayidx.34 = getelementptr inbounds i8, ptr %freq, i64 272
+  %34 = load double, ptr %arrayidx.34, align 8
+  %add.34 = fadd double %add.33, %34
+  %arrayidx.35 = getelementptr inbounds i8, ptr %freq, i64 280
+  %35 = load double, ptr %arrayidx.35, align 8
+  %add.35 = fadd double %add.34, %35
+  %arrayidx.36 = getelementptr inbounds i8, ptr %freq, i64 288
+  %36 = load double, ptr %arrayidx.36, align 8
+  %add.36 = fadd double %add.35, %36
+  %arrayidx.37 = getelementptr inbounds i8, ptr %freq, i64 296
+  %37 = load double, ptr %arrayidx.37, align 8
+  %add.37 = fadd double %add.36, %37
+  %arrayidx.38 = getelementptr inbounds i8, ptr %freq, i64 304
+  %38 = load double, ptr %arrayidx.38, align 8
+  %add.38 = fadd double %add.37, %38
+  %arrayidx.39 = getelementptr inbounds i8, ptr %freq, i64 312
+  %39 = load double, ptr %arrayidx.39, align 8
+  %add.39 = fadd double %add.38, %39
+  %arrayidx.40 = getelementptr inbounds i8, ptr %freq, i64 320
+  %40 = load double, ptr %arrayidx.40, align 8
+  %add.40 = fadd double %add.39, %40
+  %arrayidx.41 = getelementptr inbounds i8, ptr %freq, i64 328
+  %41 = load double, ptr %arrayidx.41, align 8
+  %add.41 = fadd double %add.40, %41
+  %arrayidx.42 = getelementptr inbounds i8, ptr %freq, i64 336
+  %42 = load double, ptr %arrayidx.42, align 8
+  %add.42 = fadd double %add.41, %42
+  %arrayidx.43 = getelementptr inbounds i8, ptr %freq, i64 344
+  %43 = load double, ptr %arrayidx.43, align 8
+  %add.43 = fadd double %add.42, %43
+  %arrayidx.44 = getelementptr inbounds i8, ptr %freq, i64 352
+  %44 = load double, ptr %arrayidx.44, align 8
+  %add.44 = fadd double %add.43, %44
+  %arrayidx.45 = getelementptr inbounds i8, ptr %freq, i64 360
+  %45 = load double, ptr %arrayidx.45, align 8
+  %add.45 = fadd double %add.44, %45
+  %arrayidx.46 = getelementptr inbounds i8, ptr %freq, i64 368
+  %46 = load double, ptr %arrayidx.46, align 8
+  %add.46 = fadd double %add.45, %46
+  %arrayidx.47 = getelementptr inbounds i8, ptr %freq, i64 376
+  %47 = load double, ptr %arrayidx.47, align 8
+  %add.47 = fadd double %add.46, %47
+  %arrayidx.48 = getelementptr inbounds i8, ptr %freq, i64 384
+  %48 = load double, ptr %arrayidx.48, align 8
+  %add.48 = fadd double %add.47, %48
+  %arrayidx.49 = getelementptr inbounds i8, ptr %freq, i64 392
+  %49 = load double, ptr %arrayidx.49, align 8
+  %add.49 = fadd double %add.48, %49
+  %arrayidx.50 = getelementptr inbounds i8, ptr %freq, i64 400
+  %50 = load double, ptr %arrayidx.50, align 8
+  %add.50 = fadd double %add.49, %50
+  %arrayidx.51 = getelementptr inbounds i8, ptr %freq, i64 408
+  %51 = load double, ptr %arrayidx.51, align 8
+  %add.51 = fadd double %add.50, %51
+  %arrayidx.52 = getelementptr inbounds i8, ptr %freq, i64 416
+  %52 = load double, ptr %arrayidx.52, align 8
+  %add.52 = fadd double %add.51, %52
+  %arrayidx.53 = getelementptr inbounds i8, ptr %freq, i64 424
+  %53 = load double, ptr %arrayidx.53, align 8
+  %add.53 = fadd double %add.52, %53
+  %arrayidx.54 = getelementptr inbounds i8, ptr %freq, i64 432
+  %54 = load double, ptr %arrayidx.54, align 8
+  %add.54 = fadd double %add.53, %54
+  %arrayidx.55 = getelementptr inbounds i8, ptr %freq, i64 440
+  %55 = load double, ptr %arrayidx.55, align 8
+  %add.55 = fadd double %add.54, %55
+  %arrayidx.56 = getelementptr inbounds i8, ptr %freq, i64 448
+  %56 = load double, ptr %arrayidx.56, align 8
+  %add.56 = fadd double %add.55, %56
+  %arrayidx.57 = getelementptr inbounds i8, ptr %freq, i64 456
+  %57 = load double, ptr %arrayidx.57, align 8
+  %add.57 = fadd double %add.56, %57
+  %arrayidx.58 = getelementptr inbounds i8, ptr %freq, i64 464
+  %58 = load double, ptr %arrayidx.58, align 8
+  %add.58 = fadd double %add.57, %58
+  %arrayidx.59 = getelementptr inbounds i8, ptr %freq, i64 472
+  %59 = load double, ptr %arrayidx.59, align 8
+  %add.59 = fadd double %add.58, %59
+  %arrayidx.60 = getelementptr inbounds i8, ptr %freq, i64 480
+  %60 = load double, ptr %arrayidx.60, align 8
+  %add.60 = fadd double %add.59, %60
+  %arrayidx.61 = getelementptr inbounds i8, ptr %freq, i64 488
+  %61 = load double, ptr %arrayidx.61, align 8
+  %add.61 = fadd double %add.60, %61
+  %arrayidx.62 = getelementptr inbounds i8, ptr %freq, i64 496
+  %62 = load double, ptr %arrayidx.62, align 8
+  %add.62 = fadd double %add.61, %62
+  ret double %add.62
+}

From a081c414ebf26a364f860dcfe226740ff14020bb Mon Sep 17 00:00:00 2001
From: sc-cluzze <d.marakulin at syntacore.com>
Date: Wed, 9 Jul 2025 16:04:01 +0000
Subject: [PATCH 2/2] [SLP] Moved matching of associative reduction to a
 separate function

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 140 ++++++++++++------
 1 file changed, 91 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 2c7929d91121f..999ed72dc9da1 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -22120,6 +22120,37 @@ class HorizontalReduction {
     }
     return true;
   }
+
+  /// Checks if the operands of the \p TreeN instruction are also reduction
+  /// operations or should be treated as reduced values or an extra argument,
+  /// which is not part of the reduction.
+  void CheckOperands(BoUpSLP &R, Instruction *TreeN, Instruction *Root,
+                     SmallVectorImpl<Value *> &PossibleReducedVals,
+                     SmallVectorImpl<Instruction *> &ReductionOps,
+                     unsigned Level) {
+    bool IsCmpSelMinMax = isCmpSelMinMax(Root);
+    for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
+                                  getNumberOfOperands(TreeN)))) {
+      Value *EdgeVal = getRdxOperand(TreeN, I);
+      ReducedValsToOps[EdgeVal].push_back(TreeN);
+      auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
+      // If the edge is not an instruction, or it is different from the main
+      // reduction opcode or has too many uses - possible reduced value.
+      // Also, do not try to reduce const values, if the operation is not
+      // foldable.
+      if (!EdgeInst || Level > RecursionMaxDepth ||
+          getRdxKind(EdgeInst) != RdxKind ||
+          IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
+          !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
+          !isVectorizable(RdxKind, EdgeInst) ||
+          (R.isAnalyzedReductionRoot(EdgeInst) &&
+           all_of(EdgeInst->operands(), IsaPred<Constant>))) {
+        PossibleReducedVals.push_back(EdgeVal);
+        continue;
+      }
+      ReductionOps.push_back(EdgeInst);
+    }
+  }
+
 public:
   HorizontalReduction() = default;
 
@@ -22144,42 +22175,14 @@ class HorizontalReduction {
         return false;
 
     ReductionRoot = Root;
-
+    if (isOrderedFaddReduction())
+      return false;
     // Iterate through all the operands of the possible reduction tree and
     // gather all the reduced values, sorting them by their value id.
     BasicBlock *BB = Root->getParent();
     bool IsCmpSelMinMax = isCmpSelMinMax(Root);
     SmallVector<std::pair<Instruction *, unsigned>> Worklist(
         1, std::make_pair(Root, 0));
-    // Checks if the operands of the \p TreeN instruction are also reduction
-    // operations or should be treated as reduced values or an extra argument,
-    // which is not part of the reduction.
-    auto CheckOperands = [&](Instruction *TreeN,
-                             SmallVectorImpl<Value *> &PossibleReducedVals,
-                             SmallVectorImpl<Instruction *> &ReductionOps,
-                             unsigned Level) {
-      for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
-                                    getNumberOfOperands(TreeN)))) {
-        Value *EdgeVal = getRdxOperand(TreeN, I);
-        ReducedValsToOps[EdgeVal].push_back(TreeN);
-        auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
-        // If the edge is not an instruction, or it is different from the main
-        // reduction opcode or has too many uses - possible reduced value.
-        // Also, do not try to reduce const values, if the operation is not
-        // foldable.
-        if (!EdgeInst || Level > RecursionMaxDepth ||
-            getRdxKind(EdgeInst) != RdxKind ||
-            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
-            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
-            !isVectorizable(RdxKind, EdgeInst) ||
-            (R.isAnalyzedReductionRoot(EdgeInst) &&
-             all_of(EdgeInst->operands(), IsaPred<Constant>))) {
-          PossibleReducedVals.push_back(EdgeVal);
-          continue;
-        }
-        ReductionOps.push_back(EdgeInst);
-      }
-    };
     // Try to regroup reduced values so that it gets more profitable to try to
     // reduce them. Values are grouped by their value ids, instructions - by
     // instruction op id and/or alternate op id, plus do extra analysis for
@@ -22229,15 +22232,14 @@ class HorizontalReduction {
       auto [TreeN, Level] = Worklist.pop_back_val();
       SmallVector<Value *> PossibleRedVals;
       SmallVector<Instruction *> PossibleReductionOps;
-      CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
+      CheckOperands(R, TreeN, Root, PossibleRedVals, PossibleReductionOps,
+                    Level);
       addReductionOps(TreeN);
       // Add reduction values. The values are sorted for better vectorization
       // results.
       for (Value *V : PossibleRedVals) {
         size_t Key = 0, Idx = 0;
-        if (!isOrderedFaddReduction())
-          std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
-                                                /*AllowAlternate=*/false);
+        std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
+                                              /*AllowAlternate=*/false);
         ++PossibleReducedVals[Key][Idx]
               .insert(std::make_pair(V, 0))
               .first->second;
@@ -22255,15 +22257,13 @@ class HorizontalReduction {
            It != E; ++It) {
         PossibleRedValsVect.emplace_back();
         auto RedValsVect = It->second.takeVector();
-        if (!isOrderedFaddReduction())
-          stable_sort(RedValsVect, llvm::less_second());
+        stable_sort(RedValsVect, llvm::less_second());
         for (const std::pair<Value *, unsigned> &Data : RedValsVect)
           PossibleRedValsVect.back().append(Data.second, Data.first);
       }
-      if (!isOrderedFaddReduction())
-        stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
-          return P1.size() > P2.size();
-        });
+      stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
+        return P1.size() > P2.size();
+      });
       int NewIdx = -1;
       for (ArrayRef<Value *> Data : PossibleRedValsVect) {
         if (NewIdx < 0 ||
@@ -22283,17 +22283,59 @@ class HorizontalReduction {
     }
     // Sort the reduced values by number of same/alternate opcode and/or pointer
     // operand.
-    if (!isOrderedFaddReduction())
-      stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
-        return P1.size() > P2.size();
-      });
+    stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
+      return P1.size() > P2.size();
+    });
+
+    if (!checkFastMathFlags())
+      return false;
+
+    return true;
+  }
+
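+  /// Tries to match a strictly ordered (non-reassociative) fadd reduction
+  /// rooted at \p Root, collecting the reduced values in their original
+  /// order.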
+  bool matchNonAssociativeReduction(BoUpSLP &R, Instruction *Root,
+                                    ScalarEvolution &SE, const DataLayout &DL,
+                                    const TargetLibraryInfo &TLI) {
+    RdxKind = HorizontalReduction::getRdxKind(Root);
+    if (!isVectorizable(RdxKind, Root))
+      return false;
 
-    if (isOrderedFaddReduction() &&
-        (ReducedVals.size() != 1 || ReducedVals[0].size() == 2 ||
-         !checkOperandsOrder()))
+    Type *Ty = Root->getType();
+    if (!isValidElementType(Ty) || Ty->isPointerTy())
       return false;
 
-    if (!checkFastMathFlags())
+    if (auto *Sel = dyn_cast<SelectInst>(Root))
+      if (!Sel->getCondition()->hasOneUse())
+        return false;
+
+    ReductionRoot = Root;
+    if (!isOrderedFaddReduction())
+      return false;
+
+    BasicBlock *BB = Root->getParent();
+    SmallVector<std::pair<Instruction *, unsigned>> Worklist(
+        1, std::make_pair(Root, 0));
+    initReductionOps(Root);
+    ReducedVals.resize(1);
+    SmallMapVector<Value *, size_t, 2> ReusedVals;
+    while (!Worklist.empty()) {
+      auto [TreeN, Level] = Worklist.pop_back_val();
+      SmallVector<Value *> PossibleRedVals;
+      SmallVector<Instruction *> PossibleReductionOps;
+      CheckOperands(R, TreeN, Root, PossibleRedVals, PossibleReductionOps,
+                    Level);
+      addReductionOps(TreeN);
+      for (Value *V : PossibleRedVals)
+        ++ReusedVals.insert(std::make_pair(V, 0)).first->second;
+
+      for (Instruction *I : reverse(PossibleReductionOps))
+        Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
+    }
+    for (std::pair<Value *, size_t> V : ReusedVals.takeVector())
+      ReducedVals[0].append(V.second, V.first);
+
+    std::reverse(ReducedVals[0].begin(), ReducedVals[0].end());
+
+    if (ReducedVals[0].size() == 2 || !checkOperandsOrder() ||
+        !checkFastMathFlags())
       return false;
 
     return true;
@@ -23811,7 +23853,7 @@ bool SLPVectorizerPass::vectorizeHorReduction(
     if (!isReductionCandidate(Inst))
       return nullptr;
     HorizontalReduction HorRdx;
-    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
+    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI) &&
+        !HorRdx.matchNonAssociativeReduction(R, Inst, *SE, *DL, *TLI))
       return nullptr;
     return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
   };


