[llvm] [SLP] Support ordered FAdd reductions in SLPVectorizer (PR #146570)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 9 09:08:34 PDT 2025
https://github.com/sc-clulzze updated https://github.com/llvm/llvm-project/pull/146570
>From dc56eb5cc2a3b75b6f7c06068112a0124dddd7bf Mon Sep 17 00:00:00 2001
From: sc-cluzze <d.marakulin at syntacore.com>
Date: Tue, 1 Jul 2025 16:19:06 +0000
Subject: [PATCH 1/2] [SLP] Support ordered FAdd reductions in slp-vectorizer
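
What this enables (a minimal sketch mirroring the test_reduce case added
below; the exact output is checked in the new tests): a strict, in-order
chain of fadds without the 'reassoc' flag, e.g.

  %add  = fadd float %0, %1
  %add3 = fadd float %add, %2
  %add5 = fadd float %add3, %3

is turned into a single sequential vector reduction

  %v = load <4 x float>, ptr %a, align 4
  %r = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %v)

which preserves the original evaluation order, because
llvm.vector.reduce.fadd without 'reassoc' is defined to reduce
sequentially, starting from its scalar start operand. A leading scalar
accumulator becomes that start operand, and any trailing scalar fadds are
kept as a scalar remainder (see fadd-scalar-remainder.ll).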
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 189 ++++++++--
.../SLPVectorizer/X86/dot-product.ll | 38 +--
llvm/test/Transforms/SLPVectorizer/X86/phi.ll | 53 ++-
.../SLPVectorizer/fadd-scalar-remainder.ll | 93 +++++
.../SLPVectorizer/fadd-vectorize.ll | 323 ++++++++++++++++++
5 files changed, 605 insertions(+), 91 deletions(-)
create mode 100644 llvm/test/Transforms/SLPVectorizer/fadd-scalar-remainder.ll
create mode 100644 llvm/test/Transforms/SLPVectorizer/fadd-vectorize.ll
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0941bf61953f1..2c7929d91121f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -110,11 +110,16 @@ using namespace std::placeholders;
#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"
+STATISTIC(NumFaddVectorized, "Number of vectorized fadd reductions");
STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
"Controls which SLP graphs should be vectorized.");
+static cl::opt<bool> SLPEnableOrderedFPReductions(
+ "slp-ordered-fp-reds", cl::init(true), cl::Hidden,
+ cl::desc("Enable vectorization of ordered floating point reductions"));
+
static cl::opt<bool>
RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
cl::desc("Run the SLP vectorization passes"));
@@ -1850,6 +1855,11 @@ class BoUpSLP {
return VectorizableTree.front()->Scalars;
}
+ bool areAllEntriesIdentityOrdered() const {
+ return all_of(VectorizableTree,
+ [&](auto &Entry) { return Entry->ReorderIndices.empty(); });
+ }
+
/// Returns the type/is-signed info for the root node in the graph without
/// casting.
std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
@@ -21774,6 +21784,8 @@ class HorizontalReduction {
/// signedness.
SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
+ SmallVector<Value *, 2> InitialFAddValues;
+
static bool isCmpSelMinMax(Instruction *I) {
return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
@@ -21787,6 +21799,14 @@ class HorizontalReduction {
(match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
}
+ bool isOrderedFaddReduction() const {
+ if (!isa<Instruction>(ReductionRoot))
+ return false;
+ auto *I = cast<Instruction>(ReductionRoot);
+ return (RdxKind == RecurKind::FAdd) &&
+ !I->getFastMathFlags().allowReassoc();
+ }
+
/// Checks if instruction is associative and can be vectorized.
static bool isVectorizable(RecurKind Kind, Instruction *I) {
if (Kind == RecurKind::None)
@@ -21807,6 +21827,9 @@ class HorizontalReduction {
if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
return true;
+ if (Kind == RecurKind::FAdd && SLPEnableOrderedFPReductions)
+ return true;
+
return I->isAssociative();
}
@@ -22066,6 +22089,37 @@ class HorizontalReduction {
(I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
}
+ bool checkOperandsOrder() const {
+ auto OpsVec = reverse(ReductionOps[0]);
+ if (!isOrderedFaddReduction() || empty(OpsVec))
+ return false;
+ Value *PrevOperand = *OpsVec.begin();
+ for (auto *I : drop_begin(OpsVec)) {
+ Value *Op1 = cast<BinaryOperator>(I)->getOperand(0);
+ if (Op1 != PrevOperand)
+ return false;
+ PrevOperand = I;
+ }
+ return true;
+ }
+
+ bool checkFastMathFlags() const {
+ for (auto OpsVec : ReductionOps) {
+ if (OpsVec.size() <= 1)
+ continue;
+ Value *V = *OpsVec.begin();
+ if (!isa<FPMathOperator>(V))
+ continue;
+ bool Flag = cast<Instruction>(V)->getFastMathFlags().allowReassoc();
+ auto It = find_if(drop_begin(OpsVec), [&](Value *I) {
+ auto CurFlag = cast<Instruction>(I)->getFastMathFlags().allowReassoc();
+ return (Flag != CurFlag);
+ });
+ if (It != OpsVec.end())
+ return false;
+ }
+ return true;
+ }
public:
HorizontalReduction() = default;
@@ -22180,9 +22234,10 @@ class HorizontalReduction {
// Add reduction values. The values are sorted for better vectorization
// results.
for (Value *V : PossibleRedVals) {
- size_t Key, Idx;
- std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
- /*AllowAlternate=*/false);
+ size_t Key = 0, Idx = 0;
+ if (!isOrderedFaddReduction())
+ std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
+ /*AllowAlternate=*/false);
++PossibleReducedVals[Key][Idx]
.insert(std::make_pair(V, 0))
.first->second;
@@ -22200,13 +22255,15 @@ class HorizontalReduction {
It != E; ++It) {
PossibleRedValsVect.emplace_back();
auto RedValsVect = It->second.takeVector();
- stable_sort(RedValsVect, llvm::less_second());
+ if (!isOrderedFaddReduction())
+ stable_sort(RedValsVect, llvm::less_second());
for (const std::pair<Value *, unsigned> &Data : RedValsVect)
PossibleRedValsVect.back().append(Data.second, Data.first);
}
- stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
- return P1.size() > P2.size();
- });
+ if (!isOrderedFaddReduction())
+ stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
+ return P1.size() > P2.size();
+ });
int NewIdx = -1;
for (ArrayRef<Value *> Data : PossibleRedValsVect) {
if (NewIdx < 0 ||
@@ -22226,9 +22283,19 @@ class HorizontalReduction {
}
// Sort the reduced values by number of same/alternate opcode and/or pointer
// operand.
- stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
- return P1.size() > P2.size();
- });
+ if (!isOrderedFaddReduction())
+ stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
+ return P1.size() > P2.size();
+ });
+
+ if (isOrderedFaddReduction() &&
+ (ReducedVals.size() != 1 || ReducedVals[0].size() == 2 ||
+ !checkOperandsOrder()))
+ return false;
+
+ if (!checkFastMathFlags())
+ return false;
+
return true;
}
@@ -22423,7 +22490,7 @@ class HorizontalReduction {
// original scalar identity operations on matched horizontal reductions).
IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
RdxKind != RecurKind::FMul &&
- RdxKind != RecurKind::FMulAdd;
+ RdxKind != RecurKind::FMulAdd && !isOrderedFaddReduction();
// Gather same values.
SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
if (IsSupportedHorRdxIdentityOp)
@@ -22524,6 +22591,8 @@ class HorizontalReduction {
return IsAnyRedOpGathered;
};
bool AnyVectorized = false;
+ Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
+ Instruction *InsertPt = RdxRootInst;
SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
while (Pos < NumReducedVals - ReduxWidth + 1 &&
ReduxWidth >= ReductionLimit) {
@@ -22684,8 +22753,6 @@ class HorizontalReduction {
// Emit a reduction. If the root is a select (min/max idiom), the insert
// point is the compare condition of that select.
- Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
- Instruction *InsertPt = RdxRootInst;
if (IsCmpSelMinMax)
InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
@@ -22738,6 +22805,41 @@ class HorizontalReduction {
if (!V.isVectorized(RdxVal))
RequiredExtract.insert(RdxVal);
}
+
+ auto FirstIt = find_if(ReducedVals[0], [&](Value *RdxVal) {
+ return VectorizedVals.lookup(RdxVal);
+ });
+ auto LastIt = find_if(reverse(ReducedVals[0]), [&](Value *RdxVal) {
+ return VectorizedVals.lookup(RdxVal);
+ });
+ if (isOrderedFaddReduction()) {
+ // [FirstIt, LastIt] - range of vectorized Vals; we need it to get the last
+ // non-vectorized Val at the beginning and its ReductionOp, and the first
+ // non-vectorized Val at the end and its ReductionOp:
+ // fadd - initial value for reduction
+ // fadd - v
+ // fadd - v
+ // fadd - v
+ // fadd - v
+ // fadd - scalar remainder
+ if (LastIt != ReducedVals[0].rend())
+ ReductionRoot =
+ cast<Instruction>(ReducedValsToOps.find(*LastIt)->second[0]);
+
+ if (InitialFAddValues.empty()) {
+ auto *FAddBinOp = cast<BinaryOperator>(
+ ReducedValsToOps.find(*FirstIt)->second[0]);
+ Value *InitialFAddValue = ConstantExpr::getBinOpIdentity(
+ FAddBinOp->getOpcode(), FAddBinOp->getType());
+ if (FirstIt != ReducedVals[0].end()) {
+ auto *Op1 = FAddBinOp->getOperand(0);
+ if (!isa<PoisonValue>(Op1))
+ InitialFAddValue = Op1;
+ }
+ InitialFAddValues.push_back(InitialFAddValue);
+ }
+ }
+
Pos += ReduxWidth;
Start = Pos;
ReduxWidth = NumReducedVals - Pos;
@@ -22755,10 +22857,27 @@ class HorizontalReduction {
continue;
}
}
- if (!VectorValuesAndScales.empty())
- VectorizedTree = GetNewVectorizedTree(
- VectorizedTree,
- emitReduction(Builder, *TTI, ReductionRoot->getType()));
+ if (!VectorValuesAndScales.empty()) {
+ if (!isOrderedFaddReduction()) {
+ VectorizedTree = GetNewVectorizedTree(
+ VectorizedTree,
+ emitReduction(Builder, *TTI, ReductionRoot->getType()));
+ } else {
+ for (auto V : VectorValuesAndScales) {
+ Value *InitialFAddValue = InitialFAddValues.back();
+ VectorizedTree = Builder.CreateFAddReduce(InitialFAddValue, std::get<0>(V));
+ InitialFAddValues.push_back(VectorizedTree);
+ }
+ auto LastIt = find_if(reverse(ReducedVals[0]), [&](Value *RdxVal) {
+ return VectorizedVals.lookup(RdxVal);
+ });
+ for_each(reverse(make_range(LastIt.base(), ReducedVals[0].end())),
+ [&](Value *V) {
+ ReducedValsToOps.find(V)->second[0]->moveAfter(
+ cast<Instruction>(VectorizedTree));
+ });
+ }
+ }
if (VectorizedTree) {
// Reorder operands of bool logical op in the natural order to avoid
// possible problem with poison propagation. If not possible to reorder
@@ -22846,15 +22965,18 @@ class HorizontalReduction {
ExtraReductions.emplace_back(RedOp, RdxVal);
}
}
- // Iterate through all not-vectorized reduction values/extra arguments.
- bool InitStep = true;
- while (ExtraReductions.size() > 1) {
- SmallVector<std::pair<Instruction *, Value *>> NewReds =
- FinalGen(ExtraReductions, InitStep);
- ExtraReductions.swap(NewReds);
- InitStep = false;
+
+ if (!isOrderedFaddReduction()) {
+ // Iterate through all not-vectorized reduction values/extra arguments.
+ bool InitStep = true;
+ while (ExtraReductions.size() > 1) {
+ SmallVector<std::pair<Instruction *, Value *>> NewReds =
+ FinalGen(ExtraReductions, InitStep);
+ ExtraReductions.swap(NewReds);
+ InitStep = false;
+ }
+ VectorizedTree = ExtraReductions.front().second;
}
- VectorizedTree = ExtraReductions.front().second;
ReductionRoot->replaceAllUsesWith(VectorizedTree);
@@ -22868,21 +22990,28 @@ class HorizontalReduction {
IgnoreSet.insert_range(RdxOps);
#endif
for (ArrayRef<Value *> RdxOps : ReductionOps) {
+ SmallVector<Value *, 4> RdxOpsForDeletion;
for (Value *Ignore : RdxOps) {
- if (!Ignore)
+ if (!Ignore || (isOrderedFaddReduction() && !Ignore->use_empty() &&
+ !any_of(cast<Instruction>(Ignore)->operands(),
+ [](const Value *Val) {
+ return isa<PoisonValue>(Val);
+ })))
continue;
#ifndef NDEBUG
for (auto *U : Ignore->users()) {
- assert(IgnoreSet.count(U) &&
- "All users must be either in the reduction ops list.");
+ assert((IgnoreSet.count(U) ||
+ isOrderedFaddReduction()) &&
+ "All users must be either in the reduction ops list.");
}
#endif
if (!Ignore->use_empty()) {
Value *P = PoisonValue::get(Ignore->getType());
Ignore->replaceAllUsesWith(P);
}
+ RdxOpsForDeletion.push_back(Ignore);
}
- V.removeInstructionsAndOperands(RdxOps, VectorValuesAndScales);
+ V.removeInstructionsAndOperands(ArrayRef(RdxOpsForDeletion), VectorValuesAndScales);
}
} else if (!CheckForReusedReductionOps) {
for (ReductionOpsType &RdxOps : ReductionOps)
@@ -22961,6 +23090,8 @@ class HorizontalReduction {
continue;
}
InstructionCost ScalarCost = 0;
+ if (RdxVal->use_empty())
+ continue;
for (User *U : RdxVal->users()) {
auto *RdxOp = cast<Instruction>(U);
if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
index f16c879c451c2..8f541a3dface3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
@@ -10,21 +10,10 @@
define double @dot4f64(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) {
; CHECK-LABEL: @dot4f64(
-; CHECK-NEXT: [[PTRX2:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 2
-; CHECK-NEXT: [[PTRY2:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[PTRX2]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr [[PTRY2]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP4]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
-; CHECK-NEXT: [[DOT01:%.*]] = fadd double [[TMP7]], [[TMP8]]
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
-; CHECK-NEXT: [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
-; CHECK-NEXT: [[DOT0123:%.*]] = fadd double [[DOT012]], [[TMP10]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[DOT0123:%.*]] = call double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[TMP3]])
; CHECK-NEXT: ret double [[DOT0123]]
;
%ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
@@ -53,21 +42,10 @@ define double @dot4f64(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %p
define float @dot4f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
; CHECK-LABEL: @dot4f32(
-; CHECK-NEXT: [[PTRX2:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 2
-; CHECK-NEXT: [[PTRY2:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[PTRX2]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, ptr [[PTRY2]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x float> [[TMP4]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
-; CHECK-NEXT: [[DOT01:%.*]] = fadd float [[TMP7]], [[TMP8]]
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
-; CHECK-NEXT: [[DOT012:%.*]] = fadd float [[DOT01]], [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
-; CHECK-NEXT: [[DOT0123:%.*]] = fadd float [[DOT012]], [[TMP10]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[DOT0123:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP3]])
; CHECK-NEXT: ret float [[DOT0123]]
;
%ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
index 17ae33652b6d8..c1a0c293ef9b9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll
@@ -136,44 +136,39 @@ for.end: ; preds = %for.body
define float @foo3(ptr nocapture readonly %A) #0 {
; CHECK-LABEL: @foo3(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX1:%.*]], align 4
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 4
+; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX4]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> <i32 0, i32 1>
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP15:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x float> [ [[TMP0]], [[ENTRY]] ], [ [[TMP7:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
-; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 7.000000e+00
-; CHECK-NEXT: [[ADD6]] = fadd float [[R_052]], [[MUL]]
; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP6]]
+; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX14]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3
-; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV_NEXT]]
-; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX14]], align 4
+; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT: [[TMP11:%.*]] = add nsw i64 [[INDVARS_IV]], 4
+; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[ARRAYIDX1]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX24]], align 4
; CHECK-NEXT: [[TMP7]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 poison>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> <i32 5, i32 1, i32 2, i32 poison>
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> poison, float [[TMP9]], i32 2
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 5>
-; CHECK-NEXT: [[TMP14:%.*]] = fmul <4 x float> [[TMP13]], <float 8.000000e+00, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01>
+; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> [[TMP12]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 4>
+; CHECK-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP18]], <2 x float> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP14:%.*]] = fmul <4 x float> [[TMP13]], <float 7.000000e+00, float 8.000000e+00, float 9.000000e+00, float 1.000000e+01>
; CHECK-NEXT: [[TMP15]] = fadd <4 x float> [[TMP3]], [[TMP14]]
+; CHECK-NEXT: [[MUL25:%.*]] = fmul float [[TMP8]], 1.100000e+01
+; CHECK-NEXT: [[ADD6]] = fadd float [[R_052]], [[MUL25]]
; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP16]], 121
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
; CHECK: for.end:
-; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP15]], i32 0
-; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP17]]
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP15]], i32 1
-; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP18]]
-; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP15]], i32 2
-; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP19]]
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP15]], i32 3
-; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP20]]
+; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP15]])
+; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[TMP17]], [[ADD6]]
; CHECK-NEXT: ret float [[ADD31]]
;
entry:
@@ -237,19 +232,13 @@ define float @sort_phi_type(ptr nocapture readonly %A) {
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x float> [ splat (float 1.000000e+01), [[ENTRY]] ], [ [[TMP2:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
-; CHECK-NEXT: [[TMP2]] = fmul <4 x float> [[TMP1]], <float 8.000000e+00, float 9.000000e+00, float 1.000000e+02, float 1.110000e+02>
+; CHECK-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[TMP0]], <float 8.000000e+00, float 9.000000e+00, float 1.000000e+02, float 1.110000e+02>
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 4
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], 128
+; CHECK-NEXT: [[TMP2]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
; CHECK: for.end:
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
-; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
-; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP5]]
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
-; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP6]]
+; CHECK-NEXT: [[ADD31:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP1]])
; CHECK-NEXT: ret float [[ADD31]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/fadd-scalar-remainder.ll b/llvm/test/Transforms/SLPVectorizer/fadd-scalar-remainder.ll
new file mode 100644
index 0000000000000..46aba65eb1b29
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/fadd-scalar-remainder.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer < %s | FileCheck %s
+
+define float @_Z3fooPi(ptr %a){
+; CHECK-LABEL: define float @_Z3fooPi(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[A]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <8 x i32> [[TMP0]], [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x double>
+; CHECK-NEXT: [[TMP3:%.*]] = fdiv <8 x double> [[TMP2]], splat (double 1.000000e-01)
+; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 32
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_8]], align 4
+; CHECK-NEXT: [[MUL_8:%.*]] = mul nsw i32 [[TMP4]], [[TMP4]]
+; CHECK-NEXT: [[CONV_8:%.*]] = uitofp nneg i32 [[MUL_8]] to double
+; CHECK-NEXT: [[DIV_8:%.*]] = fdiv double [[CONV_8]], 1.000000e-01
+; CHECK-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 36
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_9]], align 4
+; CHECK-NEXT: [[MUL_9:%.*]] = mul nsw i32 [[TMP5]], [[TMP5]]
+; CHECK-NEXT: [[CONV_9:%.*]] = uitofp nneg i32 [[MUL_9]] to double
+; CHECK-NEXT: [[OP_RDX:%.*]] = fdiv double [[CONV_9]], 1.000000e-01
+; CHECK-NEXT: [[TMP7:%.*]] = call double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP3]])
+; CHECK-NEXT: [[ADD_8:%.*]] = fadd double [[TMP7]], [[DIV_8]]
+; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd double [[ADD_8]], [[OP_RDX]]
+; CHECK-NEXT: [[DIV4:%.*]] = fdiv double [[OP_RDX1]], 5.000000e+03
+; CHECK-NEXT: [[SUB:%.*]] = fadd double [[DIV4]], -5.000000e+03
+; CHECK-NEXT: [[CONV6:%.*]] = fptrunc double [[SUB]] to float
+; CHECK-NEXT: ret float [[CONV6]]
+;
+entry:
+ %0 = load i32, ptr %a, align 4
+ %mul = mul nsw i32 %0, %0
+ %conv = uitofp nneg i32 %mul to double
+ %div = fdiv double %conv, 1.000000e-01
+ %arrayidx.1 = getelementptr inbounds i8, ptr %a, i64 4
+ %1 = load i32, ptr %arrayidx.1, align 4
+ %mul.1 = mul nsw i32 %1, %1
+ %conv.1 = uitofp nneg i32 %mul.1 to double
+ %div.1 = fdiv double %conv.1, 1.000000e-01
+ %add.1 = fadd double %div, %div.1
+ %arrayidx.2 = getelementptr inbounds i8, ptr %a, i64 8
+ %2 = load i32, ptr %arrayidx.2, align 4
+ %mul.2 = mul nsw i32 %2, %2
+ %conv.2 = uitofp nneg i32 %mul.2 to double
+ %div.2 = fdiv double %conv.2, 1.000000e-01
+ %add.2 = fadd double %add.1, %div.2
+ %arrayidx.3 = getelementptr inbounds i8, ptr %a, i64 12
+ %3 = load i32, ptr %arrayidx.3, align 4
+ %mul.3 = mul nsw i32 %3, %3
+ %conv.3 = uitofp nneg i32 %mul.3 to double
+ %div.3 = fdiv double %conv.3, 1.000000e-01
+ %add.3 = fadd double %add.2, %div.3
+ %arrayidx.4 = getelementptr inbounds i8, ptr %a, i64 16
+ %4 = load i32, ptr %arrayidx.4, align 4
+ %mul.4 = mul nsw i32 %4, %4
+ %conv.4 = uitofp nneg i32 %mul.4 to double
+ %div.4 = fdiv double %conv.4, 1.000000e-01
+ %add.4 = fadd double %add.3, %div.4
+ %arrayidx.5 = getelementptr inbounds i8, ptr %a, i64 20
+ %5 = load i32, ptr %arrayidx.5, align 4
+ %mul.5 = mul nsw i32 %5, %5
+ %conv.5 = uitofp nneg i32 %mul.5 to double
+ %div.5 = fdiv double %conv.5, 1.000000e-01
+ %add.5 = fadd double %add.4, %div.5
+ %arrayidx.6 = getelementptr inbounds i8, ptr %a, i64 24
+ %6 = load i32, ptr %arrayidx.6, align 4
+ %mul.6 = mul nsw i32 %6, %6
+ %conv.6 = uitofp nneg i32 %mul.6 to double
+ %div.6 = fdiv double %conv.6, 1.000000e-01
+ %add.6 = fadd double %add.5, %div.6
+ %arrayidx.7 = getelementptr inbounds i8, ptr %a, i64 28
+ %7 = load i32, ptr %arrayidx.7, align 4
+ %mul.7 = mul nsw i32 %7, %7
+ %conv.7 = uitofp nneg i32 %mul.7 to double
+ %div.7 = fdiv double %conv.7, 1.000000e-01
+ %add.7 = fadd double %add.6, %div.7
+ %arrayidx.8 = getelementptr inbounds i8, ptr %a, i64 32
+ %8 = load i32, ptr %arrayidx.8, align 4
+ %mul.8 = mul nsw i32 %8, %8
+ %conv.8 = uitofp nneg i32 %mul.8 to double
+ %div.8 = fdiv double %conv.8, 1.000000e-01
+ %add.8 = fadd double %add.7, %div.8
+ %arrayidx.9 = getelementptr inbounds i8, ptr %a, i64 36
+ %9 = load i32, ptr %arrayidx.9, align 4
+ %mul.9 = mul nsw i32 %9, %9
+ %conv.9 = uitofp nneg i32 %mul.9 to double
+ %div.9 = fdiv double %conv.9, 1.000000e-01
+ %add.9 = fadd double %add.8, %div.9
+ %div4 = fdiv double %add.9, 5.000000e+03
+ %sub = fadd double %div4, -5.000000e+03
+ %conv6 = fptrunc double %sub to float
+ ret float %conv6
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/fadd-vectorize.ll b/llvm/test/Transforms/SLPVectorizer/fadd-vectorize.ll
new file mode 100644
index 0000000000000..356480bc11591
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/fadd-vectorize.ll
@@ -0,0 +1,323 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer < %s | FileCheck %s
+
+define float @test_reduce(ptr %a) {
+; CHECK-LABEL: define float @test_reduce(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP0]])
+; CHECK-NEXT: ret float [[TMP1]]
+;
+entry:
+ %0 = load float, ptr %a, align 4
+ %arrayidx1 = getelementptr inbounds i8, ptr %a, i64 4
+ %1 = load float, ptr %arrayidx1, align 4
+ %add = fadd float %0, %1
+ %arrayidx2 = getelementptr inbounds i8, ptr %a, i64 8
+ %2 = load float, ptr %arrayidx2, align 4
+ %add3 = fadd float %add, %2
+ %arrayidx4 = getelementptr inbounds i8, ptr %a, i64 12
+ %3 = load float, ptr %arrayidx4, align 4
+ %add5 = fadd float %add3, %3
+ ret float %add5
+}
+
+define float @test_no_reduce(ptr %a) {
+; CHECK-LABEL: define float @test_no_reduce(
+; CHECK-SAME: ptr [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A]], align 4
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 4
+; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 8
+; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 12
+; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT: [[ADD4:%.*]] = fadd float [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[ADD5:%.*]] = fadd float [[ADD]], [[ADD4]]
+; CHECK-NEXT: ret float [[ADD5]]
+;
+entry:
+ %0 = load float, ptr %a, align 4
+ %arrayidx1 = getelementptr inbounds i8, ptr %a, i64 4
+ %1 = load float, ptr %arrayidx1, align 4
+ %add = fadd float %0, %1
+ %arrayidx2 = getelementptr inbounds i8, ptr %a, i64 8
+ %2 = load float, ptr %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i8, ptr %a, i64 12
+ %3 = load float, ptr %arrayidx3, align 4
+ %add4 = fadd float %2, %3
+ %add5 = fadd float %add, %add4
+ ret float %add5
+}
+
+define float @test_reduce2(ptr %a, float %b) {
+; CHECK-LABEL: define float @test_reduce2(
+; CHECK-SAME: ptr [[A:%.*]], float [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP0]])
+; CHECK-NEXT: [[ADDB:%.*]] = fadd float [[TMP1]], [[B]]
+; CHECK-NEXT: ret float [[TMP1]]
+;
+entry:
+ %0 = load float, ptr %a, align 4
+ %arrayidx1 = getelementptr inbounds i8, ptr %a, i64 4
+ %1 = load float, ptr %arrayidx1, align 4
+ %add = fadd float %0, %1
+ %arrayidx2 = getelementptr inbounds i8, ptr %a, i64 8
+ %2 = load float, ptr %arrayidx2, align 4
+ %add3 = fadd float %add, %2
+ %arrayidx4 = getelementptr inbounds i8, ptr %a, i64 12
+ %3 = load float, ptr %arrayidx4, align 4
+ %add5 = fadd float %add3, %3
+ %addb = fadd float %add5, %b
+ ret float %add5
+}
+
+define float @test_reduce_multiple_use(ptr %a, float %b) {
+; CHECK-LABEL: define float @test_reduce_multiple_use(
+; CHECK-SAME: ptr [[A:%.*]], float [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[A]], align 4
+; CHECK-NEXT: [[ADDC:%.*]] = fadd float [[B]], [[TMP1]]
+; CHECK-NEXT: [[ADD6:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float [[ADDC]], <4 x float> [[TMP0]])
+; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd float [[ADD6]], [[B]]
+; CHECK-NEXT: ret float [[OP_RDX1]]
+;
+entry:
+ %0 = load float, ptr %a, align 4
+ %arrayidx1 = getelementptr inbounds i8, ptr %a, i64 4
+ %1 = load float, ptr %arrayidx1, align 4
+ %addc = fadd float %b, %0
+ %addb = fadd float %addc, %0
+ %add = fadd float %addb, %1
+ %arrayidx2 = getelementptr inbounds i8, ptr %a, i64 8
+ %2 = load float, ptr %arrayidx2, align 4
+ %add3 = fadd float %add, %2
+ %arrayidx4 = getelementptr inbounds i8, ptr %a, i64 12
+ %3 = load float, ptr %arrayidx4, align 4
+ %add5 = fadd float %add3, %3
+ %add6 = fadd float %add5, %b
+ ret float %add6
+}
+
+define double @test_reduce_multiple_reductions(ptr %freq, double %sum) {
+; CHECK-LABEL: define double @test_reduce_multiple_reductions(
+; CHECK-SAME: ptr [[FREQ:%.*]], double [[SUM:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = load <32 x double>, ptr [[FREQ]], align 8
+; CHECK-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds i8, ptr [[FREQ]], i64 256
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x double>, ptr [[ARRAYIDX_32]], align 8
+; CHECK-NEXT: [[ARRAYIDX_48:%.*]] = getelementptr inbounds i8, ptr [[FREQ]], i64 384
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x double>, ptr [[ARRAYIDX_48]], align 8
+; CHECK-NEXT: [[ARRAYIDX_56:%.*]] = getelementptr inbounds i8, ptr [[FREQ]], i64 448
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x double>, ptr [[ARRAYIDX_56]], align 8
+; CHECK-NEXT: [[ARRAYIDX_60:%.*]] = getelementptr inbounds i8, ptr [[FREQ]], i64 480
+; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARRAYIDX_60]], align 8
+; CHECK-NEXT: [[ARRAYIDX_61:%.*]] = getelementptr inbounds i8, ptr [[FREQ]], i64 488
+; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX_61]], align 8
+; CHECK-NEXT: [[ARRAYIDX_62:%.*]] = getelementptr inbounds i8, ptr [[FREQ]], i64 496
+; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[ARRAYIDX_62]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = call double @llvm.vector.reduce.fadd.v32f64(double -0.000000e+00, <32 x double> [[TMP0]])
+; CHECK-NEXT: [[TMP8:%.*]] = call double @llvm.vector.reduce.fadd.v16f64(double [[TMP7]], <16 x double> [[TMP1]])
+; CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.vector.reduce.fadd.v8f64(double [[TMP8]], <8 x double> [[TMP2]])
+; CHECK-NEXT: [[TMP13:%.*]] = call double @llvm.vector.reduce.fadd.v4f64(double [[TMP9]], <4 x double> [[TMP3]])
+; CHECK-NEXT: [[OP_RDX:%.*]] = fadd double [[TMP13]], [[TMP4]]
+; CHECK-NEXT: [[ADD_61:%.*]] = fadd double [[OP_RDX]], [[TMP5]]
+; CHECK-NEXT: [[ADD_62:%.*]] = fadd double [[ADD_61]], [[TMP6]]
+; CHECK-NEXT: ret double [[ADD_62]]
+;
+entry:
+ %0 = load double, ptr %freq, align 8
+ %arrayidx.1 = getelementptr inbounds i8, ptr %freq, i64 8
+ %1 = load double, ptr %arrayidx.1, align 8
+ %add.1 = fadd double %0, %1
+ %arrayidx.2 = getelementptr inbounds i8, ptr %freq, i64 16
+ %2 = load double, ptr %arrayidx.2, align 8
+ %add.2 = fadd double %add.1, %2
+ %arrayidx.3 = getelementptr inbounds i8, ptr %freq, i64 24
+ %3 = load double, ptr %arrayidx.3, align 8
+ %add.3 = fadd double %add.2, %3
+ %arrayidx.4 = getelementptr inbounds i8, ptr %freq, i64 32
+ %4 = load double, ptr %arrayidx.4, align 8
+ %add.4 = fadd double %add.3, %4
+ %arrayidx.5 = getelementptr inbounds i8, ptr %freq, i64 40
+ %5 = load double, ptr %arrayidx.5, align 8
+ %add.5 = fadd double %add.4, %5
+ %arrayidx.6 = getelementptr inbounds i8, ptr %freq, i64 48
+ %6 = load double, ptr %arrayidx.6, align 8
+ %add.6 = fadd double %add.5, %6
+ %arrayidx.7 = getelementptr inbounds i8, ptr %freq, i64 56
+ %7 = load double, ptr %arrayidx.7, align 8
+ %add.7 = fadd double %add.6, %7
+ %arrayidx.8 = getelementptr inbounds i8, ptr %freq, i64 64
+ %8 = load double, ptr %arrayidx.8, align 8
+ %add.8 = fadd double %add.7, %8
+ %arrayidx.9 = getelementptr inbounds i8, ptr %freq, i64 72
+ %9 = load double, ptr %arrayidx.9, align 8
+ %add.9 = fadd double %add.8, %9
+ %arrayidx.10 = getelementptr inbounds i8, ptr %freq, i64 80
+ %10 = load double, ptr %arrayidx.10, align 8
+ %add.10 = fadd double %add.9, %10
+ %arrayidx.11 = getelementptr inbounds i8, ptr %freq, i64 88
+ %11 = load double, ptr %arrayidx.11, align 8
+ %add.11 = fadd double %add.10, %11
+ %arrayidx.12 = getelementptr inbounds i8, ptr %freq, i64 96
+ %12 = load double, ptr %arrayidx.12, align 8
+ %add.12 = fadd double %add.11, %12
+ %arrayidx.13 = getelementptr inbounds i8, ptr %freq, i64 104
+ %13 = load double, ptr %arrayidx.13, align 8
+ %add.13 = fadd double %add.12, %13
+ %arrayidx.14 = getelementptr inbounds i8, ptr %freq, i64 112
+ %14 = load double, ptr %arrayidx.14, align 8
+ %add.14 = fadd double %add.13, %14
+ %arrayidx.15 = getelementptr inbounds i8, ptr %freq, i64 120
+ %15 = load double, ptr %arrayidx.15, align 8
+ %add.15 = fadd double %add.14, %15
+ %arrayidx.16 = getelementptr inbounds i8, ptr %freq, i64 128
+ %16 = load double, ptr %arrayidx.16, align 8
+ %add.16 = fadd double %add.15, %16
+ %arrayidx.17 = getelementptr inbounds i8, ptr %freq, i64 136
+ %17 = load double, ptr %arrayidx.17, align 8
+ %add.17 = fadd double %add.16, %17
+ %arrayidx.18 = getelementptr inbounds i8, ptr %freq, i64 144
+ %18 = load double, ptr %arrayidx.18, align 8
+ %add.18 = fadd double %add.17, %18
+ %arrayidx.19 = getelementptr inbounds i8, ptr %freq, i64 152
+ %19 = load double, ptr %arrayidx.19, align 8
+ %add.19 = fadd double %add.18, %19
+ %arrayidx.20 = getelementptr inbounds i8, ptr %freq, i64 160
+ %20 = load double, ptr %arrayidx.20, align 8
+ %add.20 = fadd double %add.19, %20
+ %arrayidx.21 = getelementptr inbounds i8, ptr %freq, i64 168
+ %21 = load double, ptr %arrayidx.21, align 8
+ %add.21 = fadd double %add.20, %21
+ %arrayidx.22 = getelementptr inbounds i8, ptr %freq, i64 176
+ %22 = load double, ptr %arrayidx.22, align 8
+ %add.22 = fadd double %add.21, %22
+ %arrayidx.23 = getelementptr inbounds i8, ptr %freq, i64 184
+ %23 = load double, ptr %arrayidx.23, align 8
+ %add.23 = fadd double %add.22, %23
+ %arrayidx.24 = getelementptr inbounds i8, ptr %freq, i64 192
+ %24 = load double, ptr %arrayidx.24, align 8
+ %add.24 = fadd double %add.23, %24
+ %arrayidx.25 = getelementptr inbounds i8, ptr %freq, i64 200
+ %25 = load double, ptr %arrayidx.25, align 8
+ %add.25 = fadd double %add.24, %25
+ %arrayidx.26 = getelementptr inbounds i8, ptr %freq, i64 208
+ %26 = load double, ptr %arrayidx.26, align 8
+ %add.26 = fadd double %add.25, %26
+ %arrayidx.27 = getelementptr inbounds i8, ptr %freq, i64 216
+ %27 = load double, ptr %arrayidx.27, align 8
+ %add.27 = fadd double %add.26, %27
+ %arrayidx.28 = getelementptr inbounds i8, ptr %freq, i64 224
+ %28 = load double, ptr %arrayidx.28, align 8
+ %add.28 = fadd double %add.27, %28
+ %arrayidx.29 = getelementptr inbounds i8, ptr %freq, i64 232
+ %29 = load double, ptr %arrayidx.29, align 8
+ %add.29 = fadd double %add.28, %29
+ %arrayidx.30 = getelementptr inbounds i8, ptr %freq, i64 240
+ %30 = load double, ptr %arrayidx.30, align 8
+ %add.30 = fadd double %add.29, %30
+ %arrayidx.31 = getelementptr inbounds i8, ptr %freq, i64 248
+ %31 = load double, ptr %arrayidx.31, align 8
+ %add.31 = fadd double %add.30, %31
+ %arrayidx.32 = getelementptr inbounds i8, ptr %freq, i64 256
+ %32 = load double, ptr %arrayidx.32, align 8
+ %add.32 = fadd double %add.31, %32
+ %arrayidx.33 = getelementptr inbounds i8, ptr %freq, i64 264
+ %33 = load double, ptr %arrayidx.33, align 8
+ %add.33 = fadd double %add.32, %33
+ %arrayidx.34 = getelementptr inbounds i8, ptr %freq, i64 272
+ %34 = load double, ptr %arrayidx.34, align 8
+ %add.34 = fadd double %add.33, %34
+ %arrayidx.35 = getelementptr inbounds i8, ptr %freq, i64 280
+ %35 = load double, ptr %arrayidx.35, align 8
+ %add.35 = fadd double %add.34, %35
+ %arrayidx.36 = getelementptr inbounds i8, ptr %freq, i64 288
+ %36 = load double, ptr %arrayidx.36, align 8
+ %add.36 = fadd double %add.35, %36
+ %arrayidx.37 = getelementptr inbounds i8, ptr %freq, i64 296
+ %37 = load double, ptr %arrayidx.37, align 8
+ %add.37 = fadd double %add.36, %37
+ %arrayidx.38 = getelementptr inbounds i8, ptr %freq, i64 304
+ %38 = load double, ptr %arrayidx.38, align 8
+ %add.38 = fadd double %add.37, %38
+ %arrayidx.39 = getelementptr inbounds i8, ptr %freq, i64 312
+ %39 = load double, ptr %arrayidx.39, align 8
+ %add.39 = fadd double %add.38, %39
+ %arrayidx.40 = getelementptr inbounds i8, ptr %freq, i64 320
+ %40 = load double, ptr %arrayidx.40, align 8
+ %add.40 = fadd double %add.39, %40
+ %arrayidx.41 = getelementptr inbounds i8, ptr %freq, i64 328
+ %41 = load double, ptr %arrayidx.41, align 8
+ %add.41 = fadd double %add.40, %41
+ %arrayidx.42 = getelementptr inbounds i8, ptr %freq, i64 336
+ %42 = load double, ptr %arrayidx.42, align 8
+ %add.42 = fadd double %add.41, %42
+ %arrayidx.43 = getelementptr inbounds i8, ptr %freq, i64 344
+ %43 = load double, ptr %arrayidx.43, align 8
+ %add.43 = fadd double %add.42, %43
+ %arrayidx.44 = getelementptr inbounds i8, ptr %freq, i64 352
+ %44 = load double, ptr %arrayidx.44, align 8
+ %add.44 = fadd double %add.43, %44
+ %arrayidx.45 = getelementptr inbounds i8, ptr %freq, i64 360
+ %45 = load double, ptr %arrayidx.45, align 8
+ %add.45 = fadd double %add.44, %45
+ %arrayidx.46 = getelementptr inbounds i8, ptr %freq, i64 368
+ %46 = load double, ptr %arrayidx.46, align 8
+ %add.46 = fadd double %add.45, %46
+ %arrayidx.47 = getelementptr inbounds i8, ptr %freq, i64 376
+ %47 = load double, ptr %arrayidx.47, align 8
+ %add.47 = fadd double %add.46, %47
+ %arrayidx.48 = getelementptr inbounds i8, ptr %freq, i64 384
+ %48 = load double, ptr %arrayidx.48, align 8
+ %add.48 = fadd double %add.47, %48
+ %arrayidx.49 = getelementptr inbounds i8, ptr %freq, i64 392
+ %49 = load double, ptr %arrayidx.49, align 8
+ %add.49 = fadd double %add.48, %49
+ %arrayidx.50 = getelementptr inbounds i8, ptr %freq, i64 400
+ %50 = load double, ptr %arrayidx.50, align 8
+ %add.50 = fadd double %add.49, %50
+ %arrayidx.51 = getelementptr inbounds i8, ptr %freq, i64 408
+ %51 = load double, ptr %arrayidx.51, align 8
+ %add.51 = fadd double %add.50, %51
+ %arrayidx.52 = getelementptr inbounds i8, ptr %freq, i64 416
+ %52 = load double, ptr %arrayidx.52, align 8
+ %add.52 = fadd double %add.51, %52
+ %arrayidx.53 = getelementptr inbounds i8, ptr %freq, i64 424
+ %53 = load double, ptr %arrayidx.53, align 8
+ %add.53 = fadd double %add.52, %53
+ %arrayidx.54 = getelementptr inbounds i8, ptr %freq, i64 432
+ %54 = load double, ptr %arrayidx.54, align 8
+ %add.54 = fadd double %add.53, %54
+ %arrayidx.55 = getelementptr inbounds i8, ptr %freq, i64 440
+ %55 = load double, ptr %arrayidx.55, align 8
+ %add.55 = fadd double %add.54, %55
+ %arrayidx.56 = getelementptr inbounds i8, ptr %freq, i64 448
+ %56 = load double, ptr %arrayidx.56, align 8
+ %add.56 = fadd double %add.55, %56
+ %arrayidx.57 = getelementptr inbounds i8, ptr %freq, i64 456
+ %57 = load double, ptr %arrayidx.57, align 8
+ %add.57 = fadd double %add.56, %57
+ %arrayidx.58 = getelementptr inbounds i8, ptr %freq, i64 464
+ %58 = load double, ptr %arrayidx.58, align 8
+ %add.58 = fadd double %add.57, %58
+ %arrayidx.59 = getelementptr inbounds i8, ptr %freq, i64 472
+ %59 = load double, ptr %arrayidx.59, align 8
+ %add.59 = fadd double %add.58, %59
+ %arrayidx.60 = getelementptr inbounds i8, ptr %freq, i64 480
+ %60 = load double, ptr %arrayidx.60, align 8
+ %add.60 = fadd double %add.59, %60
+ %arrayidx.61 = getelementptr inbounds i8, ptr %freq, i64 488
+ %61 = load double, ptr %arrayidx.61, align 8
+ %add.61 = fadd double %add.60, %61
+ %arrayidx.62 = getelementptr inbounds i8, ptr %freq, i64 496
+ %62 = load double, ptr %arrayidx.62, align 8
+ %add.62 = fadd double %add.61, %62
+ ret double %add.62
+}
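
(For reference, when the reduced values are vectorized in several chunks the
emitted reductions are chained: each call uses the previous result as its
start value, so the strict evaluation order is kept. A sketch of the shape
checked in test_reduce_multiple_reductions above:

  %r1 = call double @llvm.vector.reduce.fadd.v32f64(double -0.000000e+00, <32 x double> %v1)
  %r2 = call double @llvm.vector.reduce.fadd.v16f64(double %r1, <16 x double> %v2)

with %v1/%v2 standing in for the vectorized chunks.)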
>From a081c414ebf26a364f860dcfe226740ff14020bb Mon Sep 17 00:00:00 2001
From: sc-cluzze <d.marakulin at syntacore.com>
Date: Wed, 9 Jul 2025 16:04:01 +0000
Subject: [PATCH 2/2] [SLP] Moved matching of associative reduction to a
separate function
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 140 ++++++++++++------
1 file changed, 91 insertions(+), 49 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 2c7929d91121f..999ed72dc9da1 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -22120,6 +22120,37 @@ class HorizontalReduction {
}
return true;
}
+
+ // Checks if the operands of the \p TreeN instruction are also reduction
+ // operations or should be treated as reduced values or an extra argument,
+ // which is not part of the reduction.
+ void CheckOperands(BoUpSLP &R, Instruction *TreeN, Instruction *Root, SmallVectorImpl<Value *> &PossibleReducedVals,
+ SmallVectorImpl<Instruction *> &ReductionOps,
+ unsigned Level) {
+ bool IsCmpSelMinMax = isCmpSelMinMax(Root);
+ for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
+ getNumberOfOperands(TreeN)))) {
+ Value *EdgeVal = getRdxOperand(TreeN, I);
+ ReducedValsToOps[EdgeVal].push_back(TreeN);
+ auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
+ // If the edge is not an instruction, or it is different from the main
+ // reduction opcode or has too many uses - possible reduced value.
+ // Also, do not try to reduce const values, if the operation is not
+ // foldable.
+ if (!EdgeInst || Level > RecursionMaxDepth ||
+ getRdxKind(EdgeInst) != RdxKind ||
+ IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
+ !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
+ !isVectorizable(RdxKind, EdgeInst) ||
+ (R.isAnalyzedReductionRoot(EdgeInst) &&
+ all_of(EdgeInst->operands(), IsaPred<Constant>))) {
+ PossibleReducedVals.push_back(EdgeVal);
+ continue;
+ }
+ ReductionOps.push_back(EdgeInst);
+ }
+ };
+
public:
HorizontalReduction() = default;
@@ -22144,42 +22175,14 @@ class HorizontalReduction {
return false;
ReductionRoot = Root;
-
+ if (isOrderedFaddReduction())
+ return false;
// Iterate through all the operands of the possible reduction tree and
// gather all the reduced values, sorting them by their value id.
BasicBlock *BB = Root->getParent();
bool IsCmpSelMinMax = isCmpSelMinMax(Root);
SmallVector<std::pair<Instruction *, unsigned>> Worklist(
1, std::make_pair(Root, 0));
- // Checks if the operands of the \p TreeN instruction are also reduction
- // operations or should be treated as reduced values or an extra argument,
- // which is not part of the reduction.
- auto CheckOperands = [&](Instruction *TreeN,
- SmallVectorImpl<Value *> &PossibleReducedVals,
- SmallVectorImpl<Instruction *> &ReductionOps,
- unsigned Level) {
- for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
- getNumberOfOperands(TreeN)))) {
- Value *EdgeVal = getRdxOperand(TreeN, I);
- ReducedValsToOps[EdgeVal].push_back(TreeN);
- auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
- // If the edge is not an instruction, or it is different from the main
- // reduction opcode or has too many uses - possible reduced value.
- // Also, do not try to reduce const values, if the operation is not
- // foldable.
- if (!EdgeInst || Level > RecursionMaxDepth ||
- getRdxKind(EdgeInst) != RdxKind ||
- IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
- !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
- !isVectorizable(RdxKind, EdgeInst) ||
- (R.isAnalyzedReductionRoot(EdgeInst) &&
- all_of(EdgeInst->operands(), IsaPred<Constant>))) {
- PossibleReducedVals.push_back(EdgeVal);
- continue;
- }
- ReductionOps.push_back(EdgeInst);
- }
- };
// Try to regroup reduced values so that it gets more profitable to try to
// reduce them. Values are grouped by their value ids, instructions - by
// instruction op id and/or alternate op id, plus do extra analysis for
@@ -22229,15 +22232,14 @@ class HorizontalReduction {
auto [TreeN, Level] = Worklist.pop_back_val();
SmallVector<Value *> PossibleRedVals;
SmallVector<Instruction *> PossibleReductionOps;
- CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
+ CheckOperands(R, TreeN, Root, PossibleRedVals, PossibleReductionOps, Level);
addReductionOps(TreeN);
// Add reduction values. The values are sorted for better vectorization
// results.
for (Value *V : PossibleRedVals) {
size_t Key = 0, Idx = 0;
- if (!isOrderedFaddReduction())
- std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
- /*AllowAlternate=*/false);
+ std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
+ /*AllowAlternate=*/false);
++PossibleReducedVals[Key][Idx]
.insert(std::make_pair(V, 0))
.first->second;
@@ -22255,15 +22257,13 @@ class HorizontalReduction {
It != E; ++It) {
PossibleRedValsVect.emplace_back();
auto RedValsVect = It->second.takeVector();
- if (!isOrderedFaddReduction())
- stable_sort(RedValsVect, llvm::less_second());
+ stable_sort(RedValsVect, llvm::less_second());
for (const std::pair<Value *, unsigned> &Data : RedValsVect)
PossibleRedValsVect.back().append(Data.second, Data.first);
}
- if (!isOrderedFaddReduction())
- stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
- return P1.size() > P2.size();
- });
+ stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
+ return P1.size() > P2.size();
+ });
int NewIdx = -1;
for (ArrayRef<Value *> Data : PossibleRedValsVect) {
if (NewIdx < 0 ||
@@ -22283,17 +22283,59 @@ class HorizontalReduction {
}
// Sort the reduced values by number of same/alternate opcode and/or pointer
// operand.
- if (!isOrderedFaddReduction())
- stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
- return P1.size() > P2.size();
- });
+ stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
+ return P1.size() > P2.size();
+ });
+
+ if (!checkFastMathFlags())
+ return false;
+
+ return true;
+ }
+
+ bool matchNonAssociativeReduction(BoUpSLP &R, Instruction *Root,
+ ScalarEvolution &SE, const DataLayout &DL,
+ const TargetLibraryInfo &TLI) {
+ RdxKind = HorizontalReduction::getRdxKind(Root);
+ if (!isVectorizable(RdxKind, Root))
+ return false;
- if (isOrderedFaddReduction() &&
- (ReducedVals.size() != 1 || ReducedVals[0].size() == 2 ||
- !checkOperandsOrder()))
+ Type *Ty = Root->getType();
+ if (!isValidElementType(Ty) || Ty->isPointerTy())
return false;
- if (!checkFastMathFlags())
+ if (auto *Sel = dyn_cast<SelectInst>(Root))
+ if (!Sel->getCondition()->hasOneUse())
+ return false;
+
+ ReductionRoot = Root;
+ if (!isOrderedFaddReduction())
+ return false;
+
+ BasicBlock *BB = Root->getParent();
+ SmallVector<std::pair<Instruction *, unsigned>> Worklist(
+ 1, std::make_pair(Root, 0));
+ initReductionOps(Root);
+ ReducedVals.resize(1);
+ SmallMapVector<Value *, size_t, 2> ReusedVals;
+ while (!Worklist.empty()) {
+ auto [TreeN, Level] = Worklist.pop_back_val();
+ SmallVector<Value *> PossibleRedVals;
+ SmallVector<Instruction *> PossibleReductionOps;
+ CheckOperands(R, TreeN, Root, PossibleRedVals, PossibleReductionOps, Level);
+ addReductionOps(TreeN);
+ for (Value *V : PossibleRedVals)
+ ++ReusedVals.insert(std::make_pair(V, 0)).first->second;
+
+ for (Instruction *I : reverse(PossibleReductionOps))
+ Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
+ }
+ for (std::pair<Value *, size_t> V : ReusedVals.takeVector())
+ ReducedVals[0].append(V.second, V.first);
+
+ std::reverse(ReducedVals[0].begin(), ReducedVals[0].end());
+
+ if (ReducedVals[0].size() == 2 || !checkOperandsOrder() || !checkFastMathFlags())
return false;
return true;
@@ -23811,7 +23853,7 @@ bool SLPVectorizerPass::vectorizeHorReduction(
if (!isReductionCandidate(Inst))
return nullptr;
HorizontalReduction HorRdx;
- if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
+ if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI) && !HorRdx.matchNonAssociativeReduction(R, Inst, *SE, *DL, *TLI))
return nullptr;
return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
};