[llvm] [SLP] Improved/fixed FMAD support in reductions (PR #152787)
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 8 13:26:59 PDT 2025
https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/152787
From 611246c7328ac226ac3773eadf57f92c7ef648cc Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Fri, 8 Aug 2025 20:17:43 +0000
Subject: [PATCH 1/2] [𝘀𝗽𝗿] initial version
Created using spr 1.3.5
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 83 ++++++--
.../AArch64/reused-scalar-repeated-in-node.ll | 187 +++++++++---------
.../AArch64/scalarization-overhead.ll | 57 ++----
.../SLPVectorizer/RISCV/vec3-base.ll | 94 +++++----
.../SLPVectorizer/X86/dot-product.ll | 94 ++-------
.../SLPVectorizer/X86/horizontal-list.ll | 154 +++------------
.../X86/redux-feed-buildvector.ll | 69 ++-----
.../X86/redux-feed-insertelement.ll | 22 +--
.../SLPVectorizer/X86/slp-fma-loss.ll | 24 ++-
9 files changed, 308 insertions(+), 476 deletions(-)
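The commit message says only "initial version", so in short: the patch stops rejecting FMA-convertible reductions up front in matchAssociativeReduction and instead models them in getReductionCost. On the scalar side, each reduction operation that canConvertToFMA accepts is charged the FMA cost minus the scalar fmul cost (the fmul is costed elsewhere in the tree); on the vector side, if every reduced value is a single-use fmul, a vector fmuladd is costed and the vector fmul cost subtracted. Below is a minimal sketch of the vector-side adjustment; the helper name and flattened signature are illustrative only, since in the patch this logic sits inline in HorizontalReduction::getReductionCost() and threads DT/DL/TLI into canConvertToFMA:

  // Sketch, not the verbatim patch: incremental vector cost of the
  // reduction op when the fadd can fold into an FMA.
  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/FMF.h"
  #include "llvm/IR/Instruction.h"
  #include "llvm/IR/Intrinsics.h"

  using namespace llvm;

  // Hypothetical helper for illustration.
  static InstructionCost
  vectorFAddCostWithFMA(TargetTransformInfo &TTI, VectorType *RVecTy,
                        TargetTransformInfo::TargetCostKind CostKind,
                        bool CanFoldToFMA, FastMathFlags FMF) {
    if (!CanFoldToFMA) // operands are not all single-use fmuls
      return TTI.getArithmeticInstrCost(Instruction::FAdd, RVecTy, CostKind);
    // Charge a full vector fmuladd...
    IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
                                {RVecTy, RVecTy, RVecTy}, FMF);
    InstructionCost FMACost = TTI.getIntrinsicInstrCost(ICA, CostKind);
    // ...then subtract the vector fmul the tree cost already accounts for,
    // leaving only the incremental cost of the fused add.
    FMACost -= TTI.getArithmeticInstrCost(Instruction::FMul, RVecTy, CostKind);
    return FMACost;
  }

If the conversion check fails, the cost model falls back to the plain getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind) as before, which is what the fallback branch in the hunk below does.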
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ec06a217667c2..afdf73ab58184 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -22730,21 +22730,11 @@ class HorizontalReduction {
/// Try to find a reduction tree.
bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
ScalarEvolution &SE, const DataLayout &DL,
- const TargetLibraryInfo &TLI,
- DominatorTree &DT, TargetTransformInfo &TTI) {
+ const TargetLibraryInfo &TLI) {
RdxKind = HorizontalReduction::getRdxKind(Root);
if (!isVectorizable(RdxKind, Root))
return false;
- // FMA reduction root - skip.
- auto CheckForFMA = [&](Instruction *I) {
- return RdxKind == RecurKind::FAdd &&
- canConvertToFMA(I, getSameOpcode(I, TLI), DT, DL, TTI, TLI)
- .isValid();
- };
- if (CheckForFMA(Root))
- return false;
-
// Analyze "regular" integer/FP types for reductions - no target-specific
// types or pointers.
Type *Ty = Root->getType();
@@ -22782,7 +22772,7 @@ class HorizontalReduction {
// Also, do not try to reduce const values, if the operation is not
// foldable.
if (!EdgeInst || Level > RecursionMaxDepth ||
- getRdxKind(EdgeInst) != RdxKind || CheckForFMA(EdgeInst) ||
+ getRdxKind(EdgeInst) != RdxKind ||
IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
!hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
!isVectorizable(RdxKind, EdgeInst) ||
@@ -22901,7 +22891,8 @@ class HorizontalReduction {
/// Attempt to vectorize the tree found by matchAssociativeReduction.
Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
- const TargetLibraryInfo &TLI, AssumptionCache *AC) {
+ const TargetLibraryInfo &TLI, AssumptionCache *AC,
+ DominatorTree &DT) {
constexpr unsigned RegMaxNumber = 4;
constexpr unsigned RedValsMaxNumber = 128;
// If there are a sufficient number of reduction values, reduce
@@ -23302,7 +23293,7 @@ class HorizontalReduction {
// Estimate cost.
InstructionCost ReductionCost =
- getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
+ getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
InstructionCost Cost = V.getTreeCost(VL, ReductionCost);
LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
<< " for reduction\n");
@@ -23607,7 +23598,9 @@ class HorizontalReduction {
InstructionCost getReductionCost(TargetTransformInfo *TTI,
ArrayRef<Value *> ReducedVals,
bool IsCmpSelMinMax, FastMathFlags FMF,
- const BoUpSLP &R) {
+ const BoUpSLP &R, DominatorTree &DT,
+ const DataLayout &DL,
+ const TargetLibraryInfo &TLI) {
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
Type *ScalarTy = ReducedVals.front()->getType();
unsigned ReduxWidth = ReducedVals.size();
@@ -23632,6 +23625,22 @@ class HorizontalReduction {
for (User *U : RdxVal->users()) {
auto *RdxOp = cast<Instruction>(U);
if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
+ if (RdxKind == RecurKind::FAdd) {
+ InstructionCost FMACost = canConvertToFMA(
+ RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI);
+ if (FMACost.isValid()) {
+ LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
+ if (auto *I = dyn_cast<Instruction>(RdxVal)) {
+ // Also, exclude scalar fmul cost.
+ InstructionCost FMulCost =
+ TTI->getInstructionCost(I, CostKind);
+ LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
+ FMACost -= FMulCost;
+ }
+ ScalarCost += FMACost;
+ continue;
+ }
+ }
ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
continue;
}
@@ -23696,8 +23705,42 @@ class HorizontalReduction {
auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
std::make_pair(RedTy, true));
VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
- VectorCost +=
- TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
+ InstructionCost FMACost = InstructionCost::getInvalid();
+ if (RdxKind == RecurKind::FAdd) {
+ // Check if the reduction operands can be converted to FMA.
+ SmallVector<Value *> Ops;
+ FastMathFlags FMF;
+ FMF.set();
+ for (Value *RdxVal : ReducedVals) {
+ if (!RdxVal->hasOneUse()) {
+ Ops.clear();
+ break;
+ }
+ if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
+ FMF &= FPCI->getFastMathFlags();
+ Ops.push_back(RdxVal->user_back());
+ }
+ FMACost = canConvertToFMA(
+ Ops, getSameOpcode(Ops, TLI), DT, DL, *TTI, TLI);
+ if (FMACost.isValid()) {
+ // Calculate actual FMAD cost.
+ IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
+ {RVecTy, RVecTy, RVecTy}, FMF);
+ FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
+
+ LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
+ // Also, exclude vector fmul cost.
+ InstructionCost FMulCost = TTI->getArithmeticInstrCost(
+ Instruction::FMul, RVecTy, CostKind);
+ LLVM_DEBUG(dbgs() << "Minus vector FMul cost: " << FMulCost << "\n");
+ FMACost -= FMulCost;
+ }
+ }
+ if (FMACost.isValid())
+ VectorCost += FMACost;
+ else
+ VectorCost +=
+ TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
if (RType != RedTy) {
unsigned Opcode = Instruction::Trunc;
if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
@@ -24357,9 +24400,9 @@ bool SLPVectorizerPass::vectorizeHorReduction(
if (!isReductionCandidate(Inst))
return nullptr;
HorizontalReduction HorRdx;
- if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI, *DT, *TTI))
+ if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
return nullptr;
- return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
+ return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
};
auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
if (TryOperandsAsNewSeeds && FutureSeed == Root) {
@@ -24504,7 +24547,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
if (RedCost >= ScalarCost)
return false;
- return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr;
+ return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
};
if (Candidates.size() == 1)
return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
index 2e684320ba10e..cca58d8d66f04 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
@@ -27,114 +27,111 @@ define void @test() {
; CHECK-NEXT: [[I76:%.*]] = load float, ptr poison, align 4
; CHECK-NEXT: br i1 poison, label %[[BB167:.*]], label %[[BB77:.*]]
; CHECK: [[BB77]]:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> poison, float [[I70]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[I68]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[I67]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[I69]], i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[I66]], i32 0
; CHECK-NEXT: br label %[[BB78:.*]]
; CHECK: [[BB78]]:
-; CHECK-NEXT: [[I85:%.*]] = phi nsz float [ [[I66]], %[[BB77]] ], [ [[I103:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[I80:%.*]] = phi nsz float [ [[I67]], %[[BB77]] ], [ [[I104:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[I81:%.*]] = phi nsz float [ [[I68]], %[[BB77]] ], [ [[I105:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[I82:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[I106:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[I84:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[I123:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[I127:%.*]] = phi nsz float [ [[I69]], %[[BB77]] ], [ [[I124:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[I131:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[I125:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[I86:%.*]] = phi nsz float [ [[I70]], %[[BB77]] ], [ [[I126:%.*]], %[[BB78]] ]
-; CHECK-NEXT: [[I87:%.*]] = fmul fast float [[I85]], poison
-; CHECK-NEXT: [[I88:%.*]] = fmul fast float [[I80]], poison
-; CHECK-NEXT: [[I89:%.*]] = fmul fast float [[I81]], poison
-; CHECK-NEXT: [[I90:%.*]] = fmul fast float [[I82]], poison
-; CHECK-NEXT: [[I91:%.*]] = fmul fast float [[I84]], poison
-; CHECK-NEXT: [[I92:%.*]] = fadd fast float [[I91]], [[I87]]
-; CHECK-NEXT: [[I93:%.*]] = fmul fast float [[I127]], poison
-; CHECK-NEXT: [[I94:%.*]] = fadd fast float [[I93]], [[I88]]
-; CHECK-NEXT: [[I95:%.*]] = fmul fast float [[I131]], poison
-; CHECK-NEXT: [[I96:%.*]] = fadd fast float [[I95]], [[I89]]
-; CHECK-NEXT: [[I97:%.*]] = fmul fast float [[I86]], poison
-; CHECK-NEXT: [[I98:%.*]] = fadd fast float [[I97]], [[I90]]
-; CHECK-NEXT: [[I99:%.*]] = fadd fast float [[I92]], poison
-; CHECK-NEXT: [[I100:%.*]] = fadd fast float [[I94]], poison
-; CHECK-NEXT: [[I101:%.*]] = fadd fast float [[I96]], poison
-; CHECK-NEXT: [[I102:%.*]] = fadd fast float [[I98]], poison
-; CHECK-NEXT: [[I103]] = fadd fast float [[I99]], poison
-; CHECK-NEXT: [[I104]] = fadd fast float [[I100]], poison
-; CHECK-NEXT: [[I105]] = fadd fast float [[I101]], poison
-; CHECK-NEXT: [[I106]] = fadd fast float [[I102]], poison
-; CHECK-NEXT: [[I107:%.*]] = fmul fast float [[I85]], poison
-; CHECK-NEXT: [[I108:%.*]] = fmul fast float [[I80]], poison
-; CHECK-NEXT: [[I109:%.*]] = fmul fast float [[I81]], poison
-; CHECK-NEXT: [[I110:%.*]] = fmul fast float [[I82]], poison
-; CHECK-NEXT: [[I111:%.*]] = fmul fast float [[I84]], poison
-; CHECK-NEXT: [[I112:%.*]] = fadd fast float [[I111]], [[I107]]
-; CHECK-NEXT: [[I113:%.*]] = fmul fast float [[I127]], poison
-; CHECK-NEXT: [[I114:%.*]] = fadd fast float [[I113]], [[I108]]
-; CHECK-NEXT: [[I115:%.*]] = fmul fast float [[I131]], poison
-; CHECK-NEXT: [[I116:%.*]] = fadd fast float [[I115]], [[I109]]
-; CHECK-NEXT: [[I117:%.*]] = fmul fast float [[I86]], poison
-; CHECK-NEXT: [[I118:%.*]] = fadd fast float [[I117]], [[I110]]
-; CHECK-NEXT: [[I119:%.*]] = fadd fast float [[I112]], poison
-; CHECK-NEXT: [[I120:%.*]] = fadd fast float [[I114]], poison
-; CHECK-NEXT: [[I121:%.*]] = fadd fast float [[I116]], poison
-; CHECK-NEXT: [[I122:%.*]] = fadd fast float [[I118]], poison
-; CHECK-NEXT: [[I123]] = fadd fast float [[I119]], poison
-; CHECK-NEXT: [[I124]] = fadd fast float [[I120]], poison
-; CHECK-NEXT: [[I125]] = fadd fast float [[I121]], poison
-; CHECK-NEXT: [[I126]] = fadd fast float [[I122]], poison
+; CHECK-NEXT: [[I85:%.*]] = phi nsz float [ [[I66]], %[[BB77]] ], [ [[TMP31:%.*]], %[[BB78]] ]
+; CHECK-NEXT: [[I80:%.*]] = phi nsz float [ [[I67]], %[[BB77]] ], [ [[TMP46:%.*]], %[[BB78]] ]
+; CHECK-NEXT: [[I81:%.*]] = phi nsz float [ [[I68]], %[[BB77]] ], [ [[TMP37:%.*]], %[[BB78]] ]
+; CHECK-NEXT: [[I82:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[TMP39:%.*]], %[[BB78]] ]
+; CHECK-NEXT: [[I84:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[TMP30:%.*]], %[[BB78]] ]
+; CHECK-NEXT: [[I127:%.*]] = phi nsz float [ [[I69]], %[[BB77]] ], [ [[TMP53:%.*]], %[[BB78]] ]
+; CHECK-NEXT: [[I131:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[TMP36:%.*]], %[[BB78]] ]
+; CHECK-NEXT: [[I86:%.*]] = phi nsz float [ [[I70]], %[[BB77]] ], [ [[TMP40:%.*]], %[[BB78]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x float> [ [[TMP0]], %[[BB77]] ], [ [[TMP38:%.*]], %[[BB78]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x float> [ [[TMP1]], %[[BB77]] ], [ [[TMP35:%.*]], %[[BB78]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x float> [ [[TMP3]], %[[BB77]] ], [ [[TMP32:%.*]], %[[BB78]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x float> [ [[TMP4]], %[[BB77]] ], [ [[TMP29:%.*]], %[[BB78]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <2 x float> [[TMP8]], poison
+; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <2 x float> [[TMP7]], poison
+; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <2 x float> [[TMP6]], poison
+; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <2 x float> [[TMP5]], poison
+; CHECK-NEXT: [[TMP13:%.*]] = fmul fast <2 x float> [[TMP8]], poison
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <2 x float> [[TMP7]], poison
+; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP15]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP17:%.*]] = fmul fast <2 x float> [[TMP6]], poison
+; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP17]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP19:%.*]] = fmul fast <2 x float> [[TMP5]], poison
+; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> [[TMP19]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[TMP21:%.*]] = fadd fast <2 x float> [[TMP14]], [[TMP9]]
+; CHECK-NEXT: [[TMP22:%.*]] = fadd fast <2 x float> [[TMP21]], poison
+; CHECK-NEXT: [[TMP23:%.*]] = fadd fast <2 x float> [[TMP16]], [[TMP10]]
+; CHECK-NEXT: [[TMP24:%.*]] = fadd fast <2 x float> [[TMP23]], poison
+; CHECK-NEXT: [[TMP25:%.*]] = fadd fast <2 x float> [[TMP18]], [[TMP11]]
+; CHECK-NEXT: [[TMP26:%.*]] = fadd fast <2 x float> [[TMP25]], poison
+; CHECK-NEXT: [[TMP27:%.*]] = fadd fast <2 x float> [[TMP20]], [[TMP12]]
+; CHECK-NEXT: [[TMP28:%.*]] = fadd fast <2 x float> [[TMP27]], poison
+; CHECK-NEXT: [[TMP29]] = fadd fast <2 x float> [[TMP22]], poison
+; CHECK-NEXT: [[TMP30]] = extractelement <2 x float> [[TMP29]], i32 1
+; CHECK-NEXT: [[TMP31]] = extractelement <2 x float> [[TMP29]], i32 0
+; CHECK-NEXT: [[TMP32]] = fadd fast <2 x float> [[TMP24]], poison
+; CHECK-NEXT: [[TMP53]] = extractelement <2 x float> [[TMP32]], i32 1
+; CHECK-NEXT: [[TMP46]] = extractelement <2 x float> [[TMP32]], i32 0
+; CHECK-NEXT: [[TMP35]] = fadd fast <2 x float> [[TMP26]], poison
+; CHECK-NEXT: [[TMP36]] = extractelement <2 x float> [[TMP35]], i32 1
+; CHECK-NEXT: [[TMP37]] = extractelement <2 x float> [[TMP35]], i32 0
+; CHECK-NEXT: [[TMP38]] = fadd fast <2 x float> [[TMP28]], poison
+; CHECK-NEXT: [[TMP39]] = extractelement <2 x float> [[TMP38]], i32 1
+; CHECK-NEXT: [[TMP40]] = extractelement <2 x float> [[TMP38]], i32 0
; CHECK-NEXT: [[I135:%.*]] = fmul fast float [[I85]], [[I65]]
; CHECK-NEXT: [[I128:%.*]] = fmul fast float [[I80]], [[I65]]
; CHECK-NEXT: [[I129:%.*]] = fmul fast float [[I81]], [[I65]]
; CHECK-NEXT: [[I130:%.*]] = fmul fast float [[I82]], [[I65]]
; CHECK-NEXT: [[I133:%.*]] = fmul fast float [[I84]], [[I77]]
-; CHECK-NEXT: [[I134:%.*]] = fadd fast float [[I133]], [[I135]]
; CHECK-NEXT: [[I136:%.*]] = fmul fast float [[I127]], [[I77]]
-; CHECK-NEXT: [[TMP51:%.*]] = fadd fast float [[I136]], [[I128]]
; CHECK-NEXT: [[I138:%.*]] = fmul fast float [[I131]], [[I77]]
-; CHECK-NEXT: [[TMP52:%.*]] = fadd fast float [[I138]], [[I129]]
; CHECK-NEXT: [[I137:%.*]] = fmul fast float [[I86]], [[I77]]
-; CHECK-NEXT: [[I139:%.*]] = fadd fast float [[I137]], [[I130]]
-; CHECK-NEXT: [[I140:%.*]] = fadd fast float [[I134]], poison
-; CHECK-NEXT: [[I141:%.*]] = fadd fast float [[TMP51]], poison
-; CHECK-NEXT: [[I142:%.*]] = fadd fast float [[TMP52]], poison
-; CHECK-NEXT: [[I143:%.*]] = fadd fast float [[I139]], poison
-; CHECK-NEXT: [[I144:%.*]] = fadd fast float [[I140]], poison
-; CHECK-NEXT: [[I145:%.*]] = fadd fast float [[I141]], poison
-; CHECK-NEXT: [[I146:%.*]] = fadd fast float [[I142]], poison
-; CHECK-NEXT: [[I152:%.*]] = fadd fast float [[I143]], poison
-; CHECK-NEXT: [[I147:%.*]] = fmul fast float [[I85]], poison
-; CHECK-NEXT: [[I148:%.*]] = fmul fast float [[I80]], poison
-; CHECK-NEXT: [[I149:%.*]] = fmul fast float [[I81]], poison
-; CHECK-NEXT: [[I150:%.*]] = fmul fast float [[I82]], poison
-; CHECK-NEXT: [[I151:%.*]] = fmul fast float [[I84]], poison
-; CHECK-NEXT: [[TMP57:%.*]] = fadd fast float [[I151]], [[I147]]
-; CHECK-NEXT: [[I153:%.*]] = fmul fast float [[I127]], poison
-; CHECK-NEXT: [[TMP58:%.*]] = fadd fast float [[I153]], [[I148]]
-; CHECK-NEXT: [[I155:%.*]] = fmul fast float [[I131]], poison
-; CHECK-NEXT: [[TMP59:%.*]] = fadd fast float [[I155]], [[I149]]
-; CHECK-NEXT: [[I157:%.*]] = fmul fast float [[I86]], poison
+; CHECK-NEXT: [[OP_RDX14:%.*]] = fadd fast float poison, [[I133]]
+; CHECK-NEXT: [[OP_RDX15:%.*]] = fadd fast float [[OP_RDX14]], [[I135]]
+; CHECK-NEXT: [[OP_RDX12:%.*]] = fadd fast float poison, [[I136]]
+; CHECK-NEXT: [[OP_RDX13:%.*]] = fadd fast float [[OP_RDX12]], [[I128]]
+; CHECK-NEXT: [[OP_RDX10:%.*]] = fadd fast float poison, [[I138]]
+; CHECK-NEXT: [[OP_RDX11:%.*]] = fadd fast float [[OP_RDX10]], [[I129]]
+; CHECK-NEXT: [[OP_RDX8:%.*]] = fadd fast float poison, [[I137]]
+; CHECK-NEXT: [[OP_RDX9:%.*]] = fadd fast float [[OP_RDX8]], [[I130]]
+; CHECK-NEXT: [[TMP41:%.*]] = fmul fast <2 x float> [[TMP8]], poison
+; CHECK-NEXT: [[TMP42:%.*]] = fmul fast <2 x float> [[TMP7]], poison
+; CHECK-NEXT: [[TMP43:%.*]] = fmul fast <2 x float> [[TMP6]], poison
+; CHECK-NEXT: [[TMP44:%.*]] = fmul fast <2 x float> [[TMP5]], poison
+; CHECK-NEXT: [[TMP45:%.*]] = extractelement <2 x float> [[TMP41]], i32 1
+; CHECK-NEXT: [[I157:%.*]] = fadd fast float poison, [[TMP45]]
+; CHECK-NEXT: [[I150:%.*]] = extractelement <2 x float> [[TMP41]], i32 0
; CHECK-NEXT: [[TMP60:%.*]] = fadd fast float [[I157]], [[I150]]
-; CHECK-NEXT: [[I159:%.*]] = fadd fast float [[TMP57]], poison
-; CHECK-NEXT: [[I160:%.*]] = fadd fast float [[TMP58]], poison
-; CHECK-NEXT: [[I161:%.*]] = fadd fast float [[TMP59]], poison
-; CHECK-NEXT: [[I162:%.*]] = fadd fast float [[TMP60]], poison
-; CHECK-NEXT: [[I163:%.*]] = fadd fast float [[I159]], poison
-; CHECK-NEXT: [[I164:%.*]] = fadd fast float [[I160]], poison
-; CHECK-NEXT: [[I165:%.*]] = fadd fast float [[I161]], poison
-; CHECK-NEXT: [[I166:%.*]] = fadd fast float [[I162]], poison
+; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x float> [[TMP42]], i32 1
+; CHECK-NEXT: [[OP_RDX4:%.*]] = fadd fast float poison, [[TMP47]]
+; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x float> [[TMP42]], i32 0
+; CHECK-NEXT: [[OP_RDX5:%.*]] = fadd fast float [[OP_RDX4]], [[TMP48]]
+; CHECK-NEXT: [[TMP49:%.*]] = extractelement <2 x float> [[TMP43]], i32 1
+; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float poison, [[TMP49]]
+; CHECK-NEXT: [[TMP50:%.*]] = extractelement <2 x float> [[TMP43]], i32 0
+; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP50]]
+; CHECK-NEXT: [[TMP51:%.*]] = extractelement <2 x float> [[TMP44]], i32 0
+; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float poison, [[TMP51]]
+; CHECK-NEXT: [[TMP52:%.*]] = extractelement <2 x float> [[TMP44]], i32 1
+; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP52]]
; CHECK-NEXT: br i1 poison, label %[[BB78]], label %[[BB167]]
; CHECK: [[BB167]]:
-; CHECK-NEXT: [[I168:%.*]] = phi nsz float [ [[I76]], %[[BB64]] ], [ [[I166]], %[[BB78]] ]
-; CHECK-NEXT: [[I169:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I165]], %[[BB78]] ]
-; CHECK-NEXT: [[I170:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I164]], %[[BB78]] ]
-; CHECK-NEXT: [[I171:%.*]] = phi nsz float [ [[I75]], %[[BB64]] ], [ [[I163]], %[[BB78]] ]
-; CHECK-NEXT: [[I172:%.*]] = phi nsz float [ [[I74]], %[[BB64]] ], [ [[I152]], %[[BB78]] ]
-; CHECK-NEXT: [[I173:%.*]] = phi nsz float [ [[I73]], %[[BB64]] ], [ [[I146]], %[[BB78]] ]
-; CHECK-NEXT: [[TMP34:%.*]] = phi nsz float [ [[I72]], %[[BB64]] ], [ [[I145]], %[[BB78]] ]
-; CHECK-NEXT: [[I175:%.*]] = phi nsz float [ [[I71]], %[[BB64]] ], [ [[I144]], %[[BB78]] ]
-; CHECK-NEXT: [[I176:%.*]] = phi nsz float [ [[I70]], %[[BB64]] ], [ [[I126]], %[[BB78]] ]
-; CHECK-NEXT: [[I177:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I125]], %[[BB78]] ]
-; CHECK-NEXT: [[I178:%.*]] = phi nsz float [ [[I69]], %[[BB64]] ], [ [[I124]], %[[BB78]] ]
-; CHECK-NEXT: [[I179:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I123]], %[[BB78]] ]
-; CHECK-NEXT: [[I180:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I106]], %[[BB78]] ]
-; CHECK-NEXT: [[I181:%.*]] = phi nsz float [ [[I68]], %[[BB64]] ], [ [[I105]], %[[BB78]] ]
-; CHECK-NEXT: [[TMP33:%.*]] = phi nsz float [ [[I67]], %[[BB64]] ], [ [[I104]], %[[BB78]] ]
-; CHECK-NEXT: [[I183:%.*]] = phi nsz float [ [[I66]], %[[BB64]] ], [ [[I103]], %[[BB78]] ]
+; CHECK-NEXT: [[I168:%.*]] = phi nsz float [ [[I76]], %[[BB64]] ], [ [[OP_RDX1]], %[[BB78]] ]
+; CHECK-NEXT: [[I169:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[OP_RDX3]], %[[BB78]] ]
+; CHECK-NEXT: [[I170:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[OP_RDX5]], %[[BB78]] ]
+; CHECK-NEXT: [[I171:%.*]] = phi nsz float [ [[I75]], %[[BB64]] ], [ [[TMP60]], %[[BB78]] ]
+; CHECK-NEXT: [[I172:%.*]] = phi nsz float [ [[I74]], %[[BB64]] ], [ [[OP_RDX9]], %[[BB78]] ]
+; CHECK-NEXT: [[I173:%.*]] = phi nsz float [ [[I73]], %[[BB64]] ], [ [[OP_RDX11]], %[[BB78]] ]
+; CHECK-NEXT: [[TMP34:%.*]] = phi nsz float [ [[I72]], %[[BB64]] ], [ [[OP_RDX13]], %[[BB78]] ]
+; CHECK-NEXT: [[I175:%.*]] = phi nsz float [ [[I71]], %[[BB64]] ], [ [[OP_RDX15]], %[[BB78]] ]
+; CHECK-NEXT: [[I176:%.*]] = phi nsz float [ [[I70]], %[[BB64]] ], [ [[TMP40]], %[[BB78]] ]
+; CHECK-NEXT: [[I177:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[TMP36]], %[[BB78]] ]
+; CHECK-NEXT: [[I178:%.*]] = phi nsz float [ [[I69]], %[[BB64]] ], [ [[TMP53]], %[[BB78]] ]
+; CHECK-NEXT: [[I179:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[TMP30]], %[[BB78]] ]
+; CHECK-NEXT: [[I180:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[TMP39]], %[[BB78]] ]
+; CHECK-NEXT: [[I181:%.*]] = phi nsz float [ [[I68]], %[[BB64]] ], [ [[TMP37]], %[[BB78]] ]
+; CHECK-NEXT: [[TMP33:%.*]] = phi nsz float [ [[I67]], %[[BB64]] ], [ [[TMP46]], %[[BB78]] ]
+; CHECK-NEXT: [[I183:%.*]] = phi nsz float [ [[I66]], %[[BB64]] ], [ [[TMP31]], %[[BB78]] ]
; CHECK-NEXT: store float [[TMP33]], ptr poison, align 1
; CHECK-NEXT: store float [[TMP34]], ptr poison, align 1
; CHECK-NEXT: br i1 poison, label %[[BB186:.*]], label %[[BB184:.*]]
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll
index 8093285ad8717..5fb468c0b0962 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll
@@ -8,56 +8,35 @@
define fastcc i64 @zot(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, ptr %arg5, i1 %arg6, i1 %arg7, i1 %arg8) {
; CHECK-LABEL: @zot(
; CHECK-NEXT: bb:
-; CHECK-NEXT: [[VAL:%.*]] = fmul fast float 0.000000e+00, 0.000000e+00
; CHECK-NEXT: [[VAL9:%.*]] = fmul fast float 0.000000e+00, [[ARG:%.*]]
-; CHECK-NEXT: [[VAL10:%.*]] = fmul fast float [[ARG3:%.*]], 1.000000e+00
-; CHECK-NEXT: [[VAL11:%.*]] = fmul fast float [[ARG3]], 1.000000e+00
-; CHECK-NEXT: [[VAL12:%.*]] = fadd fast float [[ARG3]], 1.000000e+00
-; CHECK-NEXT: [[VAL13:%.*]] = fadd fast float [[VAL12]], 2.000000e+00
-; CHECK-NEXT: [[VAL14:%.*]] = fadd fast float 0.000000e+00, 0.000000e+00
-; CHECK-NEXT: [[VAL15:%.*]] = fadd fast float [[VAL14]], 1.000000e+00
-; CHECK-NEXT: [[VAL16:%.*]] = fadd fast float [[ARG3]], 1.000000e+00
-; CHECK-NEXT: [[VAL17:%.*]] = fadd fast float [[ARG3]], 1.000000e+00
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> <float 0.000000e+00, float poison, float poison, float poison>, float [[ARG]], i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[ARG3:%.*]], i32 2
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> <float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> <float poison, float 0.000000e+00>, float [[ARG3]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <2 x float> [[TMP4]], <float 1.000000e+00, float 0.000000e+00>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP6]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <4 x float> [[TMP7]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
; CHECK-NEXT: br i1 [[ARG6:%.*]], label [[BB18:%.*]], label [[BB57:%.*]]
; CHECK: bb18:
-; CHECK-NEXT: [[VAL19:%.*]] = phi float [ [[VAL13]], [[BB:%.*]] ]
-; CHECK-NEXT: [[VAL20:%.*]] = phi float [ [[VAL15]], [[BB]] ]
-; CHECK-NEXT: [[VAL21:%.*]] = phi float [ [[VAL16]], [[BB]] ]
-; CHECK-NEXT: [[VAL22:%.*]] = phi float [ [[VAL17]], [[BB]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x float> [ [[TMP8]], [[BB:%.*]] ]
+; CHECK-NEXT: [[VAL16:%.*]] = extractelement <4 x float> [[TMP8]], i32 2
; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[VAL16]], 2.000000e+00
+; CHECK-NEXT: [[VAL17:%.*]] = extractelement <4 x float> [[TMP8]], i32 3
; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[VAL17]], 3.000000e+00
; CHECK-NEXT: br i1 [[ARG7:%.*]], label [[BB25:%.*]], label [[BB57]]
; CHECK: bb25:
-; CHECK-NEXT: [[VAL26:%.*]] = phi float [ [[VAL19]], [[BB18]] ]
-; CHECK-NEXT: [[VAL27:%.*]] = phi float [ [[VAL20]], [[BB18]] ]
-; CHECK-NEXT: [[VAL28:%.*]] = phi float [ [[VAL21]], [[BB18]] ]
-; CHECK-NEXT: [[VAL29:%.*]] = phi float [ [[VAL22]], [[BB18]] ]
+; CHECK-NEXT: [[TMP12:%.*]] = phi <4 x float> [ [[TMP9]], [[BB18]] ]
; CHECK-NEXT: br label [[BB30:%.*]]
; CHECK: bb30:
; CHECK-NEXT: [[VAL31:%.*]] = phi float [ [[VAL55:%.*]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ]
; CHECK-NEXT: [[VAL32:%.*]] = phi float [ [[VAL9]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ]
-; CHECK-NEXT: [[VAL33:%.*]] = load i8, ptr [[ARG5:%.*]], align 1
-; CHECK-NEXT: [[VAL34:%.*]] = uitofp i8 [[VAL33]] to float
-; CHECK-NEXT: [[VAL35:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 1
-; CHECK-NEXT: [[VAL36:%.*]] = load i8, ptr [[VAL35]], align 1
-; CHECK-NEXT: [[VAL37:%.*]] = uitofp i8 [[VAL36]] to float
-; CHECK-NEXT: [[VAL38:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 2
-; CHECK-NEXT: [[VAL39:%.*]] = load i8, ptr [[VAL38]], align 1
-; CHECK-NEXT: [[VAL40:%.*]] = uitofp i8 [[VAL39]] to float
-; CHECK-NEXT: [[VAL41:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 3
-; CHECK-NEXT: [[VAL42:%.*]] = load i8, ptr [[VAL41]], align 1
-; CHECK-NEXT: [[VAL43:%.*]] = uitofp i8 [[VAL42]] to float
-; CHECK-NEXT: [[VAL44:%.*]] = fsub fast float [[VAL34]], [[VAL]]
-; CHECK-NEXT: [[VAL45:%.*]] = fsub fast float [[VAL37]], [[VAL9]]
-; CHECK-NEXT: [[VAL46:%.*]] = fsub fast float [[VAL40]], [[VAL10]]
-; CHECK-NEXT: [[VAL47:%.*]] = fsub fast float [[VAL43]], [[VAL11]]
-; CHECK-NEXT: [[VAL48:%.*]] = fmul fast float [[VAL44]], [[VAL26]]
-; CHECK-NEXT: [[VAL49:%.*]] = fmul fast float [[VAL45]], [[VAL27]]
-; CHECK-NEXT: [[VAL50:%.*]] = fadd fast float [[VAL49]], [[VAL48]]
-; CHECK-NEXT: [[VAL51:%.*]] = fmul fast float [[VAL46]], [[VAL28]]
-; CHECK-NEXT: [[VAL52:%.*]] = fadd fast float [[VAL50]], [[VAL51]]
-; CHECK-NEXT: [[VAL53:%.*]] = fmul fast float [[VAL47]], [[VAL29]]
-; CHECK-NEXT: [[VAL54:%.*]] = fadd fast float [[VAL52]], [[VAL53]]
+; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1
+; CHECK-NEXT: [[TMP14:%.*]] = uitofp <4 x i8> [[TMP13]] to <4 x float>
+; CHECK-NEXT: [[TMP15:%.*]] = fsub fast <4 x float> [[TMP14]], [[TMP3]]
+; CHECK-NEXT: [[TMP16:%.*]] = fmul fast <4 x float> [[TMP15]], [[TMP12]]
+; CHECK-NEXT: [[VAL54:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP16]])
; CHECK-NEXT: [[VAL55]] = tail call fast float @llvm.minnum.f32(float [[VAL31]], float [[ARG1:%.*]])
; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[VAL54]])
; CHECK-NEXT: call void @ham(float [[VAL55]], float [[VAL56]])
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
index 430a46beace9a..9d2e22bb454e4 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
@@ -600,25 +600,34 @@ define i32 @dot_product_i32_reorder(ptr %a, ptr %b) {
}
define float @dot_product_fp32(ptr %a, ptr %b) {
-; CHECK-LABEL: @dot_product_fp32(
-; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
-; CHECK-NEXT: [[L_A_0:%.*]] = load float, ptr [[GEP_A_0]], align 4
-; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i32 1
-; CHECK-NEXT: [[L_A_1:%.*]] = load float, ptr [[GEP_A_1]], align 4
-; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
-; CHECK-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4
-; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
-; CHECK-NEXT: [[L_B_0:%.*]] = load float, ptr [[GEP_B_0]], align 4
-; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B]], i32 1
-; CHECK-NEXT: [[L_B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
-; CHECK-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
-; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast float [[L_A_0]], [[L_B_0]]
-; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[L_A_1]], [[L_B_1]]
-; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
-; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]]
-; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
-; CHECK-NEXT: ret float [[ADD_1]]
+; NON-POW2-LABEL: @dot_product_fp32(
+; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
+; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
+; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_A_0]], align 4
+; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[GEP_B_0]], align 4
+; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x float> [[TMP1]], [[TMP2]]
+; NON-POW2-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]])
+; NON-POW2-NEXT: ret float [[TMP4]]
+;
+; POW2-ONLY-LABEL: @dot_product_fp32(
+; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
+; POW2-ONLY-NEXT: [[L_A_0:%.*]] = load float, ptr [[GEP_A_0]], align 4
+; POW2-ONLY-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i32 1
+; POW2-ONLY-NEXT: [[L_A_1:%.*]] = load float, ptr [[GEP_A_1]], align 4
+; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
+; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4
+; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
+; POW2-ONLY-NEXT: [[L_B_0:%.*]] = load float, ptr [[GEP_B_0]], align 4
+; POW2-ONLY-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B]], i32 1
+; POW2-ONLY-NEXT: [[L_B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
+; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
+; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
+; POW2-ONLY-NEXT: [[MUL_0:%.*]] = fmul fast float [[L_A_0]], [[L_B_0]]
+; POW2-ONLY-NEXT: [[MUL_1:%.*]] = fmul fast float [[L_A_1]], [[L_B_1]]
+; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
+; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]]
+; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
+; POW2-ONLY-NEXT: ret float [[ADD_1]]
;
%gep.a.0 = getelementptr inbounds float, ptr %a, i32 0
%l.a.0 = load float, ptr %gep.a.0, align 4
@@ -646,25 +655,34 @@ define float @dot_product_fp32(ptr %a, ptr %b) {
; Same as above, except the reduction order has been perturbed. This
; is checking for our ability to reorder.
define float @dot_product_fp32_reorder(ptr %a, ptr %b) {
-; CHECK-LABEL: @dot_product_fp32_reorder(
-; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
-; CHECK-NEXT: [[L_A_0:%.*]] = load float, ptr [[GEP_A_0]], align 4
-; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i32 1
-; CHECK-NEXT: [[L_A_1:%.*]] = load float, ptr [[GEP_A_1]], align 4
-; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
-; CHECK-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4
-; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
-; CHECK-NEXT: [[L_B_0:%.*]] = load float, ptr [[GEP_B_0]], align 4
-; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B]], i32 1
-; CHECK-NEXT: [[L_B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
-; CHECK-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
-; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast float [[L_A_0]], [[L_B_0]]
-; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[L_A_1]], [[L_B_1]]
-; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
-; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_1]], [[MUL_0]]
-; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
-; CHECK-NEXT: ret float [[ADD_1]]
+; NON-POW2-LABEL: @dot_product_fp32_reorder(
+; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
+; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
+; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_A_0]], align 4
+; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[GEP_B_0]], align 4
+; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x float> [[TMP1]], [[TMP2]]
+; NON-POW2-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]])
+; NON-POW2-NEXT: ret float [[TMP4]]
+;
+; POW2-ONLY-LABEL: @dot_product_fp32_reorder(
+; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0
+; POW2-ONLY-NEXT: [[L_A_0:%.*]] = load float, ptr [[GEP_A_0]], align 4
+; POW2-ONLY-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i32 1
+; POW2-ONLY-NEXT: [[L_A_1:%.*]] = load float, ptr [[GEP_A_1]], align 4
+; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
+; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4
+; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0
+; POW2-ONLY-NEXT: [[L_B_0:%.*]] = load float, ptr [[GEP_B_0]], align 4
+; POW2-ONLY-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B]], i32 1
+; POW2-ONLY-NEXT: [[L_B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
+; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
+; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
+; POW2-ONLY-NEXT: [[MUL_0:%.*]] = fmul fast float [[L_A_0]], [[L_B_0]]
+; POW2-ONLY-NEXT: [[MUL_1:%.*]] = fmul fast float [[L_A_1]], [[L_B_1]]
+; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
+; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_1]], [[MUL_0]]
+; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
+; POW2-ONLY-NEXT: ret float [[ADD_1]]
;
%gep.a.0 = getelementptr inbounds float, ptr %a, i32 0
%l.a.0 = load float, ptr %gep.a.0, align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
index 0879ec239e287..a333d162297bc 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
@@ -95,47 +95,12 @@ define float @dot4f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %pt
}
define double @dot4f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) {
-; SSE2-LABEL: @dot4f64_fast(
-; SSE2-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4
-; SSE2-NEXT: [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4
-; SSE2-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]]
-; SSE2-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]])
-; SSE2-NEXT: ret double [[TMP4]]
-;
-; SSE4-LABEL: @dot4f64_fast(
-; SSE4-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4
-; SSE4-NEXT: [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4
-; SSE4-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]]
-; SSE4-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]])
-; SSE4-NEXT: ret double [[TMP4]]
-;
-; AVX-LABEL: @dot4f64_fast(
-; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4
-; AVX-NEXT: [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4
-; AVX-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]]
-; AVX-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]])
-; AVX-NEXT: ret double [[TMP4]]
-;
-; AVX2-LABEL: @dot4f64_fast(
-; AVX2-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1
-; AVX2-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1
-; AVX2-NEXT: [[PTRX2:%.*]] = getelementptr inbounds double, ptr [[PTRX]], i64 2
-; AVX2-NEXT: [[PTRY2:%.*]] = getelementptr inbounds double, ptr [[PTRY]], i64 2
-; AVX2-NEXT: [[X0:%.*]] = load double, ptr [[PTRX]], align 4
-; AVX2-NEXT: [[Y0:%.*]] = load double, ptr [[PTRY]], align 4
-; AVX2-NEXT: [[X1:%.*]] = load double, ptr [[PTRX1]], align 4
-; AVX2-NEXT: [[Y1:%.*]] = load double, ptr [[PTRY1]], align 4
-; AVX2-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]]
-; AVX2-NEXT: [[MUL1:%.*]] = fmul double [[X1]], [[Y1]]
-; AVX2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX2]], align 4
-; AVX2-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY2]], align 4
-; AVX2-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
-; AVX2-NEXT: [[DOT01:%.*]] = fadd fast double [[MUL0]], [[MUL1]]
-; AVX2-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
-; AVX2-NEXT: [[DOT012:%.*]] = fadd fast double [[DOT01]], [[TMP4]]
-; AVX2-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
-; AVX2-NEXT: [[DOT0123:%.*]] = fadd fast double [[DOT012]], [[TMP5]]
-; AVX2-NEXT: ret double [[DOT0123]]
+; CHECK-LABEL: @dot4f64_fast(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]])
+; CHECK-NEXT: ret double [[TMP4]]
;
%ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
%ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
@@ -162,47 +127,12 @@ define double @dot4f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(3
}
define float @dot4f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
-; SSE2-LABEL: @dot4f32_fast(
-; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4
-; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4
-; SSE2-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
-; SSE2-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
-; SSE2-NEXT: ret float [[TMP4]]
-;
-; SSE4-LABEL: @dot4f32_fast(
-; SSE4-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4
-; SSE4-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4
-; SSE4-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
-; SSE4-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
-; SSE4-NEXT: ret float [[TMP4]]
-;
-; AVX-LABEL: @dot4f32_fast(
-; AVX-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4
-; AVX-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4
-; AVX-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
-; AVX-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
-; AVX-NEXT: ret float [[TMP4]]
-;
-; AVX2-LABEL: @dot4f32_fast(
-; AVX2-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1
-; AVX2-NEXT: [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1
-; AVX2-NEXT: [[PTRX2:%.*]] = getelementptr inbounds float, ptr [[PTRX]], i64 2
-; AVX2-NEXT: [[PTRY2:%.*]] = getelementptr inbounds float, ptr [[PTRY]], i64 2
-; AVX2-NEXT: [[X0:%.*]] = load float, ptr [[PTRX]], align 4
-; AVX2-NEXT: [[Y0:%.*]] = load float, ptr [[PTRY]], align 4
-; AVX2-NEXT: [[X1:%.*]] = load float, ptr [[PTRX1]], align 4
-; AVX2-NEXT: [[Y1:%.*]] = load float, ptr [[PTRY1]], align 4
-; AVX2-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]]
-; AVX2-NEXT: [[MUL1:%.*]] = fmul float [[X1]], [[Y1]]
-; AVX2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX2]], align 4
-; AVX2-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY2]], align 4
-; AVX2-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
-; AVX2-NEXT: [[DOT01:%.*]] = fadd fast float [[MUL0]], [[MUL1]]
-; AVX2-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
-; AVX2-NEXT: [[DOT012:%.*]] = fadd fast float [[DOT01]], [[TMP4]]
-; AVX2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
-; AVX2-NEXT: [[DOT0123:%.*]] = fadd fast float [[DOT012]], [[TMP5]]
-; AVX2-NEXT: ret float [[DOT0123]]
+; CHECK-LABEL: @dot4f32_fast(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
+; CHECK-NEXT: ret float [[TMP4]]
;
%ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1
%ptry1 = getelementptr inbounds float, ptr %ptry, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
index 0bbdeb55e1516..378d3b00b1158 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
@@ -73,41 +73,14 @@ define float @bazz() {
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16
-; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16
-; CHECK-NEXT: [[MUL4:%.*]] = fmul fast float [[TMP2]], [[TMP1]]
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL4]], [[CONV]]
-; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4
-; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4
-; CHECK-NEXT: [[MUL4_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]]
-; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD]]
-; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8
-; CHECK-NEXT: [[MUL4_2:%.*]] = fmul fast float [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[MUL4_2]], [[ADD_1]]
-; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4
-; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4
-; CHECK-NEXT: [[MUL4_3:%.*]] = fmul fast float [[TMP8]], [[TMP7]]
-; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[MUL4_3]], [[ADD_2]]
; CHECK-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
; CHECK-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
-; CHECK-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]]
-; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 4), align 16
-; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 4), align 16
-; CHECK-NEXT: [[MUL18:%.*]] = fmul fast float [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[MUL18]], [[ADD7]]
-; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 5), align 4
-; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 5), align 4
-; CHECK-NEXT: [[MUL18_1:%.*]] = fmul fast float [[TMP12]], [[TMP11]]
-; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[MUL18_1]], [[ADD19]]
-; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 6), align 8
-; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 6), align 8
-; CHECK-NEXT: [[MUL18_2:%.*]] = fmul fast float [[TMP14]], [[TMP13]]
-; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[MUL18_2]], [[ADD19_1]]
-; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 7), align 4
-; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 7), align 4
-; CHECK-NEXT: [[MUL18_3:%.*]] = fmul fast float [[TMP16]], [[TMP15]]
-; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[MUL18_3]], [[ADD19_2]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr @arr, align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, ptr @arr1, align 16
+; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP3]])
+; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]]
+; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV6]]
; CHECK-NEXT: store float [[OP_RDX1]], ptr @res, align 4
; CHECK-NEXT: ret float [[OP_RDX1]]
;
@@ -116,41 +89,14 @@ define float @bazz() {
; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4
; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16
-; THRESHOLD-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16
-; THRESHOLD-NEXT: [[MUL4:%.*]] = fmul fast float [[TMP2]], [[TMP1]]
-; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[MUL4]], [[CONV]]
-; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4
-; THRESHOLD-NEXT: [[TMP4:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4
-; THRESHOLD-NEXT: [[MUL4_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]]
-; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD]]
-; THRESHOLD-NEXT: [[TMP5:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8
-; THRESHOLD-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8
-; THRESHOLD-NEXT: [[MUL4_2:%.*]] = fmul fast float [[TMP6]], [[TMP5]]
-; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float [[MUL4_2]], [[ADD_1]]
-; THRESHOLD-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4
-; THRESHOLD-NEXT: [[TMP8:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4
-; THRESHOLD-NEXT: [[MUL4_3:%.*]] = fmul fast float [[TMP8]], [[TMP7]]
-; THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float [[MUL4_3]], [[ADD_2]]
; THRESHOLD-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
; THRESHOLD-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
-; THRESHOLD-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]]
-; THRESHOLD-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 4), align 16
-; THRESHOLD-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 4), align 16
-; THRESHOLD-NEXT: [[MUL18:%.*]] = fmul fast float [[TMP10]], [[TMP9]]
-; THRESHOLD-NEXT: [[ADD19:%.*]] = fadd fast float [[MUL18]], [[ADD7]]
-; THRESHOLD-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 5), align 4
-; THRESHOLD-NEXT: [[TMP12:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 5), align 4
-; THRESHOLD-NEXT: [[MUL18_1:%.*]] = fmul fast float [[TMP12]], [[TMP11]]
-; THRESHOLD-NEXT: [[ADD19_1:%.*]] = fadd fast float [[MUL18_1]], [[ADD19]]
-; THRESHOLD-NEXT: [[TMP13:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 6), align 8
-; THRESHOLD-NEXT: [[TMP14:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 6), align 8
-; THRESHOLD-NEXT: [[MUL18_2:%.*]] = fmul fast float [[TMP14]], [[TMP13]]
-; THRESHOLD-NEXT: [[ADD19_2:%.*]] = fadd fast float [[MUL18_2]], [[ADD19_1]]
-; THRESHOLD-NEXT: [[TMP15:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 7), align 4
-; THRESHOLD-NEXT: [[TMP16:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 7), align 4
-; THRESHOLD-NEXT: [[MUL18_3:%.*]] = fmul fast float [[TMP16]], [[TMP15]]
-; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[MUL18_3]], [[ADD19_2]]
+; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr @arr, align 16
+; THRESHOLD-NEXT: [[TMP2:%.*]] = load <8 x float>, ptr @arr1, align 16
+; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
+; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP3]])
+; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]]
+; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV6]]
; THRESHOLD-NEXT: store float [[OP_RDX1]], ptr @res, align 4
; THRESHOLD-NEXT: ret float [[OP_RDX1]]
;
@@ -202,21 +148,10 @@ define float @bazzz() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
-; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16
-; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16
-; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4
-; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4
-; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP11]], [[TMP3]]
-; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[MUL_1]], [[MUL]]
-; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8
-; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP12]]
-; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4
-; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4
-; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[TMP4:%.*]] = fadd fast float [[MUL_3]], [[TMP8]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16
+; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]]
; CHECK-NEXT: store float [[TMP5]], ptr @res, align 4
; CHECK-NEXT: ret float [[TMP5]]
@@ -225,21 +160,10 @@ define float @bazzz() {
; THRESHOLD-NEXT: entry:
; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4
; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
-; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16
-; THRESHOLD-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16
-; THRESHOLD-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]]
-; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4
-; THRESHOLD-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4
-; THRESHOLD-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP11]], [[TMP3]]
-; THRESHOLD-NEXT: [[TMP12:%.*]] = fadd fast float [[MUL_1]], [[MUL]]
-; THRESHOLD-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8
-; THRESHOLD-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8
-; THRESHOLD-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]]
-; THRESHOLD-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP12]]
-; THRESHOLD-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4
-; THRESHOLD-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4
-; THRESHOLD-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]]
-; THRESHOLD-NEXT: [[TMP4:%.*]] = fadd fast float [[MUL_3]], [[TMP8]]
+; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16
+; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16
+; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
+; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
; THRESHOLD-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]]
; THRESHOLD-NEXT: store float [[TMP5]], ptr @res, align 4
; THRESHOLD-NEXT: ret float [[TMP5]]
@@ -272,21 +196,10 @@ define i32 @foo() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
-; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16
-; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16
-; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4
-; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4
-; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP11]], [[TMP3]]
-; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[MUL_1]], [[MUL]]
-; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8
-; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8
-; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP12]]
-; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4
-; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4
-; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[TMP4:%.*]] = fadd fast float [[MUL_3]], [[TMP8]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16
+; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]]
; CHECK-NEXT: [[CONV4:%.*]] = fptosi float [[TMP5]] to i32
; CHECK-NEXT: store i32 [[CONV4]], ptr @n, align 4
@@ -296,21 +209,10 @@ define i32 @foo() {
; THRESHOLD-NEXT: entry:
; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4
; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
-; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16
-; THRESHOLD-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16
-; THRESHOLD-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]]
-; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4
-; THRESHOLD-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4
-; THRESHOLD-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP11]], [[TMP3]]
-; THRESHOLD-NEXT: [[TMP12:%.*]] = fadd fast float [[MUL_1]], [[MUL]]
-; THRESHOLD-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8
-; THRESHOLD-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8
-; THRESHOLD-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]]
-; THRESHOLD-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP12]]
-; THRESHOLD-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4
-; THRESHOLD-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4
-; THRESHOLD-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]]
-; THRESHOLD-NEXT: [[TMP4:%.*]] = fadd fast float [[MUL_3]], [[TMP8]]
+; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16
+; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16
+; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
+; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]])
; THRESHOLD-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]]
; THRESHOLD-NEXT: [[CONV4:%.*]] = fptosi float [[TMP5]] to i32
; THRESHOLD-NEXT: store i32 [[CONV4]], ptr @n, align 4
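
The new CHECK lines above encode the intended rewrite: the scalar fmul/fadd
chain over @arr and @arr1 becomes one vector multiply feeding
@llvm.vector.reduce.fadd with a zero start value. A minimal C++ sketch of the
semantics, assuming fast-math reassociation (names are illustrative, not from
the test):

  // Old pattern: ((a0*b0 + a1*b1) + a2*b2) + a3*b3, one fmul/fadd per lane.
  // New pattern: an elementwise multiply, then an unordered 'fast' sum; the
  // reassociation licensed by the 'fast' flag is what makes the rewrite legal.
  float dotProduct4(const float A[4], const float B[4]) {
    float Sum = 0.0f; // start value passed to llvm.vector.reduce.fadd
    for (int I = 0; I < 4; ++I)
      Sum += A[I] * B[I]; // fmul fast <4 x float>, then the fadd reduction
    return Sum;
  }
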
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
index 45279296d296a..8ba2859add9e0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
@@ -11,66 +11,23 @@ define void @test(ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[GEP1_0:%.*]] = getelementptr inbounds double, ptr [[ARG:%.*]], i64 1
-; CHECK-NEXT: [[LD1_0:%.*]] = load double, ptr [[GEP1_0]], align 8
-; CHECK-NEXT: [[LD0_0:%.*]] = load double, ptr [[ARG1:%.*]], align 8
-; CHECK-NEXT: [[MUL1_0:%.*]] = fmul fast double [[LD0_0]], [[LD1_0]]
-; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 16
-; CHECK-NEXT: [[LD2_0:%.*]] = load double, ptr [[GEP2_0]], align 8
-; CHECK-NEXT: [[MUL2_0:%.*]] = fmul fast double [[LD2_0]], [[LD1_0]]
-; CHECK-NEXT: [[GEP1_1:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 3
-; CHECK-NEXT: [[LD1_1:%.*]] = load double, ptr [[GEP1_1]], align 8
-; CHECK-NEXT: [[GEP0_1:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 1
-; CHECK-NEXT: [[LD0_1:%.*]] = load double, ptr [[GEP0_1]], align 8
-; CHECK-NEXT: [[MUL1_1:%.*]] = fmul fast double [[LD0_1]], [[LD1_1]]
-; CHECK-NEXT: [[RDX1_0:%.*]] = fadd fast double [[MUL1_0]], [[MUL1_1]]
-; CHECK-NEXT: [[GEP2_1:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 17
-; CHECK-NEXT: [[LD2_1:%.*]] = load double, ptr [[GEP2_1]], align 8
-; CHECK-NEXT: [[MUL2_1:%.*]] = fmul fast double [[LD2_1]], [[LD1_1]]
-; CHECK-NEXT: [[RDX2_0:%.*]] = fadd fast double [[MUL2_0]], [[MUL2_1]]
-; CHECK-NEXT: [[GEP1_2:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 5
-; CHECK-NEXT: [[LD1_2:%.*]] = load double, ptr [[GEP1_2]], align 8
-; CHECK-NEXT: [[GEP0_2:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 2
-; CHECK-NEXT: [[GEP2_2:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 18
-; CHECK-NEXT: [[GEP1_3:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 7
-; CHECK-NEXT: [[LD1_3:%.*]] = load double, ptr [[GEP1_3]], align 8
-; CHECK-NEXT: [[GEP1_4:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 9
-; CHECK-NEXT: [[LD1_4:%.*]] = load double, ptr [[GEP1_4]], align 8
-; CHECK-NEXT: [[GEP1_5:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 11
-; CHECK-NEXT: [[LD1_5:%.*]] = load double, ptr [[GEP1_5]], align 8
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[GEP0_2]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[LD1_2]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[LD1_3]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[LD1_4]], i32 2
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[LD1_5]], i32 3
+; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1:%.*]], i64 16
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[GEP2_0]], align 8
+; CHECK-NEXT: [[GEP2_2:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 20
+; CHECK-NEXT: [[TMP1:%.*]] = call <15 x double> @llvm.masked.load.v15f64.p0(ptr [[GEP1_0]], i32 8, <15 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <15 x double> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <15 x double> [[TMP1]], <15 x double> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x double>, ptr [[ARG1]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <8 x double> [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <4 x double> [[TMP0]], [[TMP4]]
+; CHECK-NEXT: [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP12]])
; CHECK-NEXT: [[TMP6:%.*]] = load <4 x double>, ptr [[GEP2_2]], align 8
-; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <4 x double> [[TMP6]], [[TMP4]]
-; CHECK-NEXT: [[GEP1_6:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 13
-; CHECK-NEXT: [[LD1_6:%.*]] = load double, ptr [[GEP1_6]], align 8
-; CHECK-NEXT: [[GEP0_6:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 6
-; CHECK-NEXT: [[LD0_6:%.*]] = load double, ptr [[GEP0_6]], align 8
-; CHECK-NEXT: [[MUL1_6:%.*]] = fmul fast double [[LD0_6]], [[LD1_6]]
-; CHECK-NEXT: [[GEP2_6:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 22
-; CHECK-NEXT: [[LD2_6:%.*]] = load double, ptr [[GEP2_6]], align 8
-; CHECK-NEXT: [[MUL2_6:%.*]] = fmul fast double [[LD2_6]], [[LD1_6]]
-; CHECK-NEXT: [[GEP1_7:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 15
-; CHECK-NEXT: [[LD1_7:%.*]] = load double, ptr [[GEP1_7]], align 8
-; CHECK-NEXT: [[GEP0_7:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 7
-; CHECK-NEXT: [[LD0_7:%.*]] = load double, ptr [[GEP0_7]], align 8
-; CHECK-NEXT: [[MUL1_7:%.*]] = fmul fast double [[LD0_7]], [[LD1_7]]
-; CHECK-NEXT: [[TMP10:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP5]])
-; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast double [[TMP10]], [[MUL1_6]]
-; CHECK-NEXT: [[OP_RDX4:%.*]] = fadd fast double [[MUL1_7]], [[RDX1_0]]
-; CHECK-NEXT: [[TMP8:%.*]] = fadd fast double [[OP_RDX3]], [[OP_RDX4]]
-; CHECK-NEXT: [[GEP2_7:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 23
-; CHECK-NEXT: [[LD2_7:%.*]] = load double, ptr [[GEP2_7]], align 8
-; CHECK-NEXT: [[MUL2_7:%.*]] = fmul fast double [[LD2_7]], [[LD1_7]]
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <15 x double> [[TMP1]], <15 x double> poison, <4 x i32> <i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x double> [[TMP6]], [[TMP9]]
+; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x double> [[TMP5]], [[TMP10]]
; CHECK-NEXT: [[TMP11:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP7]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast double [[TMP11]], [[MUL2_6]]
-; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast double [[MUL2_7]], [[RDX2_0]]
-; CHECK-NEXT: [[TMP9:%.*]] = fadd fast double [[OP_RDX]], [[OP_RDX1]]
; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0
-; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP9]], i64 1
+; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP11]], i64 1
; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> <i64 0, i64 16>
; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I143]], <2 x ptr> [[P]], i32 8, <2 x i1> splat (i1 true))
; CHECK-NEXT: ret void
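
The notable change in the expected IR above is the strided access: the eight
scalar loads at odd offsets of %arg are replaced by a single @llvm.masked.load
of 15 contiguous doubles starting at %arg + 1, with every other mask bit set,
followed by a shufflevector that compresses the live lanes into a
<8 x double>. A scalar model of that access pattern (function name is
illustrative):

  // Reads the 8 odd-indexed elements Arg[1], Arg[3], ..., Arg[15], matching
  // lanes 0, 2, 4, ..., 14 of the <15 x double> masked load in the CHECK lines.
  void gatherOddElements(const double *Arg, double Out[8]) {
    for (int I = 0; I < 8; ++I)
      Out[I] = Arg[1 + 2 * I];
  }
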
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll
index 33c281d3f0166..f0272d591f0c3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll
@@ -6,25 +6,9 @@ declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32 immarg
define void @rdx_feeds_single_insert(<2 x double> %v, ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr nocapture %arg2) {
; CHECK-LABEL: @rdx_feeds_single_insert(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[LD0_0:%.*]] = load double, ptr [[ARG1:%.*]], align 8
-; CHECK-NEXT: [[MUL1_0:%.*]] = fmul fast double [[LD0_0]], 1.000000e+01
-; CHECK-NEXT: [[GEP0_1:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 1
-; CHECK-NEXT: [[LD0_1:%.*]] = load double, ptr [[GEP0_1]], align 8
-; CHECK-NEXT: [[MUL1_1:%.*]] = fmul fast double [[LD0_1]], 1.100000e+01
-; CHECK-NEXT: [[RDX1_0:%.*]] = fadd fast double [[MUL1_0]], [[MUL1_1]]
-; CHECK-NEXT: [[GEP0_2:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 2
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[GEP0_2]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = fmul fast <4 x double> [[TMP0]], <double 1.200000e+01, double 1.300000e+01, double 1.400000e+01, double 1.500000e+01>
-; CHECK-NEXT: [[GEP0_6:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 6
-; CHECK-NEXT: [[TMP10:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[GEP0_6]], align 8
-; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x double> [[TMP3]], <double 1.600000e+01, double 1.700000e+01>
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[RDX1_0]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP4]]
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = fadd fast double [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP0:%.*]] = load <8 x double>, ptr [[ARG1:%.*]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = fmul fast <8 x double> [[TMP0]], <double 1.000000e+01, double 1.100000e+01, double 1.200000e+01, double 1.300000e+01, double 1.400000e+01, double 1.500000e+01, double 1.600000e+01, double 1.700000e+01>
+; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP1]])
; CHECK-NEXT: [[I:%.*]] = insertelement <2 x double> [[V:%.*]], double [[TMP2]], i64 1
; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> <i64 0, i64 16>
; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I]], <2 x ptr> [[P]], i32 8, <2 x i1> splat (i1 true))
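
Here the whole computation collapses to three instructions: one <8 x double>
load from %arg1, one multiply by the constant vector <10.0, ..., 17.0>, and
one fadd reduction whose result feeds the insertelement. A scalar equivalent
of what the new CHECK lines compute (function name is illustrative):

  // Sum over i of Arg1[i] * (10.0 + i), the value written into lane 1 of %v.
  double rdxFeedsInsert(const double *Arg1) {
    double Sum = 0.0; // start value of llvm.vector.reduce.fadd
    for (int I = 0; I < 8; ++I)
      Sum += Arg1[I] * (10.0 + I); // fmul fast <8 x double>, then the reduction
    return Sum;
  }
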
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll
index 359c24b00e92e..a70a1f67354c2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=slp-vectorizer -S -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=SSE4
; RUN: opt -passes=slp-vectorizer -S -mcpu=bdver2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=AVX
-; RUN: opt -passes=slp-vectorizer -S -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=AVX
+; RUN: opt -passes=slp-vectorizer -S -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=AVX2
; This test checks for a case when a horizontal reduction of floating-point
; adds may look profitable, but is not because it eliminates generation of
@@ -34,6 +34,19 @@ define void @hr() {
; AVX-NEXT: br i1 true, label [[EXIT:%.*]], label [[LOOP]]
; AVX: exit:
; AVX-NEXT: ret void
+;
+; AVX2-LABEL: @hr(
+; AVX2-NEXT: br label [[LOOP:%.*]]
+; AVX2: loop:
+; AVX2-NEXT: [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[OP_RDX:%.*]], [[LOOP]] ]
+; AVX2-NEXT: [[CVT0:%.*]] = uitofp i16 0 to double
+; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, double [[CVT0]], i32 0
+; AVX2-NEXT: [[TMP2:%.*]] = fmul fast <4 x double> zeroinitializer, [[TMP1]]
+; AVX2-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP2]])
+; AVX2-NEXT: [[OP_RDX]] = fadd fast double [[TMP3]], [[PHI0]]
+; AVX2-NEXT: br i1 true, label [[EXIT:%.*]], label [[LOOP]]
+; AVX2: exit:
+; AVX2-NEXT: ret void
;
br label %loop
@@ -80,6 +93,15 @@ define double @hr_or_mul() {
; AVX-NEXT: [[MUL3:%.*]] = fmul fast double 9.500000e+00, [[CVT0]]
; AVX-NEXT: [[ADD3:%.*]] = fadd fast double [[MUL3]], [[ADD2]]
; AVX-NEXT: ret double [[ADD3]]
+;
+; AVX2-LABEL: @hr_or_mul(
+; AVX2-NEXT: [[CVT0:%.*]] = uitofp i16 3 to double
+; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CVT0]], i32 0
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> zeroinitializer
+; AVX2-NEXT: [[TMP3:%.*]] = fmul fast <4 x double> <double 7.000000e+00, double -4.300000e+01, double 2.200000e-02, double 9.500000e+00>, [[TMP2]]
+; AVX2-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]])
+; AVX2-NEXT: [[OP_RDX:%.*]] = fadd fast double [[TMP4]], [[CVT0]]
+; AVX2-NEXT: ret double [[OP_RDX]]
;
%cvt0 = uitofp i16 3 to double
%mul0 = fmul fast double 7.000000e+00, %cvt0
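
The RUN-line split is the visible effect of the patch on this test: core-avx2
now gets its own AVX2 prefix because the FMA consideration is weighed in the
reduction cost model rather than rejecting the match outright, so bdver2 and
core-avx2 can reach different conclusions for the same IR. A hedged sketch of
the tradeoff being weighed (all names and numbers are illustrative; the real
values come from TTI):

  // Keeping the scalar chain lets the backend fuse each fmul/fadd pair into
  // an fma, so it costs roughly NumLanes scalar fmas. The vector rewrite
  // costs one vector fmul plus one fadd reduction. Which side wins is
  // target-dependent, which is why AVX and AVX2 now check different output.
  struct Costs { int ScalarFMA, VecFMul, VecReduceFAdd; };
  bool reductionProfitable(const Costs &C, int NumLanes) {
    int Scalar = NumLanes * C.ScalarFMA;
    int Vector = C.VecFMul + C.VecReduceFAdd;
    return Vector < Scalar;
  }
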
>From 5bbf933ff0236d13b06b13f2d0e0c4289f64111b Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Fri, 8 Aug 2025 20:26:50 +0000
Subject: [PATCH 2/2] Fix formatting
Created using spr 1.3.5
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index afdf73ab58184..9d5b73fc85779 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -23720,8 +23720,8 @@ class HorizontalReduction {
FMF &= FPCI->getFastMathFlags();
Ops.push_back(RdxVal->user_back());
}
- FMACost = canConvertToFMA(
- Ops, getSameOpcode(Ops, TLI), DT, DL, *TTI, TLI);
+ FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL,
+ *TTI, TLI);
if (FMACost.isValid()) {
// Calculate actual FMAD cost.
IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
@@ -23732,7 +23732,8 @@ class HorizontalReduction {
// Also, exclude vector fmul cost.
InstructionCost FMulCost = TTI->getArithmeticInstrCost(
Instruction::FMul, RVecTy, CostKind);
- LLVM_DEBUG(dbgs() << "Minus vector FMul cost: " << FMulCost << "\n");
+ LLVM_DEBUG(dbgs()
+ << "Minus vector FMul cost: " << FMulCost << "\n");
FMACost -= FMulCost;
}
}
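
The reformatted hunk is the heart of the new costing: when the reduced values
can fold into FMAs, the cost of a vector @llvm.fmuladd on the reduced vector
type is charged, minus the vector fmul cost that the tree has already paid
for, leaving only the add-versus-fma delta. A worked example with assumed
numbers (the real values come from getIntrinsicInstrCost and
getArithmeticInstrCost):

  // If fmuladd on RVecTy costs 2 and the plain vector fmul costs 1, the net
  // FMACost charged against the reduction is 2 - 1 = 1: the multiply was
  // already accounted for in the tree cost, so only the fused add remains.
  int netFMACost(int FMulAddCost, int VectorFMulCost) {
    return FMulAddCost - VectorFMulCost; // mirrors 'FMACost -= FMulCost;'
  }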