[llvm] [SLP] Emit reduction instead of 2 extracts + scalar op, when vectorizing operands (PR #147583)
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 8 11:49:19 PDT 2025
https://github.com/alexey-bataev updated https://github.com/llvm/llvm-project/pull/147583
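The change teaches tryToVectorize() to try matching the final scalar op over two vectorized operands as a 2-element horizontal reduction (gated on the TTI reduction cost beating the extract + scalar-op cost) before falling back to tryToVectorizeList(). As a rough sketch of the resulting IR difference, drawn from the test updates below rather than from any single test verbatim:

    ; before: lanes combined with two extracts and a scalar fadd
    %mul   = fmul fast <2 x float> %a, %b
    %lane0 = extractelement <2 x float> %mul, i32 0
    %lane1 = extractelement <2 x float> %mul, i32 1
    %sum   = fadd fast float %lane0, %lane1

    ; after: a single 2-wide reduction intrinsic, emitted only when the cost model says it is cheaper
    %mul = fmul fast <2 x float> %a, %b
    %sum = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> %mul)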
>From 84ff08f8851e8587f26617f6cf9947830df73110 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Tue, 8 Jul 2025 18:37:52 +0000
Subject: [PATCH 1/2] [𝘀𝗽𝗿] initial version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.5
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 169 ++++++++++++------
.../SLPVectorizer/AArch64/commute.ll | 8 +-
.../SLPVectorizer/AArch64/reduce-fadd.ll | 30 ++--
.../SLPVectorizer/AArch64/slp-fma-loss.ll | 10 +-
.../Transforms/SLPVectorizer/RISCV/revec.ll | 24 +--
.../SLPVectorizer/RISCV/vec3-base.ll | 38 ++--
.../SLPVectorizer/X86/dot-product.ll | 128 ++++++++-----
.../SLPVectorizer/extracts-with-undefs.ll | 6 +-
8 files changed, 236 insertions(+), 177 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index c93af749507f8..bec393051a257 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -21676,58 +21676,6 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
return Changed;
}
-bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
- if (!I)
- return false;
-
- if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
- return false;
-
- Value *P = I->getParent();
-
- // Vectorize in current basic block only.
- auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
- auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
- if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
- R.isDeleted(Op0) || R.isDeleted(Op1))
- return false;
-
- // First collect all possible candidates
- SmallVector<std::pair<Value *, Value *>, 4> Candidates;
- Candidates.emplace_back(Op0, Op1);
-
- auto *A = dyn_cast<BinaryOperator>(Op0);
- auto *B = dyn_cast<BinaryOperator>(Op1);
- // Try to skip B.
- if (A && B && B->hasOneUse()) {
- auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
- auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
- if (B0 && B0->getParent() == P && !R.isDeleted(B0))
- Candidates.emplace_back(A, B0);
- if (B1 && B1->getParent() == P && !R.isDeleted(B1))
- Candidates.emplace_back(A, B1);
- }
- // Try to skip A.
- if (B && A && A->hasOneUse()) {
- auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
- auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
- if (A0 && A0->getParent() == P && !R.isDeleted(A0))
- Candidates.emplace_back(A0, B);
- if (A1 && A1->getParent() == P && !R.isDeleted(A1))
- Candidates.emplace_back(A1, B);
- }
-
- if (Candidates.size() == 1)
- return tryToVectorizeList({Op0, Op1}, R);
-
- // We have multiple options. Try to pick the single best.
- std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
- if (!BestCandidate)
- return false;
- return tryToVectorizeList(
- {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
-}
-
namespace {
/// Model horizontal reductions.
@@ -21770,6 +21718,8 @@ class HorizontalReduction {
/// Checks if the optimization of original scalar identity operations on
/// matched horizontal reductions is enabled and allowed.
bool IsSupportedHorRdxIdentityOp = false;
+ /// The minimum number of the reduced values.
+ const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
/// Contains vector values for reduction including their scale factor and
/// signedness.
SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
@@ -22068,6 +22018,24 @@ class HorizontalReduction {
public:
HorizontalReduction() = default;
+ HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
+ : ReductionRoot(I), ReductionLimit(2) {
+ RdxKind = HorizontalReduction::getRdxKind(I);
+ ReductionOps.emplace_back().push_back(I);
+ ReducedVals.emplace_back().assign(Ops.begin(), Ops.end());
+ for (Value *V : Ops)
+ ReducedValsToOps[V].push_back(I);
+ }
+
+ bool matchReductionForOperands() const {
+ // Analyze "regular" integer/FP types for reductions - no target-specific
+ // types or pointers.
+ assert(ReductionRoot && "Reduction root is not set!");
+ if (!isVectorizable(RdxKind, cast<Instruction>(ReductionRoot)))
+ return false;
+
+ return true;
+ }
/// Try to find a reduction tree.
bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
@@ -22235,7 +22203,6 @@ class HorizontalReduction {
/// Attempt to vectorize the tree found by matchAssociativeReduction.
Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
const TargetLibraryInfo &TLI, AssumptionCache *AC) {
- const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
constexpr unsigned RegMaxNumber = 4;
constexpr unsigned RedValsMaxNumber = 128;
// If there are a sufficient number of reduction values, reduce
@@ -23740,6 +23707,102 @@ bool SLPVectorizerPass::vectorizeHorReduction(
return Res;
}
+bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
+ if (!I)
+ return false;
+
+ if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
+ return false;
+
+ Value *P = I->getParent();
+
+ // Vectorize in current basic block only.
+ auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
+ auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
+ if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
+ R.isDeleted(Op0) || R.isDeleted(Op1))
+ return false;
+
+ // First collect all possible candidates
+ SmallVector<std::pair<Value *, Value *>, 4> Candidates;
+ Candidates.emplace_back(Op0, Op1);
+
+ auto *A = dyn_cast<BinaryOperator>(Op0);
+ auto *B = dyn_cast<BinaryOperator>(Op1);
+ // Try to skip B.
+ if (A && B && B->hasOneUse()) {
+ auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
+ auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
+ if (B0 && B0->getParent() == P && !R.isDeleted(B0))
+ Candidates.emplace_back(A, B0);
+ if (B1 && B1->getParent() == P && !R.isDeleted(B1))
+ Candidates.emplace_back(A, B1);
+ }
+ // Try to skip A.
+ if (B && A && A->hasOneUse()) {
+ auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
+ auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
+ if (A0 && A0->getParent() == P && !R.isDeleted(A0))
+ Candidates.emplace_back(A0, B);
+ if (A1 && A1->getParent() == P && !R.isDeleted(A1))
+ Candidates.emplace_back(A1, B);
+ }
+
+ auto TryToReduce = [this, &R, &TTI=*TTI](Instruction *Inst, ArrayRef<Value *> Ops) {
+ if (!isReductionCandidate(Inst))
+ return false;
+ Type *Ty = Inst->getType();
+ if (!isValidElementType(Ty) || Ty->isPointerTy())
+ return false;
+ HorizontalReduction HorRdx(Inst, Ops);
+ if (!HorRdx.matchReductionForOperands())
+ return false;
+ // Check the cost of operations.
+ VectorType *VecTy= getWidenedType(Ty, Ops.size());
+ constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ InstructionCost ScalarCost =
+ TTI.getScalarizationOverhead(
+ VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
+ /*Extract=*/true, CostKind) +
+ TTI.getInstructionCost(Inst, CostKind);
+ InstructionCost RedCost;
+ switch (::getRdxKind(Inst)) {
+ case RecurKind::Add:
+ case RecurKind::Mul:
+ case RecurKind::Or:
+ case RecurKind::And:
+ case RecurKind::Xor:
+ case RecurKind::FAdd:
+ case RecurKind::FMul: {
+ FastMathFlags FMF;
+ if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
+ FMF = FPCI->getFastMathFlags();
+ RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
+ CostKind);
+ break;
+ }
+ default:
+ return false;
+ }
+ if (RedCost >= ScalarCost)
+ return false;
+
+ return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr;
+ };
+ if (Candidates.size() == 1)
+ return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
+
+ // We have multiple options. Try to pick the single best.
+ std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
+ if (!BestCandidate)
+ return false;
+ return TryToReduce(I, {Candidates[*BestCandidate].first,
+ Candidates[*BestCandidate].second}) ||
+ tryToVectorizeList({Candidates[*BestCandidate].first,
+ Candidates[*BestCandidate].second},
+ R);
+}
+
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
BasicBlock *BB, BoUpSLP &R) {
SmallVector<WeakTrackingVH> PostponedInsts;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
index 19b6d82818532..442769937ac12 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
@@ -16,9 +16,7 @@ define void @test1(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) {
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]])
; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
; CHECK: for.end27:
@@ -57,9 +55,7 @@ define void @test2(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) {
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]])
; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
; CHECK: for.end27:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
index 677d52bf3b4c3..8d4a1152fe4da 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll
@@ -3,13 +3,19 @@
; RUN: opt < %s -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,FULLFP16
define half @reduce_fast_half2(<2 x half> %vec2) {
-; CHECK-LABEL: define half @reduce_fast_half2(
-; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
-; CHECK-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
-; CHECK-NEXT: ret half [[ADD1]]
+; NOFP16-LABEL: define half @reduce_fast_half2(
+; NOFP16-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
+; NOFP16-NEXT: [[ENTRY:.*:]]
+; NOFP16-NEXT: [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
+; NOFP16-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
+; NOFP16-NEXT: [[ADD1:%.*]] = fadd fast half [[ELT1]], [[ELT0]]
+; NOFP16-NEXT: ret half [[ADD1]]
+;
+; FULLFP16-LABEL: define half @reduce_fast_half2(
+; FULLFP16-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
+; FULLFP16-NEXT: [[ENTRY:.*:]]
+; FULLFP16-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> [[VEC2]])
+; FULLFP16-NEXT: ret half [[TMP0]]
;
entry:
%elt0 = extractelement <2 x half> %vec2, i64 0
@@ -20,7 +26,7 @@ entry:
define half @reduce_half2(<2 x half> %vec2) {
; CHECK-LABEL: define half @reduce_half2(
-; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <2 x half> [[VEC2:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x half> [[VEC2]], i64 0
; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x half> [[VEC2]], i64 1
@@ -269,9 +275,7 @@ define float @reduce_fast_float2(<2 x float> %vec2) {
; CHECK-LABEL: define float @reduce_fast_float2(
; CHECK-SAME: <2 x float> [[VEC2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x float> [[VEC2]], i64 0
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[VEC2]], i64 1
-; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float [[ELT1]], [[ELT0]]
+; CHECK-NEXT: [[ADD1:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[VEC2]])
; CHECK-NEXT: ret float [[ADD1]]
;
entry:
@@ -409,9 +413,7 @@ define double @reduce_fast_double2(<2 x double> %vec2) {
; CHECK-LABEL: define double @reduce_fast_double2(
; CHECK-SAME: <2 x double> [[VEC2:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x double> [[VEC2]], i64 0
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x double> [[VEC2]], i64 1
-; CHECK-NEXT: [[ADD1:%.*]] = fadd fast double [[ELT1]], [[ELT0]]
+; CHECK-NEXT: [[ADD1:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[VEC2]])
; CHECK-NEXT: ret double [[ADD1]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
index 03f67ecb3e695..543f19225d74f 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
@@ -216,9 +216,7 @@ define void @slp_profitable_missing_fmf_nnans_only(ptr %A, ptr %B) {
define float @slp_not_profitable_in_loop(float %x, ptr %A) {
; CHECK-LABEL: @slp_not_profitable_in_loop(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 2
-; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_A_2]], align 4
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4
; CHECK-NEXT: [[L_3:%.*]] = load float, ptr [[A]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> <float poison, float 3.000000e+00>, float [[X:%.*]], i32 0
; CHECK-NEXT: br label [[LOOP:%.*]]
@@ -226,12 +224,8 @@ define float @slp_not_profitable_in_loop(float %x, ptr %A) {
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]]
-; CHECK-NEXT: [[MUL12:%.*]] = fmul fast float 3.000000e+00, [[L_1]]
; CHECK-NEXT: [[MUL16:%.*]] = fmul fast float 3.000000e+00, [[L_3]]
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL12]], [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
-; CHECK-NEXT: [[ADD13:%.*]] = fadd fast float [[ADD]], [[TMP4]]
+; CHECK-NEXT: [[ADD13:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP2]])
; CHECK-NEXT: [[RED_NEXT]] = fadd fast float [[ADD13]], [[MUL16]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV]], 10
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
index 651f565412830..1116f8a7fbe27 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll
@@ -141,33 +141,21 @@ define ptr @test4() {
; POWEROF2-NEXT: [[TMP1:%.*]] = fadd <8 x float> zeroinitializer, zeroinitializer
; POWEROF2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 1, i32 2>
; POWEROF2-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
-; POWEROF2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <2 x i32> <i32 0, i32 4>
; POWEROF2-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0)
; POWEROF2-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2)
; POWEROF2-NEXT: br label [[TMP8:%.*]]
-; POWEROF2: 7:
+; POWEROF2: 6:
; POWEROF2-NEXT: br label [[TMP8]]
-; POWEROF2: 8:
-; POWEROF2-NEXT: [[TMP9:%.*]] = phi <2 x float> [ poison, [[TMP7:%.*]] ], [ [[TMP4]], [[TMP0:%.*]] ]
-; POWEROF2-NEXT: [[TMP10:%.*]] = phi <4 x float> [ poison, [[TMP7]] ], [ [[TMP6]], [[TMP0]] ]
+; POWEROF2: 7:
+; POWEROF2-NEXT: [[TMP10:%.*]] = phi <4 x float> [ poison, [[TMP7:%.*]] ], [ [[TMP6]], [[TMP0:%.*]] ]
; POWEROF2-NEXT: br label [[TMP11:%.*]]
-; POWEROF2: 11:
+; POWEROF2: 9:
; POWEROF2-NEXT: [[TMP12:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 0)
; POWEROF2-NEXT: [[TMP13:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer
; POWEROF2-NEXT: [[TMP14:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> [[TMP10]], i64 2)
; POWEROF2-NEXT: [[TMP15:%.*]] = fmul <2 x float> zeroinitializer, [[TMP14]]
-; POWEROF2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP9]], i32 0
-; POWEROF2-NEXT: [[TMP17:%.*]] = fmul float 0.000000e+00, [[TMP18]]
-; POWEROF2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP9]], i32 1
-; POWEROF2-NEXT: [[TMP19:%.*]] = fmul float [[TMP30]], 0.000000e+00
-; POWEROF2-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
-; POWEROF2-NEXT: [[TMP21:%.*]] = fadd reassoc nsz float [[TMP20]], [[TMP17]]
-; POWEROF2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
-; POWEROF2-NEXT: [[TMP23:%.*]] = fadd reassoc nsz float [[TMP22]], [[TMP19]]
-; POWEROF2-NEXT: [[TMP24:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
-; POWEROF2-NEXT: [[TMP25:%.*]] = fadd reassoc nsz float [[TMP21]], [[TMP24]]
-; POWEROF2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
-; POWEROF2-NEXT: [[TMP27:%.*]] = fadd reassoc nsz float [[TMP23]], [[TMP26]]
+; POWEROF2-NEXT: [[TMP25:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP13]])
+; POWEROF2-NEXT: [[TMP27:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP15]])
; POWEROF2-NEXT: [[TMP28:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP25]])
; POWEROF2-NEXT: [[TMP29:%.*]] = tail call float @llvm.sqrt.f32(float [[TMP27]])
; POWEROF2-NEXT: ret ptr null
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
index 481d586e6658a..27de36e601512 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
@@ -513,21 +513,16 @@ define i32 @dot_product_i32(ptr %a, ptr %b) {
;
; POW2-ONLY-LABEL: @dot_product_i32(
; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
-; POW2-ONLY-NEXT: [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4
-; POW2-ONLY-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1
-; POW2-ONLY-NEXT: [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4
; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4
; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
-; POW2-ONLY-NEXT: [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4
-; POW2-ONLY-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1
-; POW2-ONLY-NEXT: [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4
; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4
-; POW2-ONLY-NEXT: [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]]
-; POW2-ONLY-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]]
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_A_0]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[GEP_B_0]], align 4
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = mul nsw <2 x i32> [[TMP1]], [[TMP2]]
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_A_2]], [[L_B_2]]
-; POW2-ONLY-NEXT: [[ADD_0:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
+; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP3]])
; POW2-ONLY-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT: ret i32 [[ADD_1]]
;
@@ -568,21 +563,16 @@ define i32 @dot_product_i32_reorder(ptr %a, ptr %b) {
;
; POW2-ONLY-LABEL: @dot_product_i32_reorder(
; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 0
-; POW2-ONLY-NEXT: [[L_A_0:%.*]] = load i32, ptr [[GEP_A_0]], align 4
-; POW2-ONLY-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 1
-; POW2-ONLY-NEXT: [[L_A_1:%.*]] = load i32, ptr [[GEP_A_1]], align 4
; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 2
; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load i32, ptr [[GEP_A_2]], align 4
; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 0
-; POW2-ONLY-NEXT: [[L_B_0:%.*]] = load i32, ptr [[GEP_B_0]], align 4
-; POW2-ONLY-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 1
-; POW2-ONLY-NEXT: [[L_B_1:%.*]] = load i32, ptr [[GEP_B_1]], align 4
; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 2
; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load i32, ptr [[GEP_B_2]], align 4
-; POW2-ONLY-NEXT: [[MUL_0:%.*]] = mul nsw i32 [[L_A_0]], [[L_B_0]]
-; POW2-ONLY-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[L_A_1]], [[L_B_1]]
+; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[GEP_A_0]], align 4
+; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[GEP_B_0]], align 4
+; POW2-ONLY-NEXT: [[TMP3:%.*]] = mul nsw <2 x i32> [[TMP1]], [[TMP2]]
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[L_A_2]], [[L_B_2]]
-; POW2-ONLY-NEXT: [[ADD_0:%.*]] = add i32 [[MUL_1]], [[MUL_0]]
+; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP3]])
; POW2-ONLY-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT: ret i32 [[ADD_1]]
;
@@ -630,9 +620,7 @@ define float @dot_product_fp32(ptr %a, ptr %b) {
; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4
; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]]
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
-; POW2-ONLY-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
-; POW2-ONLY-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
-; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
+; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP3]])
; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT: ret float [[ADD_1]]
;
@@ -682,9 +670,7 @@ define float @dot_product_fp32_reorder(ptr %a, ptr %b) {
; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP_B_0]], align 4
; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP2]]
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]]
-; POW2-ONLY-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
-; POW2-ONLY-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
-; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[TMP5]], [[TMP4]]
+; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP3]])
; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT: ret float [[ADD_1]]
;
@@ -733,9 +719,7 @@ define double @dot_product_fp64(ptr %a, ptr %b) {
; POW2-ONLY-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[GEP_B_0]], align 4
; POW2-ONLY-NEXT: [[TMP3:%.*]] = fmul fast <2 x double> [[TMP1]], [[TMP2]]
; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]]
-; POW2-ONLY-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
-; POW2-ONLY-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
-; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast double [[TMP4]], [[TMP5]]
+; POW2-ONLY-NEXT: [[ADD_0:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP3]])
; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]]
; POW2-ONLY-NEXT: ret double [[ADD_1]]
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
index f16c879c451c2..6eb648f2b78ce 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,CHECK-NOAVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,CHECK-AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,CHECK-AVX
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,CHECK-AVX
;
; dot4(ptr x, ptr y) - ((xptr y[0])+(xptr y[1])+(xptr y[2])+(xptr y[3]))
@@ -231,20 +231,29 @@ define float @dot3f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %pt
}
define double @dot3f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) {
-; CHECK-LABEL: @dot3f64_fast(
-; CHECK-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1
-; CHECK-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1
-; CHECK-NEXT: [[X0:%.*]] = load double, ptr [[PTRX]], align 4
-; CHECK-NEXT: [[Y0:%.*]] = load double, ptr [[PTRY]], align 4
-; CHECK-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]]
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
-; CHECK-NEXT: [[DOT01:%.*]] = fadd fast double [[MUL0]], [[TMP4]]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
-; CHECK-NEXT: [[DOT012:%.*]] = fadd fast double [[DOT01]], [[TMP5]]
-; CHECK-NEXT: ret double [[DOT012]]
+; CHECK-NOAVX-LABEL: @dot3f64_fast(
+; CHECK-NOAVX-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1
+; CHECK-NOAVX-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1
+; CHECK-NOAVX-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4
+; CHECK-NOAVX-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4
+; CHECK-NOAVX-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NOAVX-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP3]])
+; CHECK-NOAVX-NEXT: ret double [[TMP4]]
+;
+; CHECK-AVX-LABEL: @dot3f64_fast(
+; CHECK-AVX-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1
+; CHECK-AVX-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1
+; CHECK-AVX-NEXT: [[X0:%.*]] = load double, ptr [[PTRX]], align 4
+; CHECK-AVX-NEXT: [[Y0:%.*]] = load double, ptr [[PTRY]], align 4
+; CHECK-AVX-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]]
+; CHECK-AVX-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX1]], align 4
+; CHECK-AVX-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY1]], align 4
+; CHECK-AVX-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; CHECK-AVX-NEXT: [[DOT01:%.*]] = fadd fast double [[MUL0]], [[TMP4]]
+; CHECK-AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; CHECK-AVX-NEXT: [[DOT012:%.*]] = fadd fast double [[DOT01]], [[TMP5]]
+; CHECK-AVX-NEXT: ret double [[DOT012]]
;
%ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
%ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
@@ -265,20 +274,29 @@ define double @dot3f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(3
}
define float @dot3f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
-; CHECK-LABEL: @dot3f32_fast(
-; CHECK-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1
-; CHECK-NEXT: [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1
-; CHECK-NEXT: [[X0:%.*]] = load float, ptr [[PTRX]], align 4
-; CHECK-NEXT: [[Y0:%.*]] = load float, ptr [[PTRY]], align 4
-; CHECK-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]]
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY1]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
-; CHECK-NEXT: [[DOT01:%.*]] = fadd fast float [[MUL0]], [[TMP4]]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
-; CHECK-NEXT: [[DOT012:%.*]] = fadd fast float [[DOT01]], [[TMP5]]
-; CHECK-NEXT: ret float [[DOT012]]
+; CHECK-NOAVX-LABEL: @dot3f32_fast(
+; CHECK-NOAVX-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1
+; CHECK-NOAVX-NEXT: [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1
+; CHECK-NOAVX-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4
+; CHECK-NOAVX-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY1]], align 4
+; CHECK-NOAVX-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
+; CHECK-NOAVX-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP3]])
+; CHECK-NOAVX-NEXT: ret float [[TMP4]]
+;
+; CHECK-AVX-LABEL: @dot3f32_fast(
+; CHECK-AVX-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1
+; CHECK-AVX-NEXT: [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1
+; CHECK-AVX-NEXT: [[X0:%.*]] = load float, ptr [[PTRX]], align 4
+; CHECK-AVX-NEXT: [[Y0:%.*]] = load float, ptr [[PTRY]], align 4
+; CHECK-AVX-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]]
+; CHECK-AVX-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX1]], align 4
+; CHECK-AVX-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY1]], align 4
+; CHECK-AVX-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
+; CHECK-AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; CHECK-AVX-NEXT: [[DOT01:%.*]] = fadd fast float [[MUL0]], [[TMP4]]
+; CHECK-AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; CHECK-AVX-NEXT: [[DOT012:%.*]] = fadd fast float [[DOT01]], [[TMP5]]
+; CHECK-AVX-NEXT: ret float [[DOT012]]
;
%ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1
%ptry1 = getelementptr inbounds float, ptr %ptry, i64 1
@@ -347,14 +365,21 @@ define float @dot2f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %pt
}
define double @dot2f64_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
-; CHECK-LABEL: @dot2f64_fast(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX:%.*]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY:%.*]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
-; CHECK-NEXT: [[DOT01:%.*]] = fadd fast double [[TMP4]], [[TMP5]]
-; CHECK-NEXT: ret double [[DOT01]]
+; CHECK-NOAVX-LABEL: @dot2f64_fast(
+; CHECK-NOAVX-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX:%.*]], align 4
+; CHECK-NOAVX-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY:%.*]], align 4
+; CHECK-NOAVX-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NOAVX-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP3]])
+; CHECK-NOAVX-NEXT: ret double [[TMP4]]
+;
+; CHECK-AVX-LABEL: @dot2f64_fast(
+; CHECK-AVX-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX:%.*]], align 4
+; CHECK-AVX-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY:%.*]], align 4
+; CHECK-AVX-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+; CHECK-AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+; CHECK-AVX-NEXT: [[DOT01:%.*]] = fadd fast double [[TMP4]], [[TMP5]]
+; CHECK-AVX-NEXT: ret double [[DOT01]]
;
%ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1
%ptry1 = getelementptr inbounds double, ptr %ptry, i64 1
@@ -369,14 +394,21 @@ define double @dot2f64_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(1
}
define float @dot2f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) {
-; CHECK-LABEL: @dot2f32_fast(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX:%.*]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY:%.*]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
-; CHECK-NEXT: [[DOT01:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
-; CHECK-NEXT: ret float [[DOT01]]
+; CHECK-NOAVX-LABEL: @dot2f32_fast(
+; CHECK-NOAVX-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX:%.*]], align 4
+; CHECK-NOAVX-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY:%.*]], align 4
+; CHECK-NOAVX-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
+; CHECK-NOAVX-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP3]])
+; CHECK-NOAVX-NEXT: ret float [[TMP4]]
+;
+; CHECK-AVX-LABEL: @dot2f32_fast(
+; CHECK-AVX-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX:%.*]], align 4
+; CHECK-AVX-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[PTRY:%.*]], align 4
+; CHECK-AVX-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]]
+; CHECK-AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; CHECK-AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; CHECK-AVX-NEXT: [[DOT01:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
+; CHECK-AVX-NEXT: ret float [[DOT01]]
;
%ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1
%ptry1 = getelementptr inbounds float, ptr %ptry, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll b/llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll
index dca34b681032c..a64075db37ba1 100644
--- a/llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll
@@ -9,9 +9,9 @@ define void @test() {
; CHECK: body:
; CHECK-NEXT: [[PHI1:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ 0.000000e+00, [[BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY]] ], [ zeroinitializer, [[BODY]] ]
-; CHECK-NEXT: [[MUL_I478_I:%.*]] = fmul fast double [[PHI1]], 0.000000e+00
-; CHECK-NEXT: [[MUL7_I485_I:%.*]] = fmul fast double undef, 0.000000e+00
-; CHECK-NEXT: [[ADD8_I_I:%.*]] = fadd fast double [[MUL_I478_I]], [[MUL7_I485_I]]
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> <double poison, double 0.000000e+00>, double [[PHI1]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <2 x double> <double 0.000000e+00, double undef>, [[TMP8]]
+; CHECK-NEXT: [[ADD8_I_I:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP9]])
; CHECK-NEXT: [[CMP42_I:%.*]] = fcmp fast ole double [[ADD8_I_I]], 0.000000e+00
; CHECK-NEXT: br i1 false, label [[BODY]], label [[EXIT:%.*]]
; CHECK: exit:
>From 85f3eab645d9158d966d1142d74df00d7b8349a2 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Tue, 8 Jul 2025 18:49:09 +0000
Subject: [PATCH 2/2] Fix formatting
Created using spr 1.3.5
---
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index bec393051a257..5b5115799e557 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -23748,7 +23748,8 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
Candidates.emplace_back(A1, B);
}
- auto TryToReduce = [this, &R, &TTI=*TTI](Instruction *Inst, ArrayRef<Value *> Ops) {
+ auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
+ ArrayRef<Value *> Ops) {
if (!isReductionCandidate(Inst))
return false;
Type *Ty = Inst->getType();
@@ -23758,7 +23759,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
if (!HorRdx.matchReductionForOperands())
return false;
// Check the cost of operations.
- VectorType *VecTy= getWidenedType(Ty, Ops.size());
+ VectorType *VecTy = getWidenedType(Ty, Ops.size());
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
InstructionCost ScalarCost =
TTI.getScalarizationOverhead(