[llvm] 6d902b6 - Revert "[SLP]Improved/fixed FMAD support in reductions"
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 2 07:58:41 PDT 2025
Author: Alexey Bataev
Date: 2025-09-02T07:58:29-07:00
New Revision: 6d902b67cdae224c0a5f4e13097a82d3148c8198
URL: https://github.com/llvm/llvm-project/commit/6d902b67cdae224c0a5f4e13097a82d3148c8198
DIFF: https://github.com/llvm/llvm-project/commit/6d902b67cdae224c0a5f4e13097a82d3148c8198.diff
LOG: Revert "[SLP]Improved/fixed FMAD support in reductions"
This reverts commit 74230ff2791384fb3285c9e9ab202056959aa095 to fix
bugs found during local testing.
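For context, the reverted change taught the SLP reduction cost model to
recognize fadd reductions whose operands are single-use fmul instructions
and to cost them as llvm.fmuladd calls instead of separate fmul and fadd
operations. A minimal sketch of the IR shape in question (hypothetical,
modeled on the dot_product_fp64 test updated below; not part of this
commit):

; Fast-math dot product: each reduction operand is an fmul feeding the
; fadd chain, so the scalar form can be lowered as fmuladd calls, while
; the vectorized form needs a vector fmul plus a fadd reduction. The
; reverted code compared these two costs when deciding to vectorize.
define double @dot2(ptr %a, ptr %b) {
  %gep.a.1 = getelementptr inbounds double, ptr %a, i64 1
  %gep.b.1 = getelementptr inbounds double, ptr %b, i64 1
  %l.a.0 = load double, ptr %a, align 8
  %l.a.1 = load double, ptr %gep.a.1, align 8
  %l.b.0 = load double, ptr %b, align 8
  %l.b.1 = load double, ptr %gep.b.1, align 8
  %mul.0 = fmul fast double %l.a.0, %l.b.0
  %mul.1 = fmul fast double %l.a.1, %l.b.1
  %add.0 = fadd fast double %mul.0, %mul.1
  ret double %add.0
}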
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e18ff6fed7eab..040e2dafb56a6 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -23842,8 +23842,7 @@ class HorizontalReduction {
/// Attempt to vectorize the tree found by matchAssociativeReduction.
Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
- const TargetLibraryInfo &TLI, AssumptionCache *AC,
- DominatorTree &DT) {
+ const TargetLibraryInfo &TLI, AssumptionCache *AC) {
constexpr unsigned RegMaxNumber = 4;
constexpr unsigned RedValsMaxNumber = 128;
// If there are a sufficient number of reduction values, reduce
@@ -24242,7 +24241,7 @@ class HorizontalReduction {
// Estimate cost.
InstructionCost ReductionCost =
- getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
+ getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
InstructionCost Cost = V.getTreeCost(VL, ReductionCost);
LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
<< " for reduction\n");
@@ -24547,9 +24546,7 @@ class HorizontalReduction {
InstructionCost getReductionCost(TargetTransformInfo *TTI,
ArrayRef<Value *> ReducedVals,
bool IsCmpSelMinMax, FastMathFlags FMF,
- const BoUpSLP &R, DominatorTree &DT,
- const DataLayout &DL,
- const TargetLibraryInfo &TLI) {
+ const BoUpSLP &R) {
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
Type *ScalarTy = ReducedVals.front()->getType();
unsigned ReduxWidth = ReducedVals.size();
@@ -24574,22 +24571,6 @@ class HorizontalReduction {
for (User *U : RdxVal->users()) {
auto *RdxOp = cast<Instruction>(U);
if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
- if (RdxKind == RecurKind::FAdd) {
- InstructionCost FMACost = canConvertToFMA(
- RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI);
- if (FMACost.isValid()) {
- LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
- if (auto *I = dyn_cast<Instruction>(RdxVal)) {
- // Also, exclude scalar fmul cost.
- InstructionCost FMulCost =
- TTI->getInstructionCost(I, CostKind);
- LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
- FMACost -= FMulCost;
- }
- ScalarCost += FMACost;
- continue;
- }
- }
ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
continue;
}
@@ -24654,43 +24635,8 @@ class HorizontalReduction {
auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
std::make_pair(RedTy, true));
VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
- InstructionCost FMACost = InstructionCost::getInvalid();
- if (RdxKind == RecurKind::FAdd) {
- // Check if the reduction operands can be converted to FMA.
- SmallVector<Value *> Ops;
- FastMathFlags FMF;
- FMF.set();
- for (Value *RdxVal : ReducedVals) {
- if (!RdxVal->hasOneUse()) {
- Ops.clear();
- break;
- }
- if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
- FMF &= FPCI->getFastMathFlags();
- Ops.push_back(RdxVal->user_back());
- }
- FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL,
- *TTI, TLI);
- if (FMACost.isValid()) {
- // Calculate actual FMAD cost.
- IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
- {RVecTy, RVecTy, RVecTy}, FMF);
- FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
-
- LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
- // Also, exclude vector fmul cost.
- InstructionCost FMulCost = TTI->getArithmeticInstrCost(
- Instruction::FMul, RVecTy, CostKind);
- LLVM_DEBUG(dbgs()
- << "Minus vector FMul cost: " << FMulCost << "\n");
- FMACost -= FMulCost;
- }
- }
- if (FMACost.isValid())
- VectorCost += FMACost;
- else
- VectorCost +=
- TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
+ VectorCost +=
+ TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
if (RType != RedTy) {
unsigned Opcode = Instruction::Trunc;
if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
@@ -25358,7 +25304,7 @@ bool SLPVectorizerPass::vectorizeHorReduction(
HorizontalReduction HorRdx;
if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
return nullptr;
- return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
+ return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
};
auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
if (TryOperandsAsNewSeeds && FutureSeed == Root) {
@@ -25503,7 +25449,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
if (RedCost >= ScalarCost)
return false;
- return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
+ return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr;
};
if (Candidates.size() == 1)
return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
index 9d2e22bb454e4..9cc6b8739b20f 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
@@ -709,25 +709,34 @@ define float @dot_product_fp32_reorder(ptr %a, ptr %b) {
define double @dot_product_fp64(ptr %a, ptr %b) {
-; CHECK-LABEL: @dot_product_fp64(
-; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
-; CHECK-NEXT: [[L_A_0:%.*]] = load double, ptr [[GEP_A_0]], align 4
-; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds double, ptr [[A]], i32 1
-; CHECK-NEXT: [[L_A_1:%.*]] = load double, ptr [[GEP_A_1]], align 4
-; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds double, ptr [[A]], i32 2
-; CHECK-NEXT: [[L_A_2:%.*]] = load double, ptr [[GEP_A_2]], align 4
-; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
-; CHECK-NEXT: [[L_B_0:%.*]] = load double, ptr [[GEP_B_0]], align 4
-; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds double, ptr [[B]], i32 1
-; CHECK-NEXT: [[L_B_1:%.*]] = load double, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds double, ptr [[B]], i32 2
-; CHECK-NEXT: [[L_B_2:%.*]] = load double, ptr [[GEP_B_2]], align 4
-; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast double [[L_A_0]], [[L_B_0]]
-; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast double [[L_A_1]], [[L_B_1]]
-; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]]
-; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast double [[MUL_0]], [[MUL_1]]
-; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]]
-; CHECK-NEXT: ret double [[ADD_1]]
+; NON-POW2-LABEL: @dot_product_fp64(
+; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
+; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
+; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x double>, ptr [[GEP_A_0]], align 4
+; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x double>, ptr [[GEP_B_0]], align 4
+; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x double> [[TMP1]], [[TMP2]]
+; NON-POW2-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v3f64(double 0.000000e+00, <3 x double> [[TMP3]])
+; NON-POW2-NEXT: ret double [[TMP4]]
+;
+; POW2-ONLY-LABEL: @dot_product_fp64(
+; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
+; POW2-ONLY-NEXT: [[L_A_0:%.*]] = load double, ptr [[GEP_A_0]], align 4
+; POW2-ONLY-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds double, ptr [[A]], i32 1
+; POW2-ONLY-NEXT: [[L_A_1:%.*]] = load double, ptr [[GEP_A_1]], align 4
+; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds double, ptr [[A]], i32 2
+; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load double, ptr [[GEP_A_2]], align 4
+; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
+; POW2-ONLY-NEXT: [[L_B_0:%.*]] = load double, ptr [[GEP_B_0]], align 4
+; POW2-ONLY-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds double, ptr [[B]], i32 1
+; POW2-ONLY-NEXT: [[L_B_1:%.*]] = load double, ptr [[GEP_B_1]], align 4
+; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds double, ptr [[B]], i32 2
+; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load double, ptr [[GEP_B_2]], align 4
+; POW2-ONLY-NEXT: [[MUL_0:%.*]] = fmul fast double [[L_A_0]], [[L_B_0]]
+; POW2-ONLY-NEXT: [[MUL_1:%.*]] = fmul fast double [[L_A_1]], [[L_B_1]]
+; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]]
+; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast double [[MUL_0]], [[MUL_1]]
+; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]]
+; POW2-ONLY-NEXT: ret double [[ADD_1]]
;
%gep.a.0 = getelementptr inbounds double, ptr %a, i32 0
%l.a.0 = load double, ptr %gep.a.0, align 4
@@ -784,13 +793,21 @@ entry:
}
define float @reduce_fadd_after_fmul_of_buildvec(float %a, float %b, float %c) {
-; CHECK-LABEL: @reduce_fadd_after_fmul_of_buildvec(
-; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast float [[A:%.*]], 1.000000e+01
-; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[B:%.*]], 1.000000e+01
-; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[C:%.*]], 1.000000e+01
-; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]]
-; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
-; CHECK-NEXT: ret float [[ADD_1]]
+; NON-POW2-LABEL: @reduce_fadd_after_fmul_of_buildvec(
+; NON-POW2-NEXT: [[TMP1:%.*]] = insertelement <3 x float> poison, float [[A:%.*]], i32 0
+; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[B:%.*]], i32 1
+; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[C:%.*]], i32 2
+; NON-POW2-NEXT: [[TMP4:%.*]] = fmul fast <3 x float> [[TMP3]], splat (float 1.000000e+01)
+; NON-POW2-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP4]])
+; NON-POW2-NEXT: ret float [[TMP5]]
+;
+; POW2-ONLY-LABEL: @reduce_fadd_after_fmul_of_buildvec(
+; POW2-ONLY-NEXT: [[MUL_0:%.*]] = fmul fast float [[A:%.*]], 1.000000e+01
+; POW2-ONLY-NEXT: [[MUL_1:%.*]] = fmul fast float [[B:%.*]], 1.000000e+01
+; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[C:%.*]], 1.000000e+01
+; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]]
+; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
+; POW2-ONLY-NEXT: ret float [[ADD_1]]
;
%mul.0 = fmul fast float %a, 10.0
%mul.1 = fmul fast float %b, 10.0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
index f921278cdecf3..1922e935cee4b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
@@ -10,24 +10,19 @@ declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32 immarg
define void @test(ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr nocapture %arg2) {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[GEP1_0:%.*]] = getelementptr inbounds double, ptr [[ARG:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ARG:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, <8 x ptr> [[TMP1]], <8 x i64> <i64 1, i64 3, i64 5, i64 7, i64 9, i64 11, i64 13, i64 15>
; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1:%.*]], i64 16
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[GEP2_0]], align 8
-; CHECK-NEXT: [[GEP2_4:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 20
-; CHECK-NEXT: [[TMP1:%.*]] = call <15 x double> @llvm.masked.load.v15f64.p0(ptr [[GEP1_0]], i32 8, <15 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <15 x double> poison)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <15 x double> [[TMP1]], <15 x double> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x double>, ptr [[ARG1]], align 8
-; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <8 x double> [[TMP3]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <4 x double> [[TMP0]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP4]])
-; CHECK-NEXT: [[TMP8:%.*]] = load <4 x double>, ptr [[GEP2_4]], align 8
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <15 x double> [[TMP1]], <15 x double> poison, <4 x i32> <i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x double> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[RDX_OP:%.*]] = fadd fast <4 x double> [[TMP6]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[RDX_OP]])
-; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i64 0
-; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP11]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> splat (i1 true), <8 x double> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x double>, ptr [[GEP2_0]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <8 x double> [[TMP4]], [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = load <8 x double>, ptr [[ARG1]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <8 x double> [[TMP6]], [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP5]])
+; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0
+; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP9]], i64 1
; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> <i64 0, i64 16>
; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I143]], <2 x ptr> [[P]], i32 8, <2 x i1> splat (i1 true))
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll
index 8b65461028fb1..82fb5a46fee7c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=slp-vectorizer -S -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=SSE4
; RUN: opt -passes=slp-vectorizer -S -mcpu=bdver2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=AVX
-; RUN: opt -passes=slp-vectorizer -S -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=AVX2
+; RUN: opt -passes=slp-vectorizer -S -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=AVX
; This test checks for a case when a horizontal reduction of floating-point
; adds may look profitable, but is not because it eliminates generation of
; floating-point FMAs that would be more profitable.
@@ -26,27 +26,13 @@ define void @hr() {
; AVX: loop:
; AVX-NEXT: [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[ADD3:%.*]], [[LOOP]] ]
; AVX-NEXT: [[CVT0:%.*]] = uitofp i16 0 to double
-; AVX-NEXT: [[MUL0:%.*]] = fmul fast double 0.000000e+00, [[CVT0]]
-; AVX-NEXT: [[ADD0:%.*]] = fadd fast double [[MUL0]], [[PHI0]]
-; AVX-NEXT: [[ADD1:%.*]] = fadd fast double 0.000000e+00, [[ADD0]]
-; AVX-NEXT: [[ADD2:%.*]] = fadd fast double 0.000000e+00, [[ADD1]]
-; AVX-NEXT: [[ADD3]] = fadd fast double 0.000000e+00, [[ADD2]]
+; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, double [[CVT0]], i32 0
+; AVX-NEXT: [[TMP2:%.*]] = fmul fast <4 x double> zeroinitializer, [[TMP1]]
+; AVX-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP2]])
+; AVX-NEXT: [[ADD3]] = fadd fast double [[TMP3]], [[PHI0]]
; AVX-NEXT: br i1 true, label [[EXIT:%.*]], label [[LOOP]]
; AVX: exit:
; AVX-NEXT: ret void
-;
-; AVX2-LABEL: @hr(
-; AVX2-NEXT: br label [[LOOP:%.*]]
-; AVX2: loop:
-; AVX2-NEXT: [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[OP_RDX:%.*]], [[LOOP]] ]
-; AVX2-NEXT: [[CVT0:%.*]] = uitofp i16 0 to double
-; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, double [[CVT0]], i32 0
-; AVX2-NEXT: [[TMP2:%.*]] = fmul fast <4 x double> zeroinitializer, [[TMP1]]
-; AVX2-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP2]])
-; AVX2-NEXT: [[OP_RDX]] = fadd fast double [[TMP3]], [[PHI0]]
-; AVX2-NEXT: br i1 true, label [[EXIT:%.*]], label [[LOOP]]
-; AVX2: exit:
-; AVX2-NEXT: ret void
;
br label %loop
@@ -84,24 +70,12 @@ define double @hr_or_mul() {
;
; AVX-LABEL: @hr_or_mul(
; AVX-NEXT: [[CVT0:%.*]] = uitofp i16 3 to double
-; AVX-NEXT: [[TMP4:%.*]] = fmul fast double 7.000000e+00, [[CVT0]]
+; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CVT0]], i32 0
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> zeroinitializer
+; AVX-NEXT: [[TMP3:%.*]] = fmul fast <4 x double> <double 7.000000e+00, double -4.300000e+01, double 2.200000e-02, double 9.500000e+00>, [[TMP2]]
+; AVX-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]])
; AVX-NEXT: [[ADD3:%.*]] = fadd fast double [[TMP4]], [[CVT0]]
-; AVX-NEXT: [[MUL1:%.*]] = fmul fast double -4.300000e+01, [[CVT0]]
-; AVX-NEXT: [[ADD1:%.*]] = fadd fast double [[MUL1]], [[ADD3]]
-; AVX-NEXT: [[MUL2:%.*]] = fmul fast double 2.200000e-02, [[CVT0]]
-; AVX-NEXT: [[ADD2:%.*]] = fadd fast double [[MUL2]], [[ADD1]]
-; AVX-NEXT: [[MUL3:%.*]] = fmul fast double 9.500000e+00, [[CVT0]]
-; AVX-NEXT: [[ADD4:%.*]] = fadd fast double [[MUL3]], [[ADD2]]
-; AVX-NEXT: ret double [[ADD4]]
-;
-; AVX2-LABEL: @hr_or_mul(
-; AVX2-NEXT: [[CVT0:%.*]] = uitofp i16 3 to double
-; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CVT0]], i32 0
-; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> zeroinitializer
-; AVX2-NEXT: [[TMP3:%.*]] = fmul fast <4 x double> <double 7.000000e+00, double -4.300000e+01, double 2.200000e-02, double 9.500000e+00>, [[TMP2]]
-; AVX2-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]])
-; AVX2-NEXT: [[OP_RDX:%.*]] = fadd fast double [[TMP4]], [[CVT0]]
-; AVX2-NEXT: ret double [[OP_RDX]]
+; AVX-NEXT: ret double [[ADD3]]
;
%cvt0 = uitofp i16 3 to double
%mul0 = fmul fast double 7.000000e+00, %cvt0