[llvm] [InstCombine] Fold ordered fadd reductions to `llvm.vector.reduce.fadd` (PR #180196)
Benjamin Maxwell via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 6 06:27:11 PST 2026
https://github.com/MacDue created https://github.com/llvm/llvm-project/pull/180196
This pattern is intended to match trivial cases such as:
```
float16_t reduce_f16(float16x8_t a) {
return a[0] + a[1] + a[2] + a[3] + a[4] + a[5] + a[6] + a[7];
}
```
Which are currently handled poorly by SLP (at -O3), as SLP requires reductions to be associative: https://godbolt.org/z/rPoY4xahs
>From d9ee97b78639c369f099f17809c907c10078dac7 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 6 Feb 2026 14:12:20 +0000
Subject: [PATCH 1/2] Precommit tests
---
.../InstCombine/fp-vector-reduce.ll | 204 ++++++++++++++++++
1 file changed, 204 insertions(+)
create mode 100644 llvm/test/Transforms/InstCombine/fp-vector-reduce.ll
diff --git a/llvm/test/Transforms/InstCombine/fp-vector-reduce.ll b/llvm/test/Transforms/InstCombine/fp-vector-reduce.ll
new file mode 100644
index 0000000000000..53dee189d758d
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/fp-vector-reduce.ll
@@ -0,0 +1,204 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+define half @fp16_reduce(<8 x half> %a) {
+; CHECK-LABEL: define half @fp16_reduce(
+; CHECK-SAME: <8 x half> [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <8 x half> [[A]], i64 0
+; CHECK-NEXT: [[CONV:%.*]] = fpext half [[VECEXT]] to float
+; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <8 x half> [[A]], i64 1
+; CHECK-NEXT: [[CONV2:%.*]] = fpext half [[VECEXT1]] to float
+; CHECK-NEXT: [[ADD:%.*]] = fadd float [[CONV]], [[CONV2]]
+; CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <8 x half> [[A]], i64 2
+; CHECK-NEXT: [[CONV4:%.*]] = fpext half [[VECEXT3]] to float
+; CHECK-NEXT: [[ADD5:%.*]] = fadd float [[ADD]], [[CONV4]]
+; CHECK-NEXT: [[VECEXT6:%.*]] = extractelement <8 x half> [[A]], i64 3
+; CHECK-NEXT: [[CONV7:%.*]] = fpext half [[VECEXT6]] to float
+; CHECK-NEXT: [[ADD8:%.*]] = fadd float [[ADD5]], [[CONV7]]
+; CHECK-NEXT: [[VECEXT9:%.*]] = extractelement <8 x half> [[A]], i64 4
+; CHECK-NEXT: [[CONV10:%.*]] = fpext half [[VECEXT9]] to float
+; CHECK-NEXT: [[ADD11:%.*]] = fadd float [[ADD8]], [[CONV10]]
+; CHECK-NEXT: [[VECEXT12:%.*]] = extractelement <8 x half> [[A]], i64 5
+; CHECK-NEXT: [[CONV13:%.*]] = fpext half [[VECEXT12]] to float
+; CHECK-NEXT: [[ADD14:%.*]] = fadd float [[ADD11]], [[CONV13]]
+; CHECK-NEXT: [[VECEXT15:%.*]] = extractelement <8 x half> [[A]], i64 6
+; CHECK-NEXT: [[CONV16:%.*]] = fpext half [[VECEXT15]] to float
+; CHECK-NEXT: [[ADD17:%.*]] = fadd float [[ADD14]], [[CONV16]]
+; CHECK-NEXT: [[VECEXT18:%.*]] = extractelement <8 x half> [[A]], i64 7
+; CHECK-NEXT: [[CONV19:%.*]] = fpext half [[VECEXT18]] to float
+; CHECK-NEXT: [[ADD20:%.*]] = fadd float [[ADD17]], [[CONV19]]
+; CHECK-NEXT: [[CONV21:%.*]] = fptrunc float [[ADD20]] to half
+; CHECK-NEXT: ret half [[CONV21]]
+;
+entry:
+ %vecext = extractelement <8 x half> %a, i64 0
+ %conv = fpext half %vecext to float
+ %vecext1 = extractelement <8 x half> %a, i64 1
+ %conv2 = fpext half %vecext1 to float
+ %add = fadd float %conv, %conv2
+ %vecext3 = extractelement <8 x half> %a, i64 2
+ %conv4 = fpext half %vecext3 to float
+ %add5 = fadd float %add, %conv4
+ %vecext6 = extractelement <8 x half> %a, i64 3
+ %conv7 = fpext half %vecext6 to float
+ %add8 = fadd float %add5, %conv7
+ %vecext9 = extractelement <8 x half> %a, i64 4
+ %conv10 = fpext half %vecext9 to float
+ %add11 = fadd float %add8, %conv10
+ %vecext12 = extractelement <8 x half> %a, i64 5
+ %conv13 = fpext half %vecext12 to float
+ %add14 = fadd float %add11, %conv13
+ %vecext15 = extractelement <8 x half> %a, i64 6
+ %conv16 = fpext half %vecext15 to float
+ %add17 = fadd float %add14, %conv16
+ %vecext18 = extractelement <8 x half> %a, i64 7
+ %conv19 = fpext half %vecext18 to float
+ %add20 = fadd float %add17, %conv19
+ %conv21 = fptrunc float %add20 to half
+ ret half %conv21
+}
+
+define float @fp32_reduce(<4 x float> %a) {
+; CHECK-LABEL: define float @fp32_reduce(
+; CHECK-SAME: <4 x float> [[A:%.*]]) {
+; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[A]], i64 0
+; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <4 x float> [[A]], i64 1
+; CHECK-NEXT: [[ADD:%.*]] = fadd float [[VECEXT]], [[VECEXT1]]
+; CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <4 x float> [[A]], i64 2
+; CHECK-NEXT: [[ADD5:%.*]] = fadd float [[ADD]], [[VECEXT3]]
+; CHECK-NEXT: [[VECEXT6:%.*]] = extractelement <4 x float> [[A]], i64 3
+; CHECK-NEXT: [[ADD8:%.*]] = fadd float [[ADD5]], [[VECEXT6]]
+; CHECK-NEXT: ret float [[ADD8]]
+;
+ %vecext = extractelement <4 x float> %a, i64 0
+ %vecext1 = extractelement <4 x float> %a, i64 1
+ %add = fadd float %vecext, %vecext1
+ %vecext3 = extractelement <4 x float> %a, i64 2
+ %add5 = fadd float %add, %vecext3
+ %vecext6 = extractelement <4 x float> %a, i64 3
+ %add8 = fadd float %add5, %vecext6
+ ret float %add8
+}
+
+define double @float64_reduce(<2 x double> %a) {
+; CHECK-LABEL: define double @float64_reduce(
+; CHECK-SAME: <2 x double> [[A:%.*]]) {
+; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x double> [[A]], i64 0
+; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <2 x double> [[A]], i64 1
+; CHECK-NEXT: [[ADD:%.*]] = fadd double [[VECEXT]], [[VECEXT1]]
+; CHECK-NEXT: ret double [[ADD]]
+;
+ %vecext = extractelement <2 x double> %a, i64 0
+ %vecext1 = extractelement <2 x double> %a, i64 1
+ %add = fadd double %vecext, %vecext1
+ ret double %add
+}
+
+define float @fp32_reduce_fast(<4 x float> %a) {
+; CHECK-LABEL: define float @fp32_reduce_fast(
+; CHECK-SAME: <4 x float> [[A:%.*]]) {
+; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[A]], i64 0
+; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <4 x float> [[A]], i64 1
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[VECEXT]], [[VECEXT1]]
+; CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <4 x float> [[A]], i64 2
+; CHECK-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD]], [[VECEXT3]]
+; CHECK-NEXT: [[VECEXT6:%.*]] = extractelement <4 x float> [[A]], i64 3
+; CHECK-NEXT: [[ADD8:%.*]] = fadd fast float [[ADD5]], [[VECEXT6]]
+; CHECK-NEXT: ret float [[ADD8]]
+;
+ %vecext = extractelement <4 x float> %a, i64 0
+ %vecext1 = extractelement <4 x float> %a, i64 1
+ %add = fadd fast float %vecext, %vecext1
+ %vecext3 = extractelement <4 x float> %a, i64 2
+ %add5 = fadd fast float %add, %vecext3
+ %vecext6 = extractelement <4 x float> %a, i64 3
+ %add8 = fadd fast float %add5, %vecext6
+ ret float %add8
+}
+
+define float @fp32_reduce_mixed_fmf(<4 x float> %a) {
+; CHECK-LABEL: define float @fp32_reduce_mixed_fmf(
+; CHECK-SAME: <4 x float> [[A:%.*]]) {
+; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[A]], i64 0
+; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <4 x float> [[A]], i64 1
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[VECEXT]], [[VECEXT1]]
+; CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <4 x float> [[A]], i64 2
+; CHECK-NEXT: [[ADD5:%.*]] = fadd nsz float [[ADD]], [[VECEXT3]]
+; CHECK-NEXT: [[VECEXT6:%.*]] = extractelement <4 x float> [[A]], i64 3
+; CHECK-NEXT: [[ADD8:%.*]] = fadd fast float [[ADD5]], [[VECEXT6]]
+; CHECK-NEXT: ret float [[ADD8]]
+;
+ %vecext = extractelement <4 x float> %a, i64 0
+ %vecext1 = extractelement <4 x float> %a, i64 1
+ %add = fadd fast float %vecext, %vecext1
+ %vecext3 = extractelement <4 x float> %a, i64 2
+ %add5 = fadd nsz float %add, %vecext3
+ %vecext6 = extractelement <4 x float> %a, i64 3
+ %add8 = fadd fast float %add5, %vecext6
+ ret float %add8
+}
+
+; Negative test: the lanes must be added in-order.
+define float @fp32_reduce_out_of_order(<4 x float> %a) {
+; CHECK-LABEL: define float @fp32_reduce_out_of_order(
+; CHECK-SAME: <4 x float> [[A:%.*]]) {
+; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[A]], i64 0
+; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <4 x float> [[A]], i64 2
+; CHECK-NEXT: [[ADD:%.*]] = fadd float [[VECEXT]], [[VECEXT1]]
+; CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <4 x float> [[A]], i64 1
+; CHECK-NEXT: [[ADD5:%.*]] = fadd float [[ADD]], [[VECEXT3]]
+; CHECK-NEXT: [[VECEXT6:%.*]] = extractelement <4 x float> [[A]], i64 3
+; CHECK-NEXT: [[ADD8:%.*]] = fadd float [[ADD5]], [[VECEXT6]]
+; CHECK-NEXT: ret float [[ADD8]]
+;
+ %vecext = extractelement <4 x float> %a, i64 0
+ %vecext1 = extractelement <4 x float> %a, i64 2
+ %add = fadd float %vecext, %vecext1
+ %vecext3 = extractelement <4 x float> %a, i64 1
+ %add5 = fadd float %add, %vecext3
+ %vecext6 = extractelement <4 x float> %a, i64 3
+ %add8 = fadd float %add5, %vecext6
+ ret float %add8
+}
+
+define double @float64_reduce_multiple_use_of_last_add(<2 x double> %a) {
+; CHECK-LABEL: define double @float64_reduce_multiple_use_of_last_add(
+; CHECK-SAME: <2 x double> [[A:%.*]]) {
+; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x double> [[A]], i64 0
+; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <2 x double> [[A]], i64 1
+; CHECK-NEXT: [[ADD:%.*]] = fadd double [[VECEXT]], [[VECEXT1]]
+; CHECK-NEXT: [[SUB:%.*]] = fadd double [[ADD]], -1.000000e+00
+; CHECK-NEXT: ret double [[SUB]]
+;
+ %vecext = extractelement <2 x double> %a, i64 0
+ %vecext1 = extractelement <2 x double> %a, i64 1
+ %add = fadd double %vecext, %vecext1
+ %sub = fsub double %add, 1.0
+ ret double %sub
+}
+
+; Negative test: multiple users of an intermediate value.
+define float @fp32_reduce_multiple_use_of_intermediate_value(<4 x float> %a) {
+; CHECK-LABEL: define float @fp32_reduce_multiple_use_of_intermediate_value(
+; CHECK-SAME: <4 x float> [[A:%.*]]) {
+; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[A]], i64 0
+; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <4 x float> [[A]], i64 1
+; CHECK-NEXT: [[ADD:%.*]] = fadd float [[VECEXT]], [[VECEXT1]]
+; CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <4 x float> [[A]], i64 2
+; CHECK-NEXT: [[ADD5:%.*]] = fadd float [[ADD]], [[VECEXT3]]
+; CHECK-NEXT: [[VECEXT6:%.*]] = extractelement <4 x float> [[A]], i64 3
+; CHECK-NEXT: [[ADD8:%.*]] = fadd float [[ADD5]], [[VECEXT6]]
+; CHECK-NEXT: [[RES:%.*]] = fadd float [[ADD8]], [[ADD5]]
+; CHECK-NEXT: ret float [[RES]]
+;
+ %vecext = extractelement <4 x float> %a, i64 0
+ %vecext1 = extractelement <4 x float> %a, i64 1
+ %add = fadd float %vecext, %vecext1
+ %vecext3 = extractelement <4 x float> %a, i64 2
+ %add5 = fadd float %add, %vecext3
+ %vecext6 = extractelement <4 x float> %a, i64 3
+ %add8 = fadd float %add5, %vecext6
+ %res = fadd float %add8, %add5
+ ret float %res
+}
>From 17f38ab6e380b0b7b0580ff808e7071caf328fa0 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 6 Feb 2026 14:24:35 +0000
Subject: [PATCH 2/2] [InstCombine] Fold ordered fadd reductions to
`llvm.vector.reduce.fadd`
This pattern is intended to match trivial cases such as:
```
float16_t reduce_f16(float16x8_t a) {
return a[0] + a[1] + a[2] + a[3] + a[4] + a[5] + a[6] + a[7];
}
```
Which are currently handled poorly by SLP (at -O3), as SLP requires
reductions to be associative: https://godbolt.org/z/rPoY4xahs
---
llvm/include/llvm/IR/PatternMatch.h | 6 ++
.../InstCombine/InstCombineAddSub.cpp | 3 +
.../InstCombine/InstCombineCasts.cpp | 3 +
.../InstCombine/InstCombineInternal.h | 1 +
.../InstCombine/InstCombineVectorOps.cpp | 76 ++++++++++++++++++
.../InstCombine/fp-vector-reduce.ll | 79 +++++++------------
6 files changed, 117 insertions(+), 51 deletions(-)
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 3f56de3bf1fb0..ae6c9edbc1ff3 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -2362,6 +2362,12 @@ inline CastInst_match<OpTy, FPExtInst> m_FPExt(const OpTy &Op) {
return CastInst_match<OpTy, FPExtInst>(Op);
}
+template <typename OpTy>
+inline match_combine_or<CastInst_match<OpTy, FPExtInst>, OpTy>
+m_FPExtOrSelf(const OpTy &Op) {
+ return m_CombineOr(m_FPExt(Op), Op);
+}
+
//===----------------------------------------------------------------------===//
// Matchers for control flow.
//
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 8eeeccbc86523..bcbc638bd1c33 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1990,6 +1990,9 @@ static Instruction *factorizeFAddFSub(BinaryOperator &I,
}
Instruction *InstCombinerImpl::visitFAdd(BinaryOperator &I) {
+ if (Instruction *R = foldOrderedFloatingPointReduction(&I))
+ return R;
+
if (Value *V = simplifyFAddInst(I.getOperand(0), I.getOperand(1),
I.getFastMathFlags(),
SQ.getWithInstruction(&I)))
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 0cd2c09726a2d..144f612686e1f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -2048,6 +2048,9 @@ Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) {
if (Instruction *I = commonCastTransforms(FPT))
return I;
+ if (Instruction *I = foldOrderedFloatingPointReduction(&FPT))
+ return I;
+
// If we have fptrunc(OpI (fpextend x), (fpextend y)), we would like to
// simplify this expression to avoid one or more of the trunc/extend
// operations if we can do so without changing the numerical results.
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 8a478bf344536..4d00a6fb07bd3 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -398,6 +398,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
Instruction *matchSAddSubSat(IntrinsicInst &MinMax1);
Instruction *foldNot(BinaryOperator &I);
Instruction *foldBinOpOfDisplacedShifts(BinaryOperator &I);
+ Instruction *foldOrderedFloatingPointReduction(Instruction *I);
/// Determine if a pair of casts can be replaced by a single cast.
///
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 3b034f6c37f66..d114c1c1f3c12 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -324,6 +324,82 @@ Instruction *InstCombinerImpl::foldBitcastExtElt(ExtractElementInst &Ext) {
return nullptr;
}
+/// Folds a chain of vector extracts + fadds into an ordered floating-point
+/// reduction. This is intended to match trivial cases SLP currently misses
+/// without fast-math, which can occur from common patterns in user code (like
+/// `vec[0] + vec[1] + vec[2] + vec[3]`).
+///
+/// The reduction can include fpexts and a final fptrunc:
+/// ```
+/// %elt.0 = extractelement <8 x half> %vec, i64 0
+/// %ext.elt.0 = fpext half %elt.0 to float
+/// %acc.1 = fadd float %ext.elt.0, ...
+/// ...
+/// %elt.7 = extractelement <8 x half> %vec, i64 7
+/// %ext.elt.7 = fpext half %elt.7 to float
+/// %acc.7 = fadd float %acc.6, %ext.elt.7
+/// %result = fptrunc float %acc.7 to half
+/// ```
+///
+/// Or operate directly on the element type (without fpext/fptrunc):
+/// ```
+/// %elt.0 = extractelement <4 x float> %vec, i64 0
+/// %acc.1 = fadd float %elt.0, ...
+/// ...
+/// %elt.3 = extractelement <4 x float> %vec, i64 3
+/// %result = fadd float %acc.2, %elt.3
+/// ```
+///
+/// TODO: Support reductions other than fadd? (fmin/max)
+/// TODO: Support out-of-order extracts with an additional shufflevector?
+Instruction *
+InstCombinerImpl::foldOrderedFloatingPointReduction(Instruction *I) {
+ Value *Root = I;
+ if (auto *FPTrunc = dyn_cast<FPTruncInst>(Root))
+ Root = FPTrunc->getOperand(0);
+
+ // Match the final fadd in the chain.
+ Value *Vec, *Acc;
+ uint64_t ExtractIdx;
+ if (!match(Root, m_FAdd(m_Value(Acc),
+ m_FPExtOrSelf(m_ExtractElt(
+ m_Value(Vec), m_ConstantInt(ExtractIdx))))))
+ return nullptr;
+
+ auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
+ if (!VecTy || I->getType() != VecTy->getScalarType())
+ return nullptr;
+
+ unsigned NumElts = VecTy->getNumElements();
+ if (ExtractIdx != NumElts - 1)
+ return nullptr;
+
+ // Walk up all intermediate fadds until we find the first lane.
+ FastMathFlags FMF = cast<FPMathOperator>(Root)->getFastMathFlags();
+ for (int Idx = ExtractIdx - 1; Idx > 0; --Idx) {
+ Value *NextAcc;
+ if (!match(Acc,
+ m_OneUse(m_FAdd(m_Value(NextAcc),
+ m_FPExtOrSelf(m_ExtractElt(
+ m_Specific(Vec), m_SpecificInt(Idx)))))))
+ return nullptr;
+
+ FMF &= cast<FPMathOperator>(Acc)->getFastMathFlags();
+ Acc = NextAcc;
+ }
+
+ // Check the start value is the first element.
+ if (!match(Acc,
+ m_FPExtOrSelf(m_ExtractElt(m_Specific(Vec), m_SpecificInt(0)))))
+ return nullptr;
+
+ // Create the reduction propagating common fast-math flags.
+ auto *Reduce = Builder.CreateIntrinsic(
+ Intrinsic::vector_reduce_fadd, {VecTy},
+ {ConstantFP::get(VecTy->getScalarType(), -0.0), Vec}, FMF);
+ return replaceInstUsesWith(*I, Reduce);
+}
+
/// Find elements of V demanded by UserInstr. If returns false, we were not able
/// to determine all elements.
static bool findDemandedEltsBySingleUser(Value *V, Instruction *UserInstr,
diff --git a/llvm/test/Transforms/InstCombine/fp-vector-reduce.ll b/llvm/test/Transforms/InstCombine/fp-vector-reduce.ll
index 53dee189d758d..cad877d387d8e 100644
--- a/llvm/test/Transforms/InstCombine/fp-vector-reduce.ll
+++ b/llvm/test/Transforms/InstCombine/fp-vector-reduce.ll
@@ -5,30 +5,7 @@ define half @fp16_reduce(<8 x half> %a) {
; CHECK-LABEL: define half @fp16_reduce(
; CHECK-SAME: <8 x half> [[A:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <8 x half> [[A]], i64 0
-; CHECK-NEXT: [[CONV:%.*]] = fpext half [[VECEXT]] to float
-; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <8 x half> [[A]], i64 1
-; CHECK-NEXT: [[CONV2:%.*]] = fpext half [[VECEXT1]] to float
-; CHECK-NEXT: [[ADD:%.*]] = fadd float [[CONV]], [[CONV2]]
-; CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <8 x half> [[A]], i64 2
-; CHECK-NEXT: [[CONV4:%.*]] = fpext half [[VECEXT3]] to float
-; CHECK-NEXT: [[ADD5:%.*]] = fadd float [[ADD]], [[CONV4]]
-; CHECK-NEXT: [[VECEXT6:%.*]] = extractelement <8 x half> [[A]], i64 3
-; CHECK-NEXT: [[CONV7:%.*]] = fpext half [[VECEXT6]] to float
-; CHECK-NEXT: [[ADD8:%.*]] = fadd float [[ADD5]], [[CONV7]]
-; CHECK-NEXT: [[VECEXT9:%.*]] = extractelement <8 x half> [[A]], i64 4
-; CHECK-NEXT: [[CONV10:%.*]] = fpext half [[VECEXT9]] to float
-; CHECK-NEXT: [[ADD11:%.*]] = fadd float [[ADD8]], [[CONV10]]
-; CHECK-NEXT: [[VECEXT12:%.*]] = extractelement <8 x half> [[A]], i64 5
-; CHECK-NEXT: [[CONV13:%.*]] = fpext half [[VECEXT12]] to float
-; CHECK-NEXT: [[ADD14:%.*]] = fadd float [[ADD11]], [[CONV13]]
-; CHECK-NEXT: [[VECEXT15:%.*]] = extractelement <8 x half> [[A]], i64 6
-; CHECK-NEXT: [[CONV16:%.*]] = fpext half [[VECEXT15]] to float
-; CHECK-NEXT: [[ADD17:%.*]] = fadd float [[ADD14]], [[CONV16]]
-; CHECK-NEXT: [[VECEXT18:%.*]] = extractelement <8 x half> [[A]], i64 7
-; CHECK-NEXT: [[CONV19:%.*]] = fpext half [[VECEXT18]] to float
-; CHECK-NEXT: [[ADD20:%.*]] = fadd float [[ADD17]], [[CONV19]]
-; CHECK-NEXT: [[CONV21:%.*]] = fptrunc float [[ADD20]] to half
+; CHECK-NEXT: [[CONV21:%.*]] = call half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[A]])
; CHECK-NEXT: ret half [[CONV21]]
;
entry:
@@ -62,13 +39,7 @@ entry:
define float @fp32_reduce(<4 x float> %a) {
; CHECK-LABEL: define float @fp32_reduce(
; CHECK-SAME: <4 x float> [[A:%.*]]) {
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[A]], i64 0
-; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <4 x float> [[A]], i64 1
-; CHECK-NEXT: [[ADD:%.*]] = fadd float [[VECEXT]], [[VECEXT1]]
-; CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <4 x float> [[A]], i64 2
-; CHECK-NEXT: [[ADD5:%.*]] = fadd float [[ADD]], [[VECEXT3]]
-; CHECK-NEXT: [[VECEXT6:%.*]] = extractelement <4 x float> [[A]], i64 3
-; CHECK-NEXT: [[ADD8:%.*]] = fadd float [[ADD5]], [[VECEXT6]]
+; CHECK-NEXT: [[ADD8:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[A]])
; CHECK-NEXT: ret float [[ADD8]]
;
%vecext = extractelement <4 x float> %a, i64 0
@@ -84,9 +55,7 @@ define float @fp32_reduce(<4 x float> %a) {
define double @float64_reduce(<2 x double> %a) {
; CHECK-LABEL: define double @float64_reduce(
; CHECK-SAME: <2 x double> [[A:%.*]]) {
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x double> [[A]], i64 0
-; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <2 x double> [[A]], i64 1
-; CHECK-NEXT: [[ADD:%.*]] = fadd double [[VECEXT]], [[VECEXT1]]
+; CHECK-NEXT: [[ADD:%.*]] = call double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[A]])
; CHECK-NEXT: ret double [[ADD]]
;
%vecext = extractelement <2 x double> %a, i64 0
@@ -98,13 +67,7 @@ define double @float64_reduce(<2 x double> %a) {
define float @fp32_reduce_fast(<4 x float> %a) {
; CHECK-LABEL: define float @fp32_reduce_fast(
; CHECK-SAME: <4 x float> [[A:%.*]]) {
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[A]], i64 0
-; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <4 x float> [[A]], i64 1
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[VECEXT]], [[VECEXT1]]
-; CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <4 x float> [[A]], i64 2
-; CHECK-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD]], [[VECEXT3]]
-; CHECK-NEXT: [[VECEXT6:%.*]] = extractelement <4 x float> [[A]], i64 3
-; CHECK-NEXT: [[ADD8:%.*]] = fadd fast float [[ADD5]], [[VECEXT6]]
+; CHECK-NEXT: [[ADD8:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[A]])
; CHECK-NEXT: ret float [[ADD8]]
;
%vecext = extractelement <4 x float> %a, i64 0
@@ -120,13 +83,7 @@ define float @fp32_reduce_fast(<4 x float> %a) {
define float @fp32_reduce_mixed_fmf(<4 x float> %a) {
; CHECK-LABEL: define float @fp32_reduce_mixed_fmf(
; CHECK-SAME: <4 x float> [[A:%.*]]) {
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[A]], i64 0
-; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <4 x float> [[A]], i64 1
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[VECEXT]], [[VECEXT1]]
-; CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <4 x float> [[A]], i64 2
-; CHECK-NEXT: [[ADD5:%.*]] = fadd nsz float [[ADD]], [[VECEXT3]]
-; CHECK-NEXT: [[VECEXT6:%.*]] = extractelement <4 x float> [[A]], i64 3
-; CHECK-NEXT: [[ADD8:%.*]] = fadd fast float [[ADD5]], [[VECEXT6]]
+; CHECK-NEXT: [[ADD8:%.*]] = call nsz float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[A]])
; CHECK-NEXT: ret float [[ADD8]]
;
%vecext = extractelement <4 x float> %a, i64 0
@@ -165,9 +122,7 @@ define float @fp32_reduce_out_of_order(<4 x float> %a) {
define double @float64_reduce_multiple_use_of_last_add(<2 x double> %a) {
; CHECK-LABEL: define double @float64_reduce_multiple_use_of_last_add(
; CHECK-SAME: <2 x double> [[A:%.*]]) {
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x double> [[A]], i64 0
-; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <2 x double> [[A]], i64 1
-; CHECK-NEXT: [[ADD:%.*]] = fadd double [[VECEXT]], [[VECEXT1]]
+; CHECK-NEXT: [[ADD:%.*]] = call double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[A]])
; CHECK-NEXT: [[SUB:%.*]] = fadd double [[ADD]], -1.000000e+00
; CHECK-NEXT: ret double [[SUB]]
;
@@ -202,3 +157,25 @@ define float @fp32_reduce_multiple_use_of_intermediate_value(<4 x float> %a) {
%res = fadd float %add8, %add5
ret float %res
}
+
+; Negative test: final fptrunc does not match element type
+define float @fp16_reduce_mismatched_fptrunc(<8 x half> %a) {
+; CHECK-LABEL: define float @fp16_reduce_mismatched_fptrunc(
+; CHECK-SAME: <8 x half> [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <8 x half> [[A]], i64 0
+; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <8 x half> [[A]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = fpext half [[VECEXT]] to float
+; CHECK-NEXT: [[TMP1:%.*]] = fpext half [[VECEXT1]] to float
+; CHECK-NEXT: [[RES:%.*]] = fadd float [[TMP0]], [[TMP1]]
+; CHECK-NEXT: ret float [[RES]]
+;
+entry:
+ %vecext = extractelement <8 x half> %a, i64 0
+ %conv = fpext half %vecext to double
+ %vecext1 = extractelement <8 x half> %a, i64 1
+ %conv2 = fpext half %vecext1 to double
+ %add = fadd double %conv, %conv2
+ %res = fptrunc double %add to float
+ ret float %res
+}
More information about the llvm-commits
mailing list