[llvm] [InstCombine] Fold ordered fadd reductions to `llvm.vector.reduce.fadd` (PR #180196)
Benjamin Maxwell via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 6 06:27:11 PST 2026
https://github.com/MacDue created https://github.com/llvm/llvm-project/pull/180196
This pattern is intended to match trivial cases such as:
```
float16_t reduce_f16(float16x8_t a) {
return a[0] + a[1] + a[2] + a[3] + a[4] + a[5] + a[6] + a[7];
}
```
Which are currently handled poorly by SLP (at -O3), as SLP requires reductions to be associative: https://godbolt.org/z/rPoY4xahs
>From d9ee97b78639c369f099f17809c907c10078dac7 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 6 Feb 2026 14:12:20 +0000
Subject: [PATCH 1/2] Precommit tests
---
.../InstCombine/fp-vector-reduce.ll | 204 ++++++++++++++++++
1 file changed, 204 insertions(+)
create mode 100644 llvm/test/Transforms/InstCombine/fp-vector-reduce.ll
diff --git a/llvm/test/Transforms/InstCombine/fp-vector-reduce.ll b/llvm/test/Transforms/InstCombine/fp-vector-reduce.ll
new file mode 100644
index 0000000000000..53dee189d758d
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/fp-vector-reduce.ll
@@ -0,0 +1,204 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+define half @fp16_reduce(<8 x half> %a) {
+; CHECK-LABEL: define half @fp16_reduce(
+; CHECK-SAME: <8 x half> [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <8 x half> [[A]], i64 0
+; CHECK-NEXT: [[CONV:%.*]] = fpext half [[VECEXT]] to float
+; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <8 x half> [[A]], i64 1
+; CHECK-NEXT: [[CONV2:%.*]] = fpext half [[VECEXT1]] to float
+; CHECK-NEXT: [[ADD:%.*]] = fadd float [[CONV]], [[CONV2]]
+; CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <8 x half> [[A]], i64 2
+; CHECK-NEXT: [[CONV4:%.*]] = fpext half [[VECEXT3]] to float
+; CHECK-NEXT: [[ADD5:%.*]] = fadd float [[ADD]], [[CONV4]]
+; CHECK-NEXT: [[VECEXT6:%.*]] = extractelement <8 x half> [[A]], i64 3
+; CHECK-NEXT: [[CONV7:%.*]] = fpext half [[VECEXT6]] to float
+; CHECK-NEXT: [[ADD8:%.*]] = fadd float [[ADD5]], [[CONV7]]
+; CHECK-NEXT: [[VECEXT9:%.*]] = extractelement <8 x half> [[A]], i64 4
+; CHECK-NEXT: [[CONV10:%.*]] = fpext half [[VECEXT9]] to float
+; CHECK-NEXT: [[ADD11:%.*]] = fadd float [[ADD8]], [[CONV10]]
+; CHECK-NEXT: [[VECEXT12:%.*]] = extractelement <8 x half> [[A]], i64 5
+; CHECK-NEXT: [[CONV13:%.*]] = fpext half [[VECEXT12]] to float
+; CHECK-NEXT: [[ADD14:%.*]] = fadd float [[ADD11]], [[CONV13]]
+; CHECK-NEXT: [[VECEXT15:%.*]] = extractelement <8 x half> [[A]], i64 6
+; CHECK-NEXT: [[CONV16:%.*]] = fpext half [[VECEXT15]] to float
+; CHECK-NEXT: [[ADD17:%.*]] = fadd float [[ADD14]], [[CONV16]]
+; CHECK-NEXT: [[VECEXT18:%.*]] = extractelement <8 x half> [[A]], i64 7
+; CHECK-NEXT: [[CONV19:%.*]] = fpext half [[VECEXT18]] to float
+; CHECK-NEXT: [[ADD20:%.*]] = fadd float [[ADD17]], [[CONV19]]
+; CHECK-NEXT: [[CONV21:%.*]] = fptrunc float [[ADD20]] to half
+; CHECK-NEXT: ret half [[CONV21]]
+;
+entry:
+ %vecext = extractelement <8 x half> %a, i64 0
+ %conv = fpext half %vecext to float
+ %vecext1 = extractelement <8 x half> %a, i64 1
+ %conv2 = fpext half %vecext1 to float
+ %add = fadd float %conv, %conv2
+ %vecext3 = extractelement <8 x half> %a, i64 2
+ %conv4 = fpext half %vecext3 to float
+ %add5 = fadd float %add, %conv4
+ %vecext6 = extractelement <8 x half> %a, i64 3
+ %conv7 = fpext half %vecext6 to float
+ %add8 = fadd float %add5, %conv7
+ %vecext9 = extractelement <8 x half> %a, i64 4
+ %conv10 = fpext half %vecext9 to float
+ %add11 = fadd float %add8, %conv10
+ %vecext12 = extractelement <8 x half> %a, i64 5
+ %conv13 = fpext half %vecext12 to float
+ %add14 = fadd float %add11, %conv13
+ %vecext15 = extractelement <8 x half> %a, i64 6
+ %conv16 = fpext half %vecext15 to float
+ %add17 = fadd float %add14, %conv16
+ %vecext18 = extractelement <8 x half> %a, i64 7
+ %conv19 = fpext half %vecext18 to float
+ %add20 = fadd float %add17, %conv19
+ %conv21 = fptrunc float %add20 to half
+ ret half %conv21
+}
+
+define float @fp32_reduce(<4 x float> %a) {
+; CHECK-LABEL: define float @fp32_reduce(
+; CHECK-SAME: <4 x float> [[A:%.*]]) {
+; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[A]], i64 0
+; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <4 x float> [[A]], i64 1
+; CHECK-NEXT: [[ADD:%.*]] = fadd float [[VECEXT]], [[VECEXT1]]
+; CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <4 x float> [[A]], i64 2
+; CHECK-NEXT: [[ADD5:%.*]] = fadd float [[ADD]], [[VECEXT3]]
+; CHECK-NEXT: [[VECEXT6:%.*]] = extractelement <4 x float> [[A]], i64 3
+; CHECK-NEXT: [[ADD8:%.*]] = fadd float [[ADD5]], [[VECEXT6]]
+; CHECK-NEXT: ret float [[ADD8]]
+;
+ %vecext = extractelement <4 x float> %a, i64 0
+ %vecext1 = extractelement <4 x float> %a, i64 1
+ %add = fadd float %vecext, %vecext1
+ %vecext3 = extractelement <4 x float> %a, i64 2
+ %add5 = fadd float %add, %vecext3
+ %vecext6 = extractelement <4 x float> %a, i64 3
+ %add8 = fadd float %add5, %vecext6
+ ret float %add8
+}
+
+define double @float64_reduce(<2 x double> %a) {
+; CHECK-LABEL: define double @float64_reduce(
+; CHECK-SAME: <2 x double> [[A:%.*]]) {
+; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x double> [[A]], i64 0
+; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <2 x double> [[A]], i64 1
+; CHECK-NEXT: [[ADD:%.*]] = fadd double [[VECEXT]], [[VECEXT1]]
+; CHECK-NEXT: ret double [[ADD]]
+;
+ %vecext = extractelement <2 x double> %a, i64 0
+ %vecext1 = extractelement <2 x double> %a, i64 1
+ %add = fadd double %vecext, %vecext1
+ ret double %add
+}
+
+define float @fp32_reduce_fast(<4 x float> %a) {
+; CHECK-LABEL: define float @fp32_reduce_fast(
+; CHECK-SAME: <4 x float> [[A:%.*]]) {
+; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[A]], i64 0
+; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <4 x float> [[A]], i64 1
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[VECEXT]], [[VECEXT1]]
+; CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <4 x float> [[A]], i64 2
+; CHECK-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD]], [[VECEXT3]]
+; CHECK-NEXT: [[VECEXT6:%.*]] = extractelement <4 x float> [[A]], i64 3
+; CHECK-NEXT: [[ADD8:%.*]] = fadd fast float [[ADD5]], [[VECEXT6]]
+; CHECK-NEXT: ret float [[ADD8]]
+;
+ %vecext = extractelement <4 x float> %a, i64 0
+ %vecext1 = extractelement <4 x float> %a, i64 1
+ %add = fadd fast float %vecext, %vecext1
+ %vecext3 = extractelement <4 x float> %a, i64 2
+ %add5 = fadd fast float %add, %vecext3
+ %vecext6 = extractelement <4 x float> %a, i64 3
+ %add8 = fadd fast float %add5, %vecext6
+ ret float %add8
+}
+
+define float @fp32_reduce_mixed_fmf(<4 x float> %a) {
+; CHECK-LABEL: define float @fp32_reduce_mixed_fmf(
+; CHECK-SAME: <4 x float> [[A:%.*]]) {
+; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[A]], i64 0
+; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <4 x float> [[A]], i64 1
+; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[VECEXT]], [[VECEXT1]]
+; CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <4 x float> [[A]], i64 2
+; CHECK-NEXT: [[ADD5:%.*]] = fadd nsz float [[ADD]], [[VECEXT3]]
+; CHECK-NEXT: [[VECEXT6:%.*]] = extractelement <4 x float> [[A]], i64 3
+; CHECK-NEXT: [[ADD8:%.*]] = fadd fast float [[ADD5]], [[VECEXT6]]
+; CHECK-NEXT: ret float [[ADD8]]
+;
+ %vecext = extractelement <4 x float> %a, i64 0
+ %vecext1 = extractelement <4 x float> %a, i64 1
+ %add = fadd fast float %vecext, %vecext1
+ %vecext3 = extractelement <4 x float> %a, i64 2
+ %add5 = fadd nsz float %add, %vecext3
+ %vecext6 = extractelement <4 x float> %a, i64 3
+ %add8 = fadd fast float %add5, %vecext6
+ ret float %add8
+}
+
+; Negative test: the lanes must be added in-order.
+define float @fp32_reduce_out_of_order(<4 x float> %a) {
+; CHECK-LABEL: define float @fp32_reduce_out_of_order(
+; CHECK-SAME: <4 x float> [[A:%.*]]) {
+; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[A]], i64 0
+; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <4 x float> [[A]], i64 2
+; CHECK-NEXT: [[ADD:%.*]] = fadd float [[VECEXT]], [[VECEXT1]]
+; CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <4 x float> [[A]], i64 1
+; CHECK-NEXT: [[ADD5:%.*]] = fadd float [[ADD]], [[VECEXT3]]
+; CHECK-NEXT: [[VECEXT6:%.*]] = extractelement <4 x float> [[A]], i64 3
+; CHECK-NEXT: [[ADD8:%.*]] = fadd float [[ADD5]], [[VECEXT6]]
+; CHECK-NEXT: ret float [[ADD8]]
+;
+ %vecext = extractelement <4 x float> %a, i64 0
+ %vecext1 = extractelement <4 x float> %a, i64 2
+ %add = fadd float %vecext, %vecext1
+ %vecext3 = extractelement <4 x float> %a, i64 1
+ %add5 = fadd float %add, %vecext3
+ %vecext6 = extractelement <4 x float> %a, i64 3
+ %add8 = fadd float %add5, %vecext6
+ ret float %add8
+}
+
+define double @float64_reduce_multiple_use_of_last_add(<2 x double> %a) {
+; CHECK-LABEL: define double @float64_reduce_multiple_use_of_last_add(
+; CHECK-SAME: <2 x double> [[A:%.*]]) {
+; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x double> [[A]], i64 0
+; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <2 x double> [[A]], i64 1
+; CHECK-NEXT: [[ADD:%.*]] = fadd double [[VECEXT]], [[VECEXT1]]
+; CHECK-NEXT: [[SUB:%.*]] = fadd double [[ADD]], -1.000000e+00
+; CHECK-NEXT: ret double [[SUB]]
+;
+ %vecext = extractelement <2 x double> %a, i64 0
+ %vecext1 = extractelement <2 x double> %a, i64 1
+ %add = fadd double %vecext, %vecext1
+ %sub = fsub double %add, 1.0
+ ret double %sub
+}
+
+; Negative test: multiple users of an intermediate value.
+define float @fp32_reduce_multiple_use_of_intermediate_value(<4 x float> %a) {
+; CHECK-LABEL: define float @fp32_reduce_multiple_use_of_intermediate_value(
+; CHECK-SAME: <4 x float> [[A:%.*]]) {
+; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[A]], i64 0
+; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <4 x float> [[A]], i64 1
+; CHECK-NEXT: [[ADD:%.*]] = fadd float [[VECEXT]], [[VECEXT1]]
+; CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <4 x float> [[A]], i64 2
+; CHECK-NEXT: [[ADD5:%.*]] = fadd float [[ADD]], [[VECEXT3]]
+; CHECK-NEXT: [[VECEXT6:%.*]] = extractelement <4 x float> [[A]], i64 3
+; CHECK-NEXT: [[ADD8:%.*]] = fadd float [[ADD5]], [[VECEXT6]]
+; CHECK-NEXT: [[RES:%.*]] = fadd float [[ADD8]], [[ADD5]]
+; CHECK-NEXT: ret float [[RES]]
+;
+ %vecext = extractelement <4 x float> %a, i64 0
+ %vecext1 = extractelement <4 x float> %a, i64 1
+ %add = fadd float %vecext, %vecext1
+ %vecext3 = extractelement <4 x float> %a, i64 2
+ %add5 = fadd float %add, %vecext3
+ %vecext6 = extractelement <4 x float> %a, i64 3
+ %add8 = fadd float %add5, %vecext6
+ %res = fadd float %add8, %add5
+ ret float %res
+}
>From 17f38ab6e380b0b7b0580ff808e7071caf328fa0 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 6 Feb 2026 14:24:35 +0000
Subject: [PATCH 2/2] [InstCombine] Fold ordered fadd reductions to
`llvm.vector.reduce.fadd`
This pattern is intended to match trivial cases such as:
```
float16_t reduce_f16(float16x8_t a) {
return a[0] + a[1] + a[2] + a[3] + a[4] + a[5] + a[6] + a[7];
}
```
Which are currently handled poorly by SLP (at -O3), as SLP requires
reductions to be associative: https://godbolt.org/z/rPoY4xahs
---
llvm/include/llvm/IR/PatternMatch.h | 6 ++
.../InstCombine/InstCombineAddSub.cpp | 3 +
.../InstCombine/InstCombineCasts.cpp | 3 +
.../InstCombine/InstCombineInternal.h | 1 +
.../InstCombine/InstCombineVectorOps.cpp | 76 ++++++++++++++++++
.../InstCombine/fp-vector-reduce.ll | 79 +++++++------------
6 files changed, 117 insertions(+), 51 deletions(-)
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 3f56de3bf1fb0..ae6c9edbc1ff3 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -2362,6 +2362,12 @@ inline CastInst_match<OpTy, FPExtInst> m_FPExt(const OpTy &Op) {
return CastInst_match<OpTy, FPExtInst>(Op);
}
+template <typename OpTy>
+inline match_combine_or<CastInst_match<OpTy, FPExtInst>, OpTy>
+m_FPExtOrSelf(const OpTy &Op) {
+ return m_CombineOr(m_FPExt(Op), Op);
+}
+
//===----------------------------------------------------------------------===//
// Matchers for control flow.
//
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 8eeeccbc86523..bcbc638bd1c33 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1990,6 +1990,9 @@ static Instruction *factorizeFAddFSub(BinaryOperator &I,
}
Instruction *InstCombinerImpl::visitFAdd(BinaryOperator &I) {
+ if (Instruction *R = foldOrderedFloatingPointReduction(&I))
+ return R;
+
if (Value *V = simplifyFAddInst(I.getOperand(0), I.getOperand(1),
I.getFastMathFlags(),
SQ.getWithInstruction(&I)))
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 0cd2c09726a2d..144f612686e1f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -2048,6 +2048,9 @@ Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) {
if (Instruction *I = commonCastTransforms(FPT))
return I;
+ if (Instruction *I = foldOrderedFloatingPointReduction(&FPT))
+ return I;
+
// If we have fptrunc(OpI (fpextend x), (fpextend y)), we would like to
// simplify this expression to avoid one or more of the trunc/extend
// operations if we can do so without changing the numerical results.
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 8a478bf344536..4d00a6fb07bd3 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -398,6 +398,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
Instruction *matchSAddSubSat(IntrinsicInst &MinMax1);
Instruction *foldNot(BinaryOperator &I);
Instruction *foldBinOpOfDisplacedShifts(BinaryOperator &I);
+ Instruction *foldOrderedFloatingPointReduction(Instruction *I);
/// Determine if a pair of casts can be replaced by a single cast.
///
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 3b034f6c37f66..d114c1c1f3c12 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -324,6 +324,82 @@ Instruction *InstCombinerImpl::foldBitcastExtElt(ExtractElementInst &Ext) {
return nullptr;
}
+/// Folds a chain of vector extracts + fadds into an ordered floating-point
+/// reduction. This is intended to match trivial cases SLP currently misses
+/// without fast-math, which can occur from common patterns in user code (like
+/// `vec[0] + vec[1] + vec[2] + vec[3]`).
+///
+/// The reduction can include fpexts and a final fptrunc:
+/// ```
+/// %elt.0 = extractelement <8 x half> %vec, i64 0
+/// %ext.elt.0 = fpext half %elt.0 to float
+/// %acc.1 = fadd float %ext.elt.0, ...
+/// ...
+/// %elt.7 = extractelement <8 x half> %vec, i64 7
+/// %ext.elt.7 = fpext half %elt.7 to float
+/// %acc.7 = fadd float %acc.6, %ext.elt.7
+/// %result = fptrunc float %acc.7 to half
+/// ```
+///
+/// Or operate directly on the element type (without fpext/fptrunc):
+/// ```
+/// %elt.0 = extractelement <4 x float> %vec, i64 0
+/// %acc.1 = fadd float %elt.0, ...
+/// ...
+/// %elt.3 = extractelement <4 x float> %vec, i64 3
+/// %result = fadd float %acc.2, %elt.3
+/// ```
+///
+/// TODO: Support reductions other than fadd? (fmin/max)
+/// TODO: Support out-of-order extracts with an additional shufflevector?
+Instruction *
+InstCombinerImpl::foldOrderedFloatingPointReduction(Instruction *I) {
+ Value *Root = I;
+ if (auto *FPTrunc = dyn_cast<FPTruncInst>(Root))
+ Root = FPTrunc->getOperand(0);
+
+ // Match the final fadd in the chain.
+ Value *Vec, *Acc;
+ uint64_t ExtractIdx;
+ if (!match(Root, m_FAdd(m_Value(Acc),
+ m_FPExtOrSelf(m_ExtractElt(
+ m_Value(Vec), m_ConstantInt(ExtractIdx))))))
+ return nullptr;
+
+ auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
+ if (!VecTy || I->getType() != VecTy->getScalarType())
+ return nullptr;
+
+ unsigned NumElts = VecTy->getNumElements();
+ if (ExtractIdx != NumElts - 1)
+ return nullptr;
+
+ // Walk up all intermediate fadds until we find the first lane.
+ FastMathFlags FMF = cast<FPMathOperator>(Root)->getFastMathFlags();
+ for (int Idx = ExtractIdx - 1; Idx > 0; --Idx) {
+ Value *NextAcc;
+ if (!match(Acc,
+ m_OneUse(m_FAdd(m_Value(NextAcc),
+ m_FPExtOrSelf(m_ExtractElt(
+ m_Specific(Vec), m_SpecificInt(Idx)))))))
+ return nullptr;
+
+ FMF &= cast<FPMathOperator>(Acc)->getFastMathFlags();
+ Acc = NextAcc;
+ }
+
+ // Check the start value is the first element.
+ if (!match(Acc,
+ m_FPExtOrSelf(m_ExtractElt(m_Specific(Vec), m_SpecificInt(0)))))
+ return nullptr;
+
+ // Create the reduction propagating common fast-math flags.
+ auto *Reduce = Builder.CreateIntrinsic(
+ Intrinsic::vector_reduce_fadd, {VecTy},
+ {ConstantFP::get(VecTy->getScalarType(), -0.0), Vec}, FMF);
+ return replaceInstUsesWith(*I, Reduce);
+}
+
/// Find elements of V demanded by UserInstr. If returns false, we were not able
/// to determine all elements.
static bool findDemandedEltsBySingleUser(Value *V, Instruction *UserInstr,
diff --git a/llvm/test/Transforms/InstCombine/fp-vector-reduce.ll b/llvm/test/Transforms/InstCombine/fp-vector-reduce.ll
index 53dee189d758d..cad877d387d8e 100644
--- a/llvm/test/Transforms/InstCombine/fp-vector-reduce.ll
+++ b/llvm/test/Transforms/InstCombine/fp-vector-reduce.ll
@@ -5,30 +5,7 @@ define half @fp16_reduce(<8 x half> %a) {
; CHECK-LABEL: define half @fp16_reduce(
; CHECK-SAME: <8 x half> [[A:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <8 x half> [[A]], i64 0
-; CHECK-NEXT: [[CONV:%.*]] = fpext half [[VECEXT]] to float
-; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <8 x half> [[A]], i64 1
-; CHECK-NEXT: [[CONV2:%.*]] = fpext half [[VECEXT1]] to float
-; CHECK-NEXT: [[ADD:%.*]] = fadd float [[CONV]], [[CONV2]]
-; CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <8 x half> [[A]], i64 2
-; CHECK-NEXT: [[CONV4:%.*]] = fpext half [[VECEXT3]] to float
-; CHECK-NEXT: [[ADD5:%.*]] = fadd float [[ADD]], [[CONV4]]
-; CHECK-NEXT: [[VECEXT6:%.*]] = extractelement <8 x half> [[A]], i64 3
-; CHECK-NEXT: [[CONV7:%.*]] = fpext half [[VECEXT6]] to float
-; CHECK-NEXT: [[ADD8:%.*]] = fadd float [[ADD5]], [[CONV7]]
-; CHECK-NEXT: [[VECEXT9:%.*]] = extractelement <8 x half> [[A]], i64 4
-; CHECK-NEXT: [[CONV10:%.*]] = fpext half [[VECEXT9]] to float
-; CHECK-NEXT: [[ADD11:%.*]] = fadd float [[ADD8]], [[CONV10]]
-; CHECK-NEXT: [[VECEXT12:%.*]] = extractelement <8 x half> [[A]], i64 5
-; CHECK-NEXT: [[CONV13:%.*]] = fpext half [[VECEXT12]] to float
-; CHECK-NEXT: [[ADD14:%.*]] = fadd float [[ADD11]], [[CONV13]]
-; CHECK-NEXT: [[VECEXT15:%.*]] = extractelement <8 x half> [[A]], i64 6
-; CHECK-NEXT: [[CONV16:%.*]] = fpext half [[VECEXT15]] to float
-; CHECK-NEXT: [[ADD17:%.*]] = fadd float [[ADD14]], [[CONV16]]
-; CHECK-NEXT: [[VECEXT18:%.*]] = extractelement <8 x half> [[A]], i64 7
-; CHECK-NEXT: [[CONV19:%.*]] = fpext half [[VECEXT18]] to float
-; CHECK-NEXT: [[ADD20:%.*]] = fadd float [[ADD17]], [[CONV19]]
-; CHECK-NEXT: [[CONV21:%.*]] = fptrunc float [[ADD20]] to half
+; CHECK-NEXT: [[CONV21:%.*]] = call half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[A]])
; CHECK-NEXT: ret half [[CONV21]]
;
entry:
@@ -62,13 +39,7 @@ entry:
define float @fp32_reduce(<4 x float> %a) {
; CHECK-LABEL: define float @fp32_reduce(
; CHECK-SAME: <4 x float> [[A:%.*]]) {
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[A]], i64 0
-; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <4 x float> [[A]], i64 1
-; CHECK-NEXT: [[ADD:%.*]] = fadd float [[VECEXT]], [[VECEXT1]]
-; CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <4 x float> [[A]], i64 2
-; CHECK-NEXT: [[ADD5:%.*]] = fadd float [[ADD]], [[VECEXT3]]
-; CHECK-NEXT: [[VECEXT6:%.*]] = extractelement <4 x float> [[A]], i64 3
-; CHECK-NEXT: [[ADD8:%.*]] = fadd float [[ADD5]], [[VECEXT6]]
+; CHECK-NEXT: [[ADD8:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[A]])
; CHECK-NEXT: ret float [[ADD8]]
;
%vecext = extractelement <4 x float> %a, i64 0
@@ -84,9 +55,7 @@ define float @fp32_reduce(<4 x float> %a) {
define double @float64_reduce(<2 x double> %a) {
; CHECK-LABEL: define double @float64_reduce(
; CHECK-SAME: <2 x double> [[A:%.*]]) {
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x double> [[A]], i64 0
-; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <2 x double> [[A]], i64 1
-; CHECK-NEXT: [[ADD:%.*]] = fadd double [[VECEXT]], [[VECEXT1]]
+; CHECK-NEXT: [[ADD:%.*]] = call double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[A]])
; CHECK-NEXT: ret double [[ADD]]
;
%vecext = extractelement <2 x double> %a, i64 0
@@ -98,13 +67,7 @@ define double @float64_reduce(<2 x double> %a) {
define float @fp32_reduce_fast(<4 x float> %a) {
; CHECK-LABEL: define float @fp32_reduce_fast(
; CHECK-SAME: <4 x float> [[A:%.*]]) {
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[A]], i64 0
-; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <4 x float> [[A]], i64 1
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[VECEXT]], [[VECEXT1]]
-; CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <4 x float> [[A]], i64 2
-; CHECK-NEXT: [[ADD5:%.*]] = fadd fast float [[ADD]], [[VECEXT3]]
-; CHECK-NEXT: [[VECEXT6:%.*]] = extractelement <4 x float> [[A]], i64 3
-; CHECK-NEXT: [[ADD8:%.*]] = fadd fast float [[ADD5]], [[VECEXT6]]
+; CHECK-NEXT: [[ADD8:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[A]])
; CHECK-NEXT: ret float [[ADD8]]
;
%vecext = extractelement <4 x float> %a, i64 0
@@ -120,13 +83,7 @@ define float @fp32_reduce_fast(<4 x float> %a) {
define float @fp32_reduce_mixed_fmf(<4 x float> %a) {
; CHECK-LABEL: define float @fp32_reduce_mixed_fmf(
; CHECK-SAME: <4 x float> [[A:%.*]]) {
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[A]], i64 0
-; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <4 x float> [[A]], i64 1
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[VECEXT]], [[VECEXT1]]
-; CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <4 x float> [[A]], i64 2
-; CHECK-NEXT: [[ADD5:%.*]] = fadd nsz float [[ADD]], [[VECEXT3]]
-; CHECK-NEXT: [[VECEXT6:%.*]] = extractelement <4 x float> [[A]], i64 3
-; CHECK-NEXT: [[ADD8:%.*]] = fadd fast float [[ADD5]], [[VECEXT6]]
+; CHECK-NEXT: [[ADD8:%.*]] = call nsz float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[A]])
; CHECK-NEXT: ret float [[ADD8]]
;
%vecext = extractelement <4 x float> %a, i64 0
@@ -165,9 +122,7 @@ define float @fp32_reduce_out_of_order(<4 x float> %a) {
define double @float64_reduce_multiple_use_of_last_add(<2 x double> %a) {
; CHECK-LABEL: define double @float64_reduce_multiple_use_of_last_add(
; CHECK-SAME: <2 x double> [[A:%.*]]) {
-; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x double> [[A]], i64 0
-; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <2 x double> [[A]], i64 1
-; CHECK-NEXT: [[ADD:%.*]] = fadd double [[VECEXT]], [[VECEXT1]]
+; CHECK-NEXT: [[ADD:%.*]] = call double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[A]])
; CHECK-NEXT: [[SUB:%.*]] = fadd double [[ADD]], -1.000000e+00
; CHECK-NEXT: ret double [[SUB]]
;
@@ -202,3 +157,25 @@ define float @fp32_reduce_multiple_use_of_intermediate_value(<4 x float> %a) {
%res = fadd float %add8, %add5
ret float %res
}
+
+; Negative test: final fptrunc does not match element type
+define float @fp16_reduce_mismatched_fptrunc(<8 x half> %a) {
+; CHECK-LABEL: define float @fp16_reduce_mismatched_fptrunc(
+; CHECK-SAME: <8 x half> [[A:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <8 x half> [[A]], i64 0
+; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <8 x half> [[A]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = fpext half [[VECEXT]] to float
+; CHECK-NEXT: [[TMP1:%.*]] = fpext half [[VECEXT1]] to float
+; CHECK-NEXT: [[RES:%.*]] = fadd float [[TMP0]], [[TMP1]]
+; CHECK-NEXT: ret float [[RES]]
+;
+entry:
+ %vecext = extractelement <8 x half> %a, i64 0
+ %conv = fpext half %vecext to double
+ %vecext1 = extractelement <8 x half> %a, i64 1
+ %conv2 = fpext half %vecext1 to double
+ %add = fadd double %conv, %conv2
+ %res = fptrunc double %add to float
+ ret float %res
+}
More information about the llvm-commits
mailing list