[llvm-branch-commits] [llvm] [AArch64] Decompose FADDV with known zero elements (PR #167313)
Guy David via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Nov 10 05:35:45 PST 2025
https://github.com/guy-david created https://github.com/llvm/llvm-project/pull/167313
FADDV is matched into FADDPv4f32 + FADDPv2i32p, but this can be relaxed when one or more elements (usually the 4th) are known to be zero.
Before:
```
movi d1, #0000000000000000
mov v0.s[3], v1.s[0]
faddp v0.4s, v0.4s, v0.4s
faddp s0, v0.2s
```
After:
```
mov s1, v0.s[2]
faddp s0, v0.2s
fadd s0, s0, s1
```
When all of the elements are zero, the intrinsic now simply reduces into a constant instead of emitting two additions.
>From 7a1406efe148f888a784851bd4268d227041d588 Mon Sep 17 00:00:00 2001
From: Guy David <guyda96 at gmail.com>
Date: Sun, 9 Nov 2025 17:55:53 +0200
Subject: [PATCH] [AArch64] Decompose faddv with known zero elements
FADDV is matched into FADDPv4f32 + FADDPv2i32p, but this can be relaxed
when one or more elements (usually the 4th) are known to be zero.
Before:
movi d1, #0000000000000000
mov v0.s[3], v1.s[0]
faddp v0.4s, v0.4s, v0.4s
faddp s0, v0.2s
After:
mov s1, v0.s[2]
faddp s0, v0.2s
fadd s0, s0, s1
---
.../Target/AArch64/AArch64ISelLowering.cpp | 58 +++++++++++++
llvm/test/CodeGen/AArch64/faddv.ll | 82 +++++++++++++++++++
2 files changed, 140 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/faddv.ll
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 132afc27135e9..b4bf97e27bca4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22790,6 +22790,78 @@ static SDValue combineSVEBitSel(unsigned IID, SDNode *N, SelectionDAG &DAG) {
}
}
+/// Optimize patterns where we insert zeros into vector lanes before faddv.
+///
+/// FADDV is normally matched to FADDPv4f32 + FADDPv2i32p; when one or more
+/// lanes are known to be +0.0 the pairwise tree can be pruned. Returns the
+/// replacement value, or SDValue() to fall back to the default lowering.
+static SDValue tryCombineFADDVWithZero(SDNode *N, SelectionDAG &DAG) {
+  assert(getIntrinsicID(N) == Intrinsic::aarch64_neon_faddv &&
+         "Expected NEON faddv intrinsic");
+  SDLoc DL(N);
+  SDValue Vec = N->getOperand(1);
+  EVT VT = Vec.getValueType();
+  EVT EltVT = VT.getVectorElementType();
+  unsigned NumElts = VT.getVectorNumElements();
+  APInt DemandedElts = APInt::getAllOnes(NumElts);
+  APInt KnownZeroElts = DAG.computeVectorKnownZeroElements(Vec, DemandedElts);
+  unsigned NumZeroElts = KnownZeroElts.popcount();
+  // No element is known to be +0.0, fall back to the TableGen pattern.
+  if (NumZeroElts == 0)
+    return SDValue();
+  // All elements are +0.0, just return zero.
+  if (NumZeroElts == NumElts)
+    return DAG.getConstantFP(0.0, DL, EltVT);
+
+  // At least one element is +0.0, so it is worth decomposing the reduction
+  // into fadd's. FADDV is a pairwise reduction, so we need to respect the
+  // order of the elements in the vector.
+
+  // Check if we can output a signed zero.
+  // This avoids the scenario where all the added values are -0.0 except the
+  // +0.0 element we chose to ignore.
+  SDNodeFlags Flags = N->getFlags();
+  bool IsSignedZeroSafe = Flags.hasNoSignedZeros() ||
+                          DAG.allUsesSignedZeroInsensitive(SDValue(N, 0));
+  if (!IsSignedZeroSafe)
+    return SDValue();
+
+  // Extract all elements.
+  SmallVector<SDValue, 4> Elts;
+  for (unsigned I = 0; I < NumElts; I++)
+    Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vec,
+                               DAG.getConstant(I, DL, MVT::i64)));
+  // Perform pairwise reduction. The known-zero status must be tracked per
+  // round: after the first round the positions in Elts no longer correspond
+  // to the original vector lanes, so indexing KnownZeroElts directly would
+  // read stale lane information and drop live partial sums.
+  SmallVector<bool, 4> IsZero;
+  for (unsigned I = 0; I < NumElts; I++)
+    IsZero.push_back(KnownZeroElts[I]);
+  while (Elts.size() > 1) {
+    SmallVector<SDValue, 2> NewElts;
+    SmallVector<bool, 2> NewIsZero;
+    for (unsigned I = 0; I < Elts.size(); I += 2) {
+      if (!IsZero[I] && !IsZero[I + 1]) {
+        NewElts.push_back(
+            DAG.getNode(ISD::FADD, DL, EltVT, Elts[I], Elts[I + 1]));
+        NewIsZero.push_back(false);
+      } else if (IsZero[I]) {
+        // Also covers the both-zero case: the pair reduces to a value that
+        // is itself +0.0, which keeps Elts a power-of-two size so the next
+        // round never indexes past the end.
+        NewElts.push_back(Elts[I + 1]);
+        NewIsZero.push_back(IsZero[I + 1]);
+      } else {
+        NewElts.push_back(Elts[I]);
+        NewIsZero.push_back(false);
+      }
+    }
+    Elts = std::move(NewElts);
+    IsZero = std::move(NewIsZero);
+  }
+  return Elts[0];
+}
+
static SDValue performIntrinsicCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
@@ -22813,6 +22869,8 @@ static SDValue performIntrinsicCombine(SDNode *N,
return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
case Intrinsic::aarch64_neon_umaxv:
return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
+ case Intrinsic::aarch64_neon_faddv:
+ return tryCombineFADDVWithZero(N, DAG);
case Intrinsic::aarch64_neon_fmax:
return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
diff --git a/llvm/test/CodeGen/AArch64/faddv.ll b/llvm/test/CodeGen/AArch64/faddv.ll
new file mode 100644
index 0000000000000..e4a3781150cf7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/faddv.ll
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+
+; Test element at index 0 is zero.
+define float @test_v2f32_element_0_zero(<2 x float> %vec) {
+; CHECK-LABEL: test_v2f32_element_0_zero:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: mov s0, v0.s[1]
+; CHECK-NEXT: ret
+entry:
+ %with_zero = insertelement <2 x float> %vec, float 0.0, i64 0
+ %sum = call nsz float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> %with_zero)
+ ret float %sum
+}
+
+; Test element at index 3 is zero.
+define float @test_v4f32_element_3_zero(<4 x float> %vec) {
+; CHECK-LABEL: test_v4f32_element_3_zero:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov s1, v0.s[2]
+; CHECK-NEXT: faddp s0, v0.2s
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fabs s0, s0
+; CHECK-NEXT: ret
+entry:
+ %with_zero = insertelement <4 x float> %vec, float 0.0, i64 3
+ %sum = call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %with_zero)
+ %abs = call float @llvm.fabs.f32(float %sum)
+ ret float %abs
+}
+
+; Test elements at index 0 and 2 are zero; both live lanes (1 and 3) must
+; still be summed, not dropped.
+define float @test_v4f32_elements_0_2_zero(<4 x float> %vec) {
+; CHECK-LABEL: test_v4f32_elements_0_2_zero:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov s1, v0.s[3]
+; CHECK-NEXT: mov s0, v0.s[1]
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: fabs s0, s0
+; CHECK-NEXT: ret
+entry:
+ %zero1 = insertelement <4 x float> %vec, float 0.0, i64 0
+ %zero2 = insertelement <4 x float> %zero1, float 0.0, i64 2
+ %sum = call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %zero2)
+ %abs = call float @llvm.fabs.f32(float %sum)
+ ret float %abs
+}
+
+; Test all elements are zero.
+define float @test_v4f32_all_zero(<4 x float> %vec) {
+; CHECK-LABEL: test_v4f32_all_zero:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi d0, #0000000000000000
+; CHECK-NEXT: ret
+entry:
+ %zero1 = insertelement <4 x float> %vec, float 0.0, i64 0
+ %zero2 = insertelement <4 x float> %zero1, float 0.0, i64 1
+ %zero3 = insertelement <4 x float> %zero2, float 0.0, i64 2
+ %zero4 = insertelement <4 x float> %zero3, float 0.0, i64 3
+ %sum = call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %zero4)
+ ret float %sum
+}
+
+; Test element at index 0 is zero.
+define double @test_v2f64_element_0_zero(<2 x double> %vec) {
+; CHECK-LABEL: test_v2f64_element_0_zero:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov d0, v0.d[1]
+; CHECK-NEXT: fabs d0, d0
+; CHECK-NEXT: ret
+entry:
+ %with_zero = insertelement <2 x double> %vec, double 0.0, i64 0
+ %sum = call double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> %with_zero)
+ %abs = call double @llvm.fabs.f64(double %sum)
+ ret double %abs
+}
+
+declare float @llvm.fabs.f32(float)
+declare double @llvm.fabs.f64(double)
+
+declare float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float>)
+declare float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float>)
+declare double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double>)
More information about the llvm-branch-commits
mailing list