[llvm] [DAGCombiner] Relax nsz constraint for FP optimizations (PR #165011)
Guy David via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 17 03:10:35 PST 2025
https://github.com/guy-david updated https://github.com/llvm/llvm-project/pull/165011
>From 3b586e042186aac40ececc6a1040835501caec3b Mon Sep 17 00:00:00 2001
From: Guy David <guyda96 at gmail.com>
Date: Fri, 24 Oct 2025 19:30:19 +0300
Subject: [PATCH] [DAGCombiner] Relax nsz constraint for FP optimizations
Some floating-point optimizations don't trigger because they can produce
incorrect results around signed zeros, so they rely on the presence of the
nsz flag, which commonly appears when fast-math is enabled.
However, this flag is not a hard requirement when all of the users of
the combined value are either guaranteed to overwrite the sign-bit or
simply ignore it (comparisons, etc.).
The optimizations affected:
- fadd x, +0.0 -> x
- fsub x, -0.0 -> x
- fsub +0.0, x -> fneg x
- fdiv(x, sqrt(x)) -> sqrt(x)
- frem lowering with power-of-2 divisors
---
llvm/include/llvm/CodeGen/SelectionDAG.h | 6 ++
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 17 +++--
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 40 +++++++++++
.../CodeGen/AArch64/ignore-signed-zero.ll | 72 +++++++++++++++++++
.../AMDGPU/fcanonicalize-elimination.ll | 2 +-
llvm/test/CodeGen/AMDGPU/swdev380865.ll | 5 +-
6 files changed, 132 insertions(+), 10 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/ignore-signed-zero.ll
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index b024e8a68bd6e..9dba2ee8692f5 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -2326,6 +2326,12 @@ class SelectionDAG {
/// +nan are considered positive, -0.0, -inf and -nan are not.
LLVM_ABI bool cannotBeOrderedNegativeFP(SDValue Op) const;
+ /// Check if a use of a float value is insensitive to signed zeros.
+ LLVM_ABI bool canIgnoreSignBitOfZero(const SDUse &Use) const;
+
+ /// Check if a value has at most two uses, all insensitive to signed zeros.
+ LLVM_ABI bool canIgnoreSignBitOfZero(SDValue Op) const;
+
/// Test whether two SDValues are known to compare equal. This
/// is true if they are the same value, or if one is negative zero and the
/// other positive zero.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c9513611e6dcb..c899d1288ad3c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17869,7 +17869,8 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
// N0 + -0.0 --> N0 (also allowed with +0.0 and fast-math)
ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1, true);
if (N1C && N1C->isZero())
- if (N1C->isNegative() || Flags.hasNoSignedZeros())
+ if (N1C->isNegative() || Flags.hasNoSignedZeros() ||
+ DAG.canIgnoreSignBitOfZero(SDValue(N, 0)))
return N0;
if (SDValue NewSel = foldBinOpIntoSelect(N))
@@ -18081,7 +18082,8 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
// (fsub A, 0) -> A
if (N1CFP && N1CFP->isZero()) {
- if (!N1CFP->isNegative() || Flags.hasNoSignedZeros()) {
+ if (!N1CFP->isNegative() || Flags.hasNoSignedZeros() ||
+ DAG.canIgnoreSignBitOfZero(SDValue(N, 0))) {
return N0;
}
}
@@ -18094,7 +18096,8 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
// (fsub -0.0, N1) -> -N1
if (N0CFP && N0CFP->isZero()) {
- if (N0CFP->isNegative() || Flags.hasNoSignedZeros()) {
+ if (N0CFP->isNegative() || Flags.hasNoSignedZeros() ||
+ DAG.canIgnoreSignBitOfZero(SDValue(N, 0))) {
// We cannot replace an FSUB(+-0.0,X) with FNEG(X) when denormals are
// flushed to zero, unless all users treat denorms as zero (DAZ).
// FIXME: This transform will change the sign of a NaN and the behavior
@@ -18744,7 +18747,8 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
}
// Fold X/Sqrt(X) -> Sqrt(X)
- if (Flags.hasNoSignedZeros() && Flags.hasAllowReassociation())
+ if ((Flags.hasNoSignedZeros() || DAG.canIgnoreSignBitOfZero(SDValue(N, 0))) &&
+ Flags.hasAllowReassociation())
if (N1.getOpcode() == ISD::FSQRT && N0 == N1.getOperand(0))
return N1;
@@ -18795,8 +18799,9 @@ SDValue DAGCombiner::visitFREM(SDNode *N) {
TLI.isOperationLegalOrCustom(ISD::FDIV, VT) &&
TLI.isOperationLegalOrCustom(ISD::FTRUNC, VT) &&
DAG.isKnownToBeAPowerOfTwoFP(N1)) {
- bool NeedsCopySign =
- !Flags.hasNoSignedZeros() && !DAG.cannotBeOrderedNegativeFP(N0);
+ bool NeedsCopySign = !Flags.hasNoSignedZeros() &&
+ !DAG.canIgnoreSignBitOfZero(SDValue(N, 0)) &&
+ !DAG.cannotBeOrderedNegativeFP(N0);
SDValue Div = DAG.getNode(ISD::FDIV, DL, VT, N0, N1);
SDValue Rnd = DAG.getNode(ISD::FTRUNC, DL, VT, Div);
SDValue MLA;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index c2b4c19846316..64fd925684ffa 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6118,6 +6118,46 @@ bool SelectionDAG::cannotBeOrderedNegativeFP(SDValue Op) const {
llvm_unreachable("covered opcode switch");
}
+bool SelectionDAG::canIgnoreSignBitOfZero(const SDUse &Use) const {
+ assert(Use.getValueType().isFloatingPoint());
+ const SDNode *User = Use.getUser();
+ unsigned OperandNo = Use.getOperandNo();
+ // Check if this use is insensitive to the sign of zero
+ switch (User->getOpcode()) {
+ case ISD::SETCC:
+ // Comparisons: IEEE-754 specifies +0.0 == -0.0.
+ case ISD::FABS:
+ // fabs always produces +0.0.
+ return true;
+ case ISD::FCOPYSIGN:
+ // copysign overwrites the sign bit of the first operand.
+ return OperandNo == 0;
+ case ISD::FADD:
+ case ISD::FSUB: {
+ // Arithmetic with non-zero constants fixes the uncertainty around the
+ // sign bit.
+ SDValue Other = User->getOperand(1 - OperandNo);
+ return isKnownNeverZeroFloat(Other);
+ }
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ // fp-to-int conversions normalize signed zeros.
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool SelectionDAG::canIgnoreSignBitOfZero(SDValue Op) const {
+ // FIXME: Limit the amount of checked uses to not introduce a compile-time
+ // regression. Ideally, this should be implemented as a demanded-bits
+ // optimization that stems from the users.
+ if (Op->use_size() > 2)
+ return false;
+ return all_of(Op->uses(),
+ [&](const SDUse &Use) { return canIgnoreSignBitOfZero(Use); });
+}
+
bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const {
// Check the obvious case.
if (A == B) return true;
diff --git a/llvm/test/CodeGen/AArch64/ignore-signed-zero.ll b/llvm/test/CodeGen/AArch64/ignore-signed-zero.ll
new file mode 100644
index 0000000000000..3b17e410ac380
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ignore-signed-zero.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
+
+; Test that nsz constraint can be bypassed when all uses are sign-insensitive.
+
+define i1 @test_fadd_neg_zero_fcmp(float %x) {
+; CHECK-LABEL: test_fadd_neg_zero_fcmp:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov s1, #1.00000000
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %add = fadd float %x, -0.0
+ %cmp = fcmp oeq float %add, 1.0
+ ret i1 %cmp
+}
+
+define float @test_fsub_zero_fabs(float %x) {
+; CHECK-LABEL: test_fsub_zero_fabs:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fabs s0, s0
+; CHECK-NEXT: ret
+ %sub = fsub float %x, 0.0
+ %abs = call float @llvm.fabs.f32(float %sub)
+ ret float %abs
+}
+
+define float @test_fsub_neg_zero_copysign(float %x, float %y) {
+; CHECK-LABEL: test_fsub_neg_zero_copysign:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mvni v2.4s, #128, lsl #24
+; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
+; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT: ret
+ %sub = fsub float -0.0, %x
+ %copysign = call float @llvm.copysign.f32(float %sub, float %y)
+ ret float %copysign
+}
+
+define i1 @test_div_sqrt_fcmp(float %x) {
+; CHECK-LABEL: test_div_sqrt_fcmp:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fsqrt s0, s0
+; CHECK-NEXT: fcmp s0, #0.0
+; CHECK-NEXT: cset w0, gt
+; CHECK-NEXT: ret
+ %sqrt = call float @llvm.sqrt.f32(float %x)
+ %div = fdiv reassoc float %x, %sqrt
+ %cmp = fcmp ogt float %div, 0.0
+ ret i1 %cmp
+}
+
+define float @test_frem_fabs(float %x) {
+; CHECK-LABEL: test_frem_fabs:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov s1, #0.50000000
+; CHECK-NEXT: fmov s2, #-2.00000000
+; CHECK-NEXT: fmul s1, s0, s1
+; CHECK-NEXT: frintz s1, s1
+; CHECK-NEXT: fmadd s0, s1, s2, s0
+; CHECK-NEXT: fabs s0, s0
+; CHECK-NEXT: ret
+ %rem = frem float %x, 2.0
+ %abs = call float @llvm.fabs.f32(float %rem)
+ ret float %abs
+}
+
+declare float @llvm.fabs.f32(float)
+declare float @llvm.copysign.f32(float, float)
+declare float @llvm.sqrt.f32(float)
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
index 05d3e9c381910..1b8ff6b688c19 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -371,7 +371,7 @@ define amdgpu_kernel void @test_fold_canonicalize_fabs_value_f32(ptr addrspace(1
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
%load = load float, ptr addrspace(1) %gep, align 4
- %v0 = fadd float %load, 0.0
+ %v0 = fadd float %load, 1.0
%v = tail call float @llvm.fabs.f32(float %v0)
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
store float %canonicalized, ptr addrspace(1) %gep, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/swdev380865.ll b/llvm/test/CodeGen/AMDGPU/swdev380865.ll
index d4a8a0d762afd..1130c465c15e3 100644
--- a/llvm/test/CodeGen/AMDGPU/swdev380865.ll
+++ b/llvm/test/CodeGen/AMDGPU/swdev380865.ll
@@ -28,14 +28,13 @@ define amdgpu_kernel void @_Z6kernelILi4000ELi1EEvPd(ptr addrspace(1) %x.coerce)
; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: .LBB0_1: ; %for.cond4.preheader
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], 0
; CHECK-NEXT: s_mov_b32 s6, 0
; CHECK-NEXT: s_mov_b32 s7, 0x40140000
-; CHECK-NEXT: s_add_i32 s1, s1, s0
-; CHECK-NEXT: s_cmpk_lt_i32 s1, 0xa00
; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[6:7]
; CHECK-NEXT: s_mov_b32 s6, 0
; CHECK-NEXT: s_mov_b32 s7, 0x40180000
+; CHECK-NEXT: s_add_i32 s1, s1, s0
+; CHECK-NEXT: s_cmpk_lt_i32 s1, 0xa00
; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[6:7]
; CHECK-NEXT: s_mov_b32 s6, 0
; CHECK-NEXT: s_mov_b32 s7, 0x401c0000
More information about the llvm-commits
mailing list