[llvm-branch-commits] [llvm] DAG: Fold copysign with a known signmask to a disjoint or (PR #167266)
Matt Arsenault via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Nov 10 08:11:08 PST 2025
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/167266
>From d6b054dc67ba829009cb588faa5c868ea13121b9 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 1 Aug 2024 15:11:59 +0400
Subject: [PATCH 1/3] DAG: Fold copysign with a known signmask to a disjoint or
If the sign bit is a computed sign mask (i.e., we know it's
either +0 or -0), turn this into a disjoint or. This pattern
appears in the pow implementations.
We also need to know the sign bit of the magnitude is 0 for
the or to be disjoint. Unfortunately the DAG's FP tracking is
weak and we did not have a way to check if the sign bit is known
0, so add something for that. Ideally we would get a complete
computeKnownFPClass implementation.
This is intended to help avoid the regression which caused
d3e7c4ce7a3d7 to be reverted.
---
llvm/include/llvm/CodeGen/SelectionDAG.h | 4 +++
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 20 +++++++++++++
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 28 +++++++++++++++++++
.../AMDGPU/copysign-simplify-demanded-bits.ll | 23 ++++++---------
.../AMDGPU/copysign-to-disjoint-or-combine.ll | 22 ++++-----------
5 files changed, 67 insertions(+), 30 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 0dd4f23c6d85f..5b331e4444915 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -2072,6 +2072,10 @@ class SelectionDAG {
/// We use this predicate to simplify operations downstream.
LLVM_ABI bool SignBitIsZero(SDValue Op, unsigned Depth = 0) const;
+ /// Return true if the sign bit of Op is known to be zero, for a
+ /// floating-point value.
+ LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth = 0) const;
+
/// Return true if 'Op & Mask' is known to be zero. We
/// use this predicate to simplify operations downstream. Op and Mask are
/// known to be the same type.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f144f17d5a8f2..4f2eb1e64dbe0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18863,6 +18863,26 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
+ if (VT != N1.getValueType())
+ return SDValue();
+
+ // If this is equivalent to a disjoint or, replace it with one. This can
+ // happen if the sign operand is a sign mask (i.e., x << sign_bit_position).
+ if (DAG.SignBitIsZeroFP(N0) &&
+ DAG.computeKnownBits(N1).Zero.isMaxSignedValue()) {
+ // TODO: Just directly match the shift pattern. computeKnownBits is heavy
+ // for a such a narrowly targeted case.
+ EVT IntVT = VT.changeTypeToInteger();
+ // TODO: It appears to be profitable in some situations to unconditionally
+ // emit a fabs(n0) to perform this combine.
+ SDValue CastSrc0 = DAG.getNode(ISD::BITCAST, DL, IntVT, N0);
+ SDValue CastSrc1 = DAG.getNode(ISD::BITCAST, DL, IntVT, N1);
+
+ SDValue SignOr = DAG.getNode(ISD::OR, DL, IntVT, CastSrc0, CastSrc1,
+ SDNodeFlags::Disjoint);
+ return DAG.getNode(ISD::BITCAST, DL, VT, SignOr);
+ }
+
return SDValue();
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 80bbfea7fb83c..d7b3b03deb757 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2920,6 +2920,34 @@ bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const {
return MaskedValueIsZero(Op, APInt::getSignMask(BitWidth), Depth);
}
+/// Return true if the sign bit of the floating-point value \p Op is known to
+/// be zero. Only a small set of opcodes is recognized; every other opcode
+/// conservatively yields false. \p Depth is the current recursion depth and
+/// the search gives up (returning false) once it reaches MaxRecursionDepth.
+bool SelectionDAG::SignBitIsZeroFP(SDValue Op, unsigned Depth) const {
+  if (Depth >= MaxRecursionDepth)
+    return false; // Limit search depth.
+
+  unsigned Opc = Op.getOpcode();
+  switch (Opc) {
+  case ISD::FABS:
+    // fabs always produces a value with a clear sign bit.
+    return true;
+  case ISD::AssertNoFPClass: {
+    FPClassTest NoFPClass =
+        static_cast<FPClassTest>(Op.getConstantOperandVal(1));
+
+    // The sign bit is only known clear when the assertion excludes both NaNs
+    // and every negative class.
+    const FPClassTest TestMask = fcNan | fcNegative;
+    return (NoFPClass & TestMask) == TestMask;
+  }
+  case ISD::ARITH_FENCE:
+    // Look through the fence to the value it wraps. Recursing on Op itself
+    // would only revisit this node until the depth limit and return false.
+    return SignBitIsZeroFP(Op.getOperand(0), Depth + 1);
+  case ISD::FEXP:
+  case ISD::FEXP2:
+  case ISD::FEXP10:
+    // Treated as sign-bit-zero only when the node carries the no-NaNs flag.
+    return Op->getFlags().hasNoNaNs();
+  default:
+    return false;
+  }
+
+  llvm_unreachable("covered opcode switch");
+}
+
/// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero. We use
/// this predicate to simplify operations downstream. Mask is known to be zero
/// for bits that V cannot have.
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index 0be2b5c70c93b..ef676ddc8070e 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -345,15 +345,13 @@ define float @test_copysign_pow_fast_f32__integral_y(float %x, i32 %y.i) {
; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
; GFX9-NEXT: v_fma_f32 v2, v2, v1, v3
-; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX9-NEXT: v_exp_f32_e32 v2, v2
+; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1
; GFX9-NEXT: v_not_b32_e32 v3, 63
; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1
; GFX9-NEXT: v_ldexp_f32 v2, v2, v3
-; GFX9-NEXT: v_and_b32_e32 v0, v1, v0
-; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_bfi_b32 v0, s4, v2, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1
+; GFX9-NEXT: v_and_or_b32 v0, v1, v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
%y = sitofp i32 %y.i to float
%y.fptosi = fptosi float %y to i32
@@ -379,7 +377,7 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-NEXT: v_writelane_b32 v43, s16, 15
+; GFX9-NEXT: v_writelane_b32 v43, s16, 14
; GFX9-NEXT: v_writelane_b32 v43, s30, 0
; GFX9-NEXT: v_writelane_b32 v43, s31, 1
; GFX9-NEXT: v_writelane_b32 v43, s34, 2
@@ -391,19 +389,18 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
; GFX9-NEXT: v_writelane_b32 v43, s48, 8
; GFX9-NEXT: v_writelane_b32 v43, s49, 9
; GFX9-NEXT: v_writelane_b32 v43, s50, 10
-; GFX9-NEXT: v_writelane_b32 v43, s51, 11
; GFX9-NEXT: s_addk_i32 s32, 0x800
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: v_writelane_b32 v43, s52, 12
+; GFX9-NEXT: v_writelane_b32 v43, s51, 11
; GFX9-NEXT: v_mov_b32_e32 v42, v1
-; GFX9-NEXT: v_writelane_b32 v43, s53, 13
+; GFX9-NEXT: v_writelane_b32 v43, s52, 12
; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42
; GFX9-NEXT: s_getpc_b64 s[16:17]
; GFX9-NEXT: s_add_u32 s16, s16, _Z4log2d@rel32@lo+4
; GFX9-NEXT: s_addc_u32 s17, s17, _Z4log2d@rel32@hi+12
-; GFX9-NEXT: v_writelane_b32 v43, s54, 14
+; GFX9-NEXT: v_writelane_b32 v43, s53, 13
; GFX9-NEXT: v_mov_b32_e32 v40, v31
; GFX9-NEXT: v_mov_b32_e32 v41, v2
; GFX9-NEXT: s_mov_b32 s50, s15
@@ -414,7 +411,6 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
; GFX9-NEXT: s_mov_b64 s[36:37], s[8:9]
; GFX9-NEXT: s_mov_b64 s[38:39], s[6:7]
; GFX9-NEXT: s_mov_b64 s[48:49], s[4:5]
-; GFX9-NEXT: s_brev_b32 s54, -2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT: v_cvt_f64_i32_e32 v[2:3], v41
; GFX9-NEXT: s_getpc_b64 s[16:17]
@@ -436,8 +432,7 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
-; GFX9-NEXT: v_bfi_b32 v1, s54, v1, v2
-; GFX9-NEXT: v_readlane_b32 s54, v43, 14
+; GFX9-NEXT: v_or_b32_e32 v1, v1, v2
; GFX9-NEXT: v_readlane_b32 s53, v43, 13
; GFX9-NEXT: v_readlane_b32 s52, v43, 12
; GFX9-NEXT: v_readlane_b32 s51, v43, 11
@@ -453,7 +448,7 @@ define double @test_pow_fast_f64integral_y(double %x, i32 %y.i) #0 {
; GFX9-NEXT: v_readlane_b32 s31, v43, 1
; GFX9-NEXT: v_readlane_b32 s30, v43, 0
; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: v_readlane_b32 s4, v43, 15
+; GFX9-NEXT: v_readlane_b32 s4, v43, 14
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll b/llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll
index b99b64316b62f..afd610f4911c6 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-to-disjoint-or-combine.ll
@@ -81,8 +81,7 @@ define half @copysign_known_signmask_f16_known_positive_mag(half nofpclass(nan n
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 15, v1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
%signmask = shl i16 %sign, 15
%signmask.bitcast = bitcast i16 %signmask to half
@@ -94,9 +93,7 @@ define float @copysign_known_signmask_f32_known_positive_mag(float nofpclass(nan
; GFX9-LABEL: copysign_known_signmask_f32_known_positive_mag:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1
-; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT: v_lshl_or_b32 v0, v1, 31, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%signmask = shl i32 %sign, 31
%signmask.bitcast = bitcast i32 %signmask to float
@@ -109,8 +106,7 @@ define double @copysign_known_signmask_f64_known_positive_mag(double nofpclass(n
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 31, v2
-; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2
+; GFX9-NEXT: v_or_b32_e32 v1, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
%signmask = shl i64 %sign, 63
%signmask.bitcast = bitcast i64 %signmask to double
@@ -130,11 +126,9 @@ define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp(float %x,
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
; GFX9-NEXT: v_exp_f32_e32 v0, v0
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1
-; GFX9-NEXT: s_brev_b32 s4, -2
; GFX9-NEXT: v_mul_f32_e32 v2, 0x114b4ea4, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT: v_lshl_or_b32 v0, v1, 31, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%signbit.known.zero = call nnan afn float @llvm.exp.f32(float %x)
%signmask = shl i32 %sign, 31
@@ -155,10 +149,8 @@ define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp2(float %x
; GFX9-NEXT: v_exp_f32_e32 v0, v0
; GFX9-NEXT: v_not_b32_e32 v2, 63
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1
; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT: v_lshl_or_b32 v0, v1, 31, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%signbit.known.zero = call nnan afn float @llvm.exp2.f32(float %x)
%signmask = shl i32 %sign, 31
@@ -179,10 +171,8 @@ define float @copysign_known_signmask_f32_known_positive_mag__nnan_exp10(float %
; GFX9-NEXT: v_exp_f32_e32 v0, v0
; GFX9-NEXT: v_not_b32_e32 v2, 63
; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 31, v1
; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
-; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
+; GFX9-NEXT: v_lshl_or_b32 v0, v1, 31, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%signbit.known.zero = call nnan afn float @llvm.exp2.f32(float %x)
%signmask = shl i32 %sign, 31
>From 36287b03c05043a5de5949537fc1b9f7272692f7 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sun, 9 Nov 2025 16:36:33 -0800
Subject: [PATCH 2/3] Match shift to signbit pattern instead of
computeKnownBits
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4f2eb1e64dbe0..ef554ff48f849 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18866,13 +18866,19 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
if (VT != N1.getValueType())
return SDValue();
+ if (!APFloat::hasSignBitInMSB(VT.getFltSemantics()))
+ return SDValue();
+
+ unsigned SignBit = VT.getScalarSizeInBits() - 1;
+
// If this is equivalent to a disjoint or, replace it with one. This can
// happen if the sign operand is a sign mask (i.e., x << sign_bit_position).
- if (DAG.SignBitIsZeroFP(N0) &&
- DAG.computeKnownBits(N1).Zero.isMaxSignedValue()) {
- // TODO: Just directly match the shift pattern. computeKnownBits is heavy
- // for a such a narrowly targeted case.
+ if (sd_match(N1, m_BitCast(m_Shl(m_Value(), m_SpecificInt(SignBit)))) &&
+ DAG.SignBitIsZeroFP(N0)) {
+ // This could equivalently be computeKnownBits(N1).Zero.isMaxSignedValue(),
+ // but computeKnownBits is heavy for a such a narrowly targeted case.
EVT IntVT = VT.changeTypeToInteger();
+
// TODO: It appears to be profitable in some situations to unconditionally
// emit a fabs(n0) to perform this combine.
SDValue CastSrc0 = DAG.getNode(ISD::BITCAST, DL, IntVT, N0);
>From ed01dc96d2a177cbb77db44c6895071141c79535 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sun, 9 Nov 2025 21:15:49 -0800
Subject: [PATCH 3/3] Revert "Match shift to signbit pattern instead of
computeKnownBits"
This reverts commit 49e2c3aa7a861fc8864c2d045b3804e31e1f13cc.
One case is slightly more sophisticated.
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 14 ++++----------
1 file changed, 4 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index ef554ff48f849..4f2eb1e64dbe0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18866,19 +18866,13 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
if (VT != N1.getValueType())
return SDValue();
- if (!APFloat::hasSignBitInMSB(VT.getFltSemantics()))
- return SDValue();
-
- unsigned SignBit = VT.getScalarSizeInBits() - 1;
-
// If this is equivalent to a disjoint or, replace it with one. This can
// happen if the sign operand is a sign mask (i.e., x << sign_bit_position).
- if (sd_match(N1, m_BitCast(m_Shl(m_Value(), m_SpecificInt(SignBit)))) &&
- DAG.SignBitIsZeroFP(N0)) {
- // This could equivalently be computeKnownBits(N1).Zero.isMaxSignedValue(),
- // but computeKnownBits is heavy for a such a narrowly targeted case.
+ if (DAG.SignBitIsZeroFP(N0) &&
+ DAG.computeKnownBits(N1).Zero.isMaxSignedValue()) {
+ // TODO: Just directly match the shift pattern. computeKnownBits is heavy
+ // for a such a narrowly targeted case.
EVT IntVT = VT.changeTypeToInteger();
-
// TODO: It appears to be profitable in some situations to unconditionally
// emit a fabs(n0) to perform this combine.
SDValue CastSrc0 = DAG.getNode(ISD::BITCAST, DL, IntVT, N0);
More information about the llvm-branch-commits
mailing list