[llvm-branch-commits] [llvm] [DAGCombiner] Relax nsz constraint with fp->int->fp optimizations (PR #164503)
Guy David via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Oct 24 08:59:09 PDT 2025
https://github.com/guy-david updated https://github.com/llvm/llvm-project/pull/164503
From 7f65dea126ac725b2f7cde88784845a7eb518de5 Mon Sep 17 00:00:00 2001
From: Guy David <guyda96 at gmail.com>
Date: Wed, 22 Oct 2025 00:07:57 +0300
Subject: [PATCH] [DAGCombiner] Relax nsz constraint with fp->int->fp
optimizations
---
llvm/include/llvm/CodeGen/SelectionDAG.h | 4 +
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 20 ++-
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 29 ++++
llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll | 104 +++++++++++++
.../AMDGPU/select-fabs-fneg-extract.f16.ll | 64 +++-----
.../AMDGPU/select-fabs-fneg-extract.v2f16.ll | 50 +++---
llvm/test/CodeGen/X86/setoeq.ll | 142 +++---------------
7 files changed, 208 insertions(+), 205 deletions(-)
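
In short: the fptosi/sitofp (and fptoui/uitofp) round-trip fold no longer requires the global NoSignedZerosFPMath option when every user of the converted value is insensitive to the sign of a zero, and the fneg(fsub X, Y) -> fsub(Y, X) fold now also fires when either operand is known never to be zero. A minimal LLVM IR sketch of the first case, adapted from the new AArch64 test (the function name is illustrative): the round-trip feeds only an fcmp, which treats +0.0 and -0.0 as equal, so it can be lowered to a single truncating round (frintz on AArch64) even without nsz.

define i1 @roundtrip_only_feeds_fcmp(float %x) {
  %i = fptosi float %x to i32   ; fp -> int
  %f = sitofp i32 %i to float   ; int -> fp round-trip
  %cmp = fcmp oeq float %f, 0.0 ; sole user; IEEE 754 compares +0.0 == -0.0
  ret i1 %cmp
}
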
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index df6ce0fe1b037..a4ab3ef1de30c 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -2322,6 +2322,10 @@ class SelectionDAG {
/// +nan are considered positive, -0.0, -inf and -nan are not.
LLVM_ABI bool cannotBeOrderedNegativeFP(SDValue Op) const;
+ /// Check if all uses of a floating-point value are insensitive to signed
+ /// zeros.
+ LLVM_ABI bool allUsesSignedZeroInsensitive(SDValue Op) const;
+
/// Test whether two SDValues are known to compare equal. This
/// is true if they are the same value, or if one is negative zero and the
/// other positive zero.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 2372d7dfe7c3c..73aed33fe0838 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18891,12 +18891,13 @@ static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP;
assert(IsSigned || IsUnsigned);
- bool IsSignedZeroSafe = DAG.getTarget().Options.NoSignedZerosFPMath;
+ bool IsSignedZeroSafe = DAG.getTarget().Options.NoSignedZerosFPMath ||
+ DAG.allUsesSignedZeroInsensitive(SDValue(N, 0));
// For signed conversions: The optimization changes signed zero behavior.
if (IsSigned && !IsSignedZeroSafe)
return SDValue();
// For unsigned conversions, we need FABS to canonicalize -0.0 to +0.0
- // (unless NoSignedZerosFPMath is set).
+ // (unless producing a signed zero is acceptable).
if (IsUnsigned && !IsSignedZeroSafe && !TLI.isFAbsFree(VT))
return SDValue();
@@ -19375,10 +19376,17 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
// FIXME: This is duplicated in getNegatibleCost, but getNegatibleCost doesn't
// know it was called from a context with a nsz flag if the input fsub does
// not.
- if (N0.getOpcode() == ISD::FSUB && N->getFlags().hasNoSignedZeros() &&
- N0.hasOneUse()) {
- return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N0.getOperand(1),
- N0.getOperand(0));
+ if (N0.getOpcode() == ISD::FSUB && N0.hasOneUse()) {
+ SDValue X = N0.getOperand(0);
+ SDValue Y = N0.getOperand(1);
+
+ // Safe if NoSignedZeros, or if we can prove X != Y (avoiding the -0.0 vs
+ // +0.0 issue). For now, we use a conservative check: if either operand is
+ // known never zero, then X - Y cannot produce a signed zero from X == Y.
+ if (N->getFlags().hasNoSignedZeros() || DAG.isKnownNeverZeroFloat(X) ||
+ DAG.isKnownNeverZeroFloat(Y)) {
+ return DAG.getNode(ISD::FSUB, SDLoc(N), VT, Y, X);
+ }
}
if (SimplifyDemandedBits(SDValue(N, 0)))
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 379242ec5a157..61b70ffd26e2f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6075,6 +6075,35 @@ bool SelectionDAG::isKnownNeverZeroFloat(SDValue Op) const {
Op, [](ConstantFPSDNode *C) { return !C->isZero(); });
}
+bool SelectionDAG::allUsesSignedZeroInsensitive(SDValue Op) const {
+ assert(Op.getValueType().isFloatingPoint());
+ return all_of(Op->uses(), [&](SDUse &Use) {
+ SDNode *User = Use.getUser();
+ unsigned OperandNo = Use.getOperandNo();
+
+ // Check whether this use is insensitive to the sign of zero.
+ switch (User->getOpcode()) {
+ case ISD::SETCC:
+ // Comparisons: IEEE-754 specifies +0.0 == -0.0.
+ case ISD::FABS:
+ // fabs clears the sign bit, so both zeros become +0.0.
+ return true;
+ case ISD::FCOPYSIGN:
+ // copysign overwrites the sign bit of the first operand.
+ return OperandNo == 0;
+ case ISD::FADD:
+ case ISD::FSUB: {
+ // Adding or subtracting a value known to be non-zero gives the same
+ // result for +0.0 and -0.0, so the sign of the zero does not matter.
+ SDValue Other = User->getOperand(1 - OperandNo);
+ return isKnownNeverZeroFloat(Other);
+ }
+ default:
+ return false;
+ }
+ });
+}
+
bool SelectionDAG::isKnownNeverZero(SDValue Op, unsigned Depth) const {
if (Depth >= MaxRecursionDepth)
return false; // Limit search depth.
diff --git a/llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll b/llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll
index 9a8c555953611..cac155e256572 100644
--- a/llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll
@@ -134,7 +134,111 @@ entry:
ret float %f
}
+define i1 @test_fcmp(float %x) {
+; CHECK-LABEL: test_fcmp:
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintz s0, s0
+; CHECK-NEXT: fcmp s0, #0.0
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_fcmp:
+; NO-SIGNED-ZEROS: // %bb.0:
+; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
+; NO-SIGNED-ZEROS-NEXT: fcmp s0, #0.0
+; NO-SIGNED-ZEROS-NEXT: cset w0, eq
+; NO-SIGNED-ZEROS-NEXT: ret
+ %conv1 = fptosi float %x to i32
+ %conv2 = sitofp i32 %conv1 to float
+ %cmp = fcmp oeq float %conv2, 0.0
+ ret i1 %cmp
+}
+
+define float @test_fabs(float %x) {
+; CHECK-LABEL: test_fabs:
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintz s0, s0
+; CHECK-NEXT: fabs s0, s0
+; CHECK-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_fabs:
+; NO-SIGNED-ZEROS: // %bb.0:
+; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
+; NO-SIGNED-ZEROS-NEXT: fabs s0, s0
+; NO-SIGNED-ZEROS-NEXT: ret
+ %conv1 = fptosi float %x to i32
+ %conv2 = sitofp i32 %conv1 to float
+ %abs = call float @llvm.fabs.f32(float %conv2)
+ ret float %abs
+}
+
+define float @test_copysign(float %x, float %y) {
+; CHECK-LABEL: test_copysign:
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintz s0, s0
+; CHECK-NEXT: mvni v2.4s, #128, lsl #24
+; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
+; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_copysign:
+; NO-SIGNED-ZEROS: // %bb.0:
+; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
+; NO-SIGNED-ZEROS-NEXT: mvni v2.4s, #128, lsl #24
+; NO-SIGNED-ZEROS-NEXT: // kill: def $s1 killed $s1 def $q1
+; NO-SIGNED-ZEROS-NEXT: bif v0.16b, v1.16b, v2.16b
+; NO-SIGNED-ZEROS-NEXT: // kill: def $s0 killed $s0 killed $q0
+; NO-SIGNED-ZEROS-NEXT: ret
+ %conv1 = fptosi float %x to i32
+ %conv2 = sitofp i32 %conv1 to float
+ %combine = call float @llvm.copysign.f32(float %conv2, float %y)
+ ret float %combine
+}
+
+define float @test_fadd(float %x) {
+; CHECK-LABEL: test_fadd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintz s0, s0
+; CHECK-NEXT: fmov s1, #1.00000000
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_fadd:
+; NO-SIGNED-ZEROS: // %bb.0:
+; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
+; NO-SIGNED-ZEROS-NEXT: fmov s1, #1.00000000
+; NO-SIGNED-ZEROS-NEXT: fadd s0, s0, s1
+; NO-SIGNED-ZEROS-NEXT: ret
+ %conv1 = fptosi float %x to i32
+ %conv2 = sitofp i32 %conv1 to float
+ %add = fadd float %conv2, 1.0
+ ret float %add
+}
+
+define float @test_fsub(float %x) {
+; CHECK-LABEL: test_fsub:
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintz s0, s0
+; CHECK-NEXT: fmov s1, #-1.00000000
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_fsub:
+; NO-SIGNED-ZEROS: // %bb.0:
+; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
+; NO-SIGNED-ZEROS-NEXT: fmov s1, #-1.00000000
+; NO-SIGNED-ZEROS-NEXT: fadd s0, s0, s1
+; NO-SIGNED-ZEROS-NEXT: ret
+ %conv1 = fptosi float %x to i32
+ %conv2 = sitofp i32 %conv1 to float
+ %sub = fsub float %conv2, 1.0
+ ret float %sub
+}
+
declare i32 @llvm.smin.i32(i32, i32)
declare i32 @llvm.smax.i32(i32, i32)
declare i32 @llvm.umin.i32(i32, i32)
declare i32 @llvm.umax.i32(i32, i32)
+declare float @llvm.fabs.f32(float)
+declare float @llvm.copysign.f32(float, float)
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll
index 1222d0efd62bb..e6ec28b8ed5e8 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll
@@ -2615,65 +2615,43 @@ define half @select_fneg_posk_src_add_f16(i32 %c, half %x, half %y) {
}
define half @select_fneg_posk_src_sub_f16(i32 %c, half %x) {
-; CI-SAFE-LABEL: select_fneg_posk_src_sub_f16:
-; CI-SAFE: ; %bb.0:
-; CI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; CI-SAFE-NEXT: v_add_f32_e32 v1, -4.0, v1
-; CI-SAFE-NEXT: v_cndmask_b32_e64 v0, 2.0, -v1, vcc
-; CI-SAFE-NEXT: s_setpc_b64 s[30:31]
+; CI-LABEL: select_fneg_posk_src_sub_f16:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT: v_sub_f32_e32 v1, 4.0, v1
+; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v1, vcc
+; CI-NEXT: s_setpc_b64 s[30:31]
;
-; VI-SAFE-LABEL: select_fneg_posk_src_sub_f16:
-; VI-SAFE: ; %bb.0:
-; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-SAFE-NEXT: v_add_f16_e32 v1, -4.0, v1
-; VI-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; VI-SAFE-NEXT: v_mov_b32_e32 v2, 0x4000
-; VI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
+; VI-LABEL: select_fneg_posk_src_sub_f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_sub_f16_e32 v1, 4.0, v1
+; VI-NEXT: v_mov_b32_e32 v2, 0x4000
+; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SAFE-TRUE16-LABEL: select_fneg_posk_src_sub_f16:
; GFX11-SAFE-TRUE16: ; %bb.0:
; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e32 v1.l, -4.0, v1.l
; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v1.l
+; GFX11-SAFE-TRUE16-NEXT: v_sub_f16_e32 v0.l, 4.0, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo
; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SAFE-FAKE16-LABEL: select_fneg_posk_src_sub_f16:
; GFX11-SAFE-FAKE16: ; %bb.0:
; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e32 v1, -4.0, v1
+; GFX11-SAFE-FAKE16-NEXT: v_sub_f16_e32 v1, 4.0, v1
; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
-; CI-NSZ-LABEL: select_fneg_posk_src_sub_f16:
-; CI-NSZ: ; %bb.0:
-; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
-; CI-NSZ-NEXT: v_sub_f32_e32 v1, 4.0, v1
-; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v1, vcc
-; CI-NSZ-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-NSZ-LABEL: select_fneg_posk_src_sub_f16:
-; VI-NSZ: ; %bb.0:
-; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NSZ-NEXT: v_sub_f16_e32 v1, 4.0, v1
-; VI-NSZ-NEXT: v_mov_b32_e32 v2, 0x4000
-; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
-;
; GFX11-NSZ-TRUE16-LABEL: select_fneg_posk_src_sub_f16:
; GFX11-NSZ-TRUE16: ; %bb.0:
; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
index 9814ed80befbf..f654c8a855394 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
@@ -3277,39 +3277,29 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) {
; CI-LABEL: select_fneg_posk_src_sub_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; CI-NEXT: v_add_f32_e32 v3, -4.0, v3
-; CI-NEXT: v_add_f32_e32 v2, -4.0, v2
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; CI-NEXT: v_or_b32_e32 v2, v2, v3
-; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
-; CI-NEXT: v_cvt_f32_f16_e32 v3, v2
-; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v3, vcc
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_sub_f32_e32 v2, 4.0, v2
+; CI-NEXT: v_sub_f32_e32 v3, 4.0, v3
+; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v2, vcc
+; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: select_fneg_posk_src_sub_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; VI-NEXT: v_mov_b32_e32 v1, 0xc400
-; VI-NEXT: v_add_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_f16_e32 v2, -4.0, v2
-; VI-NEXT: v_or_b32_e32 v1, v2, v1
-; VI-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
-; VI-NEXT: v_mov_b32_e32 v2, 0x4000
+; VI-NEXT: v_mov_b32_e32 v1, 0x4400
+; VI-NEXT: v_sub_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_sub_f16_e32 v2, 4.0, v2
+; VI-NEXT: v_mov_b32_e32 v3, 0x4000
; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
-; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5]
-; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5]
+; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -3317,8 +3307,7 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT: v_pk_add_f16 v1, v2, -4.0 op_sel_hi:[1,0]
-; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
+; GFX9-NEXT: v_pk_add_f16 v1, v2, 4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
; GFX9-NEXT: v_mov_b32_e32 v2, 0x4000
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5]
@@ -3330,28 +3319,25 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) {
; GFX11-TRUE16-LABEL: select_fneg_posk_src_sub_v2f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, v2, 4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_xor_b32_e32 v0, 0x80008000, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v0.h, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: select_fneg_posk_src_sub_v2f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, v2, 4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq <2 x i32> %c, zeroinitializer
diff --git a/llvm/test/CodeGen/X86/setoeq.ll b/llvm/test/CodeGen/X86/setoeq.ll
index 131e279aa645c..8aebf8eaa62e7 100644
--- a/llvm/test/CodeGen/X86/setoeq.ll
+++ b/llvm/test/CodeGen/X86/setoeq.ll
@@ -18,8 +18,7 @@ define zeroext i8 @oeq_f64_i32(double %x) nounwind readnone {
; AVX-LABEL: oeq_f64_i32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vcvttpd2dq %xmm0, %xmm1
-; AVX-NEXT: vcvtdq2pd %xmm1, %xmm1
+; AVX-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1
; AVX-NEXT: vcmpeqsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: andl $1, %eax
@@ -29,8 +28,7 @@ define zeroext i8 @oeq_f64_i32(double %x) nounwind readnone {
; AVX512-LABEL: oeq_f64_i32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm1
-; AVX512-NEXT: vcvtdq2pd %xmm1, %xmm1
+; AVX512-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1
; AVX512-NEXT: vcmpeqsd %xmm0, %xmm1, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
@@ -67,16 +65,7 @@ define zeroext i8 @oeq_f64_u32(double %x) nounwind readnone {
; AVX-LABEL: oeq_f64_u32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vcvttsd2si %xmm0, %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: sarl $31, %ecx
-; AVX-NEXT: vsubsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
-; AVX-NEXT: vcvttsd2si %xmm1, %edx
-; AVX-NEXT: andl %ecx, %edx
-; AVX-NEXT: orl %eax, %edx
-; AVX-NEXT: vmovd %edx, %xmm1
-; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
-; AVX-NEXT: vsubsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
+; AVX-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1
; AVX-NEXT: vcmpeqsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: andl $1, %eax
@@ -86,8 +75,7 @@ define zeroext i8 @oeq_f64_u32(double %x) nounwind readnone {
; AVX512-LABEL: oeq_f64_u32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vcvttsd2usi %xmm0, %eax
-; AVX512-NEXT: vcvtusi2sd %eax, %xmm7, %xmm1
+; AVX512-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1
; AVX512-NEXT: vcmpeqsd %xmm0, %xmm1, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
@@ -131,35 +119,21 @@ define zeroext i8 @oeq_f64_i64(double %x) nounwind readnone {
;
; AVX-LABEL: oeq_f64_i64:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: pushl %ebp
-; AVX-NEXT: movl %esp, %ebp
-; AVX-NEXT: andl $-8, %esp
-; AVX-NEXT: subl $24, %esp
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovsd %xmm0, (%esp)
-; AVX-NEXT: fldl (%esp)
-; AVX-NEXT: fisttpll (%esp)
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; AVX-NEXT: fildll {{[0-9]+}}(%esp)
-; AVX-NEXT: fstpl {{[0-9]+}}(%esp)
-; AVX-NEXT: vcmpeqsd {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1
+; AVX-NEXT: vcmpeqsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: andl $1, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: movl %ebp, %esp
-; AVX-NEXT: popl %ebp
; AVX-NEXT: retl
;
; AVX512-LABEL: oeq_f64_i64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vcvttpd2qq %xmm0, %xmm1
-; AVX512-NEXT: vcvtqq2pd %ymm1, %ymm1
+; AVX512-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1
; AVX512-NEXT: vcmpeqsd %xmm0, %xmm1, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retl
entry:
%0 = fptosi double %x to i64
@@ -216,48 +190,21 @@ define zeroext i8 @oeq_f64_u64(double %x) nounwind readnone {
;
; AVX-LABEL: oeq_f64_u64:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: pushl %ebp
-; AVX-NEXT: movl %esp, %ebp
-; AVX-NEXT: andl $-8, %esp
-; AVX-NEXT: subl $8, %esp
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
-; AVX-NEXT: vucomisd %xmm0, %xmm1
-; AVX-NEXT: jbe .LBB3_2
-; AVX-NEXT: # %bb.1: # %entry
-; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX-NEXT: .LBB3_2: # %entry
-; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vmovsd %xmm1, (%esp)
-; AVX-NEXT: fldl (%esp)
-; AVX-NEXT: fisttpll (%esp)
-; AVX-NEXT: setbe %al
-; AVX-NEXT: movzbl %al, %eax
-; AVX-NEXT: shll $31, %eax
-; AVX-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
-; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-NEXT: vaddsd %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1
; AVX-NEXT: vcmpeqsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: andl $1, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: movl %ebp, %esp
-; AVX-NEXT: popl %ebp
; AVX-NEXT: retl
;
; AVX512-LABEL: oeq_f64_u64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vcvttpd2uqq %xmm0, %xmm1
-; AVX512-NEXT: vcvtuqq2pd %ymm1, %ymm1
+; AVX512-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1
; AVX512-NEXT: vcmpeqsd %xmm0, %xmm1, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retl
entry:
%0 = fptoui double %x to i64
@@ -282,8 +229,7 @@ define zeroext i8 @une_f64_i32(double %x) nounwind readnone {
; AVX-LABEL: une_f64_i32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vcvttpd2dq %xmm0, %xmm1
-; AVX-NEXT: vcvtdq2pd %xmm1, %xmm1
+; AVX-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1
; AVX-NEXT: vcmpneqsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: andl $1, %eax
@@ -293,8 +239,7 @@ define zeroext i8 @une_f64_i32(double %x) nounwind readnone {
; AVX512-LABEL: une_f64_i32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm1
-; AVX512-NEXT: vcvtdq2pd %xmm1, %xmm1
+; AVX512-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1
; AVX512-NEXT: vcmpneqsd %xmm0, %xmm1, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
@@ -331,16 +276,7 @@ define zeroext i8 @une_f64_u32(double %x) nounwind readnone {
; AVX-LABEL: une_f64_u32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vcvttsd2si %xmm0, %eax
-; AVX-NEXT: movl %eax, %ecx
-; AVX-NEXT: sarl $31, %ecx
-; AVX-NEXT: vsubsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
-; AVX-NEXT: vcvttsd2si %xmm1, %edx
-; AVX-NEXT: andl %ecx, %edx
-; AVX-NEXT: orl %eax, %edx
-; AVX-NEXT: vmovd %edx, %xmm1
-; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
-; AVX-NEXT: vsubsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
+; AVX-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1
; AVX-NEXT: vcmpneqsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: andl $1, %eax
@@ -350,8 +286,7 @@ define zeroext i8 @une_f64_u32(double %x) nounwind readnone {
; AVX512-LABEL: une_f64_u32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vcvttsd2usi %xmm0, %eax
-; AVX512-NEXT: vcvtusi2sd %eax, %xmm7, %xmm1
+; AVX512-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1
; AVX512-NEXT: vcmpneqsd %xmm0, %xmm1, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
@@ -395,35 +330,21 @@ define zeroext i8 @une_f64_i64(double %x) nounwind readnone {
;
; AVX-LABEL: une_f64_i64:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: pushl %ebp
-; AVX-NEXT: movl %esp, %ebp
-; AVX-NEXT: andl $-8, %esp
-; AVX-NEXT: subl $24, %esp
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovsd %xmm0, (%esp)
-; AVX-NEXT: fldl (%esp)
-; AVX-NEXT: fisttpll (%esp)
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; AVX-NEXT: fildll {{[0-9]+}}(%esp)
-; AVX-NEXT: fstpl {{[0-9]+}}(%esp)
-; AVX-NEXT: vcmpneqsd {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1
+; AVX-NEXT: vcmpneqsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: andl $1, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: movl %ebp, %esp
-; AVX-NEXT: popl %ebp
; AVX-NEXT: retl
;
; AVX512-LABEL: une_f64_i64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vcvttpd2qq %xmm0, %xmm1
-; AVX512-NEXT: vcvtqq2pd %ymm1, %ymm1
+; AVX512-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1
; AVX512-NEXT: vcmpneqsd %xmm0, %xmm1, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retl
entry:
%0 = fptosi double %x to i64
@@ -480,48 +401,21 @@ define zeroext i8 @une_f64_u64(double %x) nounwind readnone {
;
; AVX-LABEL: une_f64_u64:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: pushl %ebp
-; AVX-NEXT: movl %esp, %ebp
-; AVX-NEXT: andl $-8, %esp
-; AVX-NEXT: subl $8, %esp
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
-; AVX-NEXT: vucomisd %xmm0, %xmm1
-; AVX-NEXT: jbe .LBB7_2
-; AVX-NEXT: # %bb.1: # %entry
-; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; AVX-NEXT: .LBB7_2: # %entry
-; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vmovsd %xmm1, (%esp)
-; AVX-NEXT: fldl (%esp)
-; AVX-NEXT: fisttpll (%esp)
-; AVX-NEXT: setbe %al
-; AVX-NEXT: movzbl %al, %eax
-; AVX-NEXT: shll $31, %eax
-; AVX-NEXT: xorl {{[0-9]+}}(%esp), %eax
-; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
-; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-NEXT: vaddsd %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1
; AVX-NEXT: vcmpneqsd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: andl $1, %eax
; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: movl %ebp, %esp
-; AVX-NEXT: popl %ebp
; AVX-NEXT: retl
;
; AVX512-LABEL: une_f64_u64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vcvttpd2uqq %xmm0, %xmm1
-; AVX512-NEXT: vcvtuqq2pd %ymm1, %ymm1
+; AVX512-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1
; AVX512-NEXT: vcmpneqsd %xmm0, %xmm1, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retl
entry:
%0 = fptoui double %x to i64
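
The visitFNEG relaxation covers patterns like select_fneg_posk_src_sub_f16 above, where the subtracted operand is a non-zero constant. A rough LLVM IR sketch (assumed shape, not copied verbatim from the tests): since 4.0 is known never to be zero, -(x - 4.0) can be emitted as (4.0 - x) even without an nsz flag.

define half @fneg_of_fsub_nonzero_const(half %x) {
  %sub = fsub half %x, 0xH4400 ; x - 4.0
  %neg = fneg half %sub        ; now folds to fsub 4.0, x
  ret half %neg
}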