[llvm] [SelectionDAG] Handle `fneg`/`fabs`/`fcopysign` in `SimplifyDemandedBits` (PR #139239)
Iris Shi via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 11 06:53:16 PDT 2025
https://github.com/el-ev updated https://github.com/llvm/llvm-project/pull/139239
From 82336e5c43ed87eba02cd4a035233b33f176eb9e Mon Sep 17 00:00:00 2001
From: Iris Shi <0.0 at owo.li>
Date: Fri, 9 May 2025 18:01:25 +0800
Subject: [PATCH 1/4] [SelectionDAG] Handle `fneg`/`fabs`/`fcopysign` in `SimplifyDemandedBits`
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 29 +---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 71 +++++++++
.../CodeGen/AArch64/extract-vector-elt.ll | 5 +-
llvm/test/CodeGen/AMDGPU/bf16-conversions.ll | 38 ++---
llvm/test/CodeGen/AMDGPU/bf16.ll | 36 ++---
llvm/test/CodeGen/AMDGPU/fnearbyint.ll | 142 +++++++++---------
llvm/test/CodeGen/AMDGPU/rcp-pattern.ll | 14 +-
llvm/test/CodeGen/AMDGPU/roundeven.ll | 49 +++---
.../AMDGPU/select-fabs-fneg-extract.v2f16.ll | 20 +--
9 files changed, 228 insertions(+), 176 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index e79a17e86bc87..39c883148ad02 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18413,21 +18413,6 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
}
}
- // copysign(fabs(x), y) -> copysign(x, y)
- // copysign(fneg(x), y) -> copysign(x, y)
- // copysign(copysign(x,z), y) -> copysign(x, y)
- if (N0.getOpcode() == ISD::FABS || N0.getOpcode() == ISD::FNEG ||
- N0.getOpcode() == ISD::FCOPYSIGN)
- return DAG.getNode(ISD::FCOPYSIGN, DL, VT, N0.getOperand(0), N1);
-
- // copysign(x, abs(y)) -> abs(x)
- if (N1.getOpcode() == ISD::FABS)
- return DAG.getNode(ISD::FABS, DL, VT, N0);
-
- // copysign(x, copysign(y,z)) -> copysign(x, z)
- if (N1.getOpcode() == ISD::FCOPYSIGN)
- return DAG.getNode(ISD::FCOPYSIGN, DL, VT, N0, N1.getOperand(1));
-
// copysign(x, fp_extend(y)) -> copysign(x, y)
// copysign(x, fp_round(y)) -> copysign(x, y)
if (CanCombineFCOPYSIGN_EXTEND_ROUND(N))
@@ -18968,6 +18953,9 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
N0.getOperand(0));
}
+ if (SimplifyDemandedBits(N0, APInt::getAllOnes(VT.getScalarSizeInBits())))
+ return SDValue(N, 0);
+
if (SDValue Cast = foldSignChangeInBitcast(N))
return Cast;
@@ -19041,14 +19029,9 @@ SDValue DAGCombiner::visitFABS(SDNode *N) {
if (SDValue C = DAG.FoldConstantArithmetic(ISD::FABS, DL, VT, {N0}))
return C;
- // fold (fabs (fabs x)) -> (fabs x)
- if (N0.getOpcode() == ISD::FABS)
- return N->getOperand(0);
-
- // fold (fabs (fneg x)) -> (fabs x)
- // fold (fabs (fcopysign x, y)) -> (fabs x)
- if (N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN)
- return DAG.getNode(ISD::FABS, DL, VT, N0.getOperand(0));
+ if (SimplifyDemandedBits(N0,
+ APInt::getSignedMaxValue(VT.getScalarSizeInBits())))
+ return SDValue(N, 0);
if (SDValue Cast = foldSignChangeInBitcast(N))
return Cast;
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index a0ffb4b6d5a4c..971ed5d97d523 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2966,6 +2966,77 @@ bool TargetLowering::SimplifyDemandedBits(
}
break;
}
+ case ISD::FABS: {
+ SDValue Op0 = Op.getOperand(0);
+ APInt SignMask = APInt::getSignMask(BitWidth);
+
+ if (!DemandedBits.intersects(SignMask))
+ return TLO.CombineTo(Op, Op0);
+
+ if (SimplifyDemandedBits(Op0, ~SignMask & DemandedBits, DemandedElts, Known,
+ TLO, Depth + 1))
+ return true;
+
+ if (Known.isNonNegative())
+ return TLO.CombineTo(Op, Op0);
+ if (Known.isNegative())
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::FNEG, dl, VT, Op0));
+
+ Known.Zero |= SignMask;
+ Known.One &= ~SignMask;
+
+ break;
+ }
+ case ISD::FCOPYSIGN: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ APInt SignMask = APInt::getSignMask(BitWidth);
+
+ if (!DemandedBits.intersects(SignMask))
+ return TLO.CombineTo(Op, Op0);
+
+ if (SimplifyDemandedBits(Op0, ~SignMask & DemandedBits, DemandedElts, Known,
+ TLO, Depth + 1))
+ return true;
+ if (SimplifyDemandedBits(Op1, SignMask, DemandedElts, Known2, TLO,
+ Depth + 1))
+ return true;
+
+ if (ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO))
+ return true;
+
+ if ((Known.isNonNegative() && Known2.isNonNegative()) ||
+ (Known.isNegative() && Known2.isNegative()))
+ return TLO.CombineTo(Op, Op0);
+
+ if (Known2.isNonNegative())
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::FABS, dl, VT, Op0));
+
+ if (Known2.isNegative()) {
+ Known.One |= SignMask;
+ Known.Zero &= ~SignMask;
+ }
+
+ break;
+ }
+ case ISD::FNEG: {
+ SDValue Op0 = Op.getOperand(0);
+ APInt SignMask = APInt::getSignMask(BitWidth);
+
+ if (!DemandedBits.intersects(SignMask))
+ return TLO.CombineTo(Op, Op0);
+
+ if (SimplifyDemandedBits(Op0, DemandedBits, DemandedElts, Known, TLO,
+ Depth + 1))
+ return true;
+
+ if (Known.isNonNegative() || Known.isNegative()) {
+ Known.Zero ^= SignMask;
+ Known.One ^= SignMask;
+ }
+
+ break;
+ }
default:
// We also ask the target about intrinsics (which could be specific to it).
if (Op.getOpcode() >= ISD::BUILTIN_OP_END ||
diff --git a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
index 5e5fdd6d31705..554b2e3444fe4 100644
--- a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll
@@ -425,10 +425,7 @@ entry:
define float @extract_v4i32_copysign_build_vector_const(<4 x float> %a, <4 x float> %b, i32 %c) {
; CHECK-SD-LABEL: extract_v4i32_copysign_build_vector_const:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: adrp x8, .LCPI17_0
-; CHECK-SD-NEXT: mvni v1.4s, #128, lsl #24
-; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI17_0]
-; CHECK-SD-NEXT: bif v0.16b, v2.16b, v1.16b
+; CHECK-SD-NEXT: fabs v0.4s, v0.4s
; CHECK-SD-NEXT: mov s0, v0.s[2]
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
index a597faa028f22..f03958a967328 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
@@ -427,16 +427,18 @@ entry:
define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
; GFX-942-LABEL: fptrunc_f64_to_bf16_abs:
; GFX-942: ; %bb.0: ; %entry
-; GFX-942-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
-; GFX-942-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
-; GFX-942-NEXT: v_and_b32_e32 v7, 1, v6
-; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]|
-; GFX-942-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
-; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; GFX-942-NEXT: v_cvt_f32_f64_e64 v8, |v[0:1]|
+; GFX-942-NEXT: v_and_b32_e32 v5, 0x7fffffff, v1
+; GFX-942-NEXT: v_mov_b32_e32 v4, v0
+; GFX-942-NEXT: v_cvt_f64_f32_e32 v[6:7], v8
+; GFX-942-NEXT: v_and_b32_e32 v9, 1, v8
+; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[4:5]|, |v[6:7]|
+; GFX-942-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[6:7]
+; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
; GFX-942-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3]
-; GFX-942-NEXT: v_add_u32_e32 v4, v6, v4
+; GFX-942-NEXT: v_add_u32_e32 v4, v8, v4
; GFX-942-NEXT: s_or_b64 vcc, s[0:1], vcc
-; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
; GFX-942-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX-942-NEXT: s_movk_i32 s0, 0x7fff
; GFX-942-NEXT: v_add3_u32 v5, v5, v4, s0
@@ -449,16 +451,18 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
;
; GFX-950-LABEL: fptrunc_f64_to_bf16_abs:
; GFX-950: ; %bb.0: ; %entry
-; GFX-950-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
-; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
-; GFX-950-NEXT: v_and_b32_e32 v7, 1, v6
-; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]|
-; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
-; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
-; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3]
-; GFX-950-NEXT: v_add_u32_e32 v0, v6, v0
+; GFX-950-NEXT: v_cvt_f32_f64_e64 v8, |v[0:1]|
+; GFX-950-NEXT: v_and_b32_e32 v5, 0x7fffffff, v1
+; GFX-950-NEXT: v_mov_b32_e32 v4, v0
+; GFX-950-NEXT: v_cvt_f64_f32_e32 v[6:7], v8
+; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[6:7]
+; GFX-950-NEXT: v_and_b32_e32 v0, 1, v8
+; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[4:5]|, |v[6:7]|
+; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc
-; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3]
+; GFX-950-NEXT: v_add_u32_e32 v0, v8, v0
+; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX-950-NEXT: flat_store_short v[2:3], v0
; GFX-950-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 2bdf994496421..951f103f7d9c4 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -18639,8 +18639,8 @@ define bfloat @v_fabs_bf16(bfloat %a) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
@@ -18648,8 +18648,8 @@ define bfloat @v_fabs_bf16(bfloat %a) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -18832,8 +18832,8 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
@@ -18843,8 +18843,8 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
@@ -18889,23 +18889,23 @@ define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
; GCN-LABEL: s_fneg_fabs_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GCN-NEXT: s_bitset0_b32 s0, 31
-; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GCN-NEXT: s_xor_b32 s0, s0, 0x80000000
-; GCN-NEXT: s_lshr_b32 s0, s0, 16
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_fneg_fabs_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
-; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GFX7-NEXT: s_bitset0_b32 s0, 31
-; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000
-; GFX7-NEXT: s_xor_b32 s0, s0, 0x80000000
-; GFX7-NEXT: s_lshr_b32 s0, s0, 16
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fneg_fabs_bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
index e9fd6119d0c36..9f9b14d1c87a0 100644
--- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
@@ -211,22 +211,22 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) {
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_brev_b32 s8, -2
-; SI-NEXT: v_mov_b32_e32 v1, 0x43300000
-; SI-NEXT: v_mov_b32_e32 v0, 0
-; SI-NEXT: v_mov_b32_e32 v2, -1
-; SI-NEXT: v_mov_b32_e32 v3, 0x432fffff
+; SI-NEXT: v_mov_b32_e32 v0, -1
+; SI-NEXT: v_mov_b32_e32 v1, 0x432fffff
+; SI-NEXT: v_mov_b32_e32 v2, 0
+; SI-NEXT: v_mov_b32_e32 v3, 0x43300000
+; SI-NEXT: s_mov_b32 s8, 0
+; SI-NEXT: s_mov_b32 s9, 0xc3300000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_mov_b32_e32 v6, s3
-; SI-NEXT: v_bfi_b32 v1, s8, v1, v6
-; SI-NEXT: v_mov_b32_e32 v7, s2
-; SI-NEXT: v_add_f64 v[4:5], s[2:3], v[0:1]
-; SI-NEXT: v_add_f64 v[0:1], v[4:5], -v[0:1]
-; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[2:3]
-; SI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; SI-NEXT: v_add_f64 v[2:3], s[2:3], v[2:3]
+; SI-NEXT: v_mov_b32_e32 v4, s3
+; SI-NEXT: v_mov_b32_e32 v5, s2
+; SI-NEXT: v_add_f64 v[2:3], v[2:3], s[8:9]
+; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[0:1]
+; SI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -270,30 +270,31 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: s_brev_b32 s10, -2
-; SI-NEXT: v_mov_b32_e32 v6, 0x43300000
; SI-NEXT: s_mov_b32 s9, 0x432fffff
-; SI-NEXT: v_mov_b32_e32 v0, 0
+; SI-NEXT: s_mov_b32 s10, 0
+; SI-NEXT: s_mov_b32 s11, 0x43300000
+; SI-NEXT: s_mov_b32 s12, 0
+; SI-NEXT: s_mov_b32 s13, 0xc3300000
; SI-NEXT: s_mov_b32 s8, s2
-; SI-NEXT: v_mov_b32_e32 v4, s8
-; SI-NEXT: v_mov_b32_e32 v5, s9
+; SI-NEXT: v_mov_b32_e32 v0, s8
+; SI-NEXT: v_mov_b32_e32 v1, s9
+; SI-NEXT: v_mov_b32_e32 v2, s10
+; SI-NEXT: v_mov_b32_e32 v3, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v7, s7
-; SI-NEXT: v_bfi_b32 v1, s10, v6, v7
-; SI-NEXT: v_mov_b32_e32 v8, s6
-; SI-NEXT: v_mov_b32_e32 v9, s5
-; SI-NEXT: v_mov_b32_e32 v10, s4
-; SI-NEXT: v_add_f64 v[2:3], s[6:7], v[0:1]
-; SI-NEXT: v_add_f64 v[2:3], v[2:3], -v[0:1]
-; SI-NEXT: v_bfi_b32 v1, s10, v6, v9
-; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[4:5]
-; SI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
-; SI-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
-; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[0:1]
-; SI-NEXT: v_add_f64 v[0:1], v[6:7], -v[0:1]
-; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[4:5]
-; SI-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
+; SI-NEXT: v_add_f64 v[4:5], s[6:7], v[2:3]
+; SI-NEXT: v_mov_b32_e32 v8, s7
+; SI-NEXT: v_mov_b32_e32 v9, s6
+; SI-NEXT: v_add_f64 v[2:3], s[4:5], v[2:3]
+; SI-NEXT: v_mov_b32_e32 v10, s5
+; SI-NEXT: v_mov_b32_e32 v11, s4
+; SI-NEXT: v_add_f64 v[4:5], v[4:5], s[12:13]
+; SI-NEXT: v_add_f64 v[6:7], v[2:3], s[12:13]
+; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[0:1]
+; SI-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc
+; SI-NEXT: v_cndmask_b32_e32 v2, v4, v9, vcc
+; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[0:1]
+; SI-NEXT: v_cndmask_b32_e32 v1, v7, v10, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v6, v11, vcc
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -347,46 +348,45 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %
; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x11
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
-; SI-NEXT: s_brev_b32 s14, -2
-; SI-NEXT: v_mov_b32_e32 v10, 0x43300000
; SI-NEXT: s_mov_b32 s13, 0x432fffff
-; SI-NEXT: v_mov_b32_e32 v4, 0
+; SI-NEXT: s_mov_b32 s14, 0
+; SI-NEXT: s_mov_b32 s15, 0x43300000
+; SI-NEXT: s_mov_b32 s16, 0
+; SI-NEXT: s_mov_b32 s17, 0xc3300000
; SI-NEXT: s_mov_b32 s12, s10
-; SI-NEXT: v_mov_b32_e32 v8, s12
-; SI-NEXT: v_mov_b32_e32 v9, s13
+; SI-NEXT: v_mov_b32_e32 v4, s12
+; SI-NEXT: v_mov_b32_e32 v5, s13
+; SI-NEXT: v_mov_b32_e32 v0, s14
+; SI-NEXT: v_mov_b32_e32 v1, s15
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v2, s3
-; SI-NEXT: v_bfi_b32 v5, s14, v10, v2
-; SI-NEXT: v_mov_b32_e32 v6, s2
-; SI-NEXT: v_mov_b32_e32 v7, s1
-; SI-NEXT: v_mov_b32_e32 v11, s0
-; SI-NEXT: v_mov_b32_e32 v12, s7
-; SI-NEXT: v_mov_b32_e32 v13, s6
-; SI-NEXT: v_mov_b32_e32 v14, s5
-; SI-NEXT: v_mov_b32_e32 v15, s4
-; SI-NEXT: v_add_f64 v[0:1], s[2:3], v[4:5]
-; SI-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5]
-; SI-NEXT: v_bfi_b32 v5, s14, v10, v7
-; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[8:9]
-; SI-NEXT: v_cndmask_b32_e32 v3, v1, v2, vcc
-; SI-NEXT: v_cndmask_b32_e32 v2, v0, v6, vcc
-; SI-NEXT: v_add_f64 v[0:1], s[0:1], v[4:5]
-; SI-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5]
-; SI-NEXT: v_bfi_b32 v5, s14, v10, v12
-; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[0:1]|, v[8:9]
-; SI-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc
-; SI-NEXT: v_add_f64 v[6:7], s[6:7], v[4:5]
-; SI-NEXT: v_add_f64 v[6:7], v[6:7], -v[4:5]
-; SI-NEXT: v_bfi_b32 v5, s14, v10, v14
-; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[8:9]
-; SI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc
-; SI-NEXT: v_cndmask_b32_e32 v6, v6, v13, vcc
-; SI-NEXT: v_add_f64 v[10:11], s[4:5], v[4:5]
-; SI-NEXT: v_add_f64 v[4:5], v[10:11], -v[4:5]
-; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[8:9]
-; SI-NEXT: v_cndmask_b32_e32 v5, v5, v14, vcc
-; SI-NEXT: v_cndmask_b32_e32 v4, v4, v15, vcc
+; SI-NEXT: v_add_f64 v[2:3], s[2:3], v[0:1]
+; SI-NEXT: v_mov_b32_e32 v10, s3
+; SI-NEXT: v_mov_b32_e32 v11, s2
+; SI-NEXT: v_add_f64 v[6:7], s[0:1], v[0:1]
+; SI-NEXT: v_mov_b32_e32 v12, s1
+; SI-NEXT: v_mov_b32_e32 v13, s0
+; SI-NEXT: v_add_f64 v[8:9], s[6:7], v[0:1]
+; SI-NEXT: v_mov_b32_e32 v14, s7
+; SI-NEXT: v_mov_b32_e32 v15, s6
+; SI-NEXT: v_add_f64 v[0:1], s[4:5], v[0:1]
+; SI-NEXT: v_mov_b32_e32 v16, s5
+; SI-NEXT: v_add_f64 v[2:3], v[2:3], s[16:17]
+; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[4:5]
+; SI-NEXT: v_cndmask_b32_e32 v3, v3, v10, vcc
+; SI-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc
+; SI-NEXT: v_mov_b32_e32 v17, s4
+; SI-NEXT: v_add_f64 v[6:7], v[6:7], s[16:17]
+; SI-NEXT: v_add_f64 v[8:9], v[8:9], s[16:17]
+; SI-NEXT: v_add_f64 v[10:11], v[0:1], s[16:17]
+; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[0:1]|, v[4:5]
+; SI-NEXT: v_cndmask_b32_e32 v1, v7, v12, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v6, v13, vcc
+; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[4:5]
+; SI-NEXT: v_cndmask_b32_e32 v7, v9, v14, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v8, v15, vcc
+; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[4:5]
+; SI-NEXT: v_cndmask_b32_e32 v5, v11, v16, vcc
+; SI-NEXT: v_cndmask_b32_e32 v4, v10, v17, vcc
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
index 228420ef0acb0..0b6baf4b5f504 100644
--- a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
+++ b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
@@ -559,10 +559,11 @@ define float @v_rcp_fabs_f32_ieee_ulp25(float %x) #3 {
; SI-LABEL: v_rcp_fabs_f32_ieee_ulp25:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
; SI-NEXT: s_mov_b32 s4, 0x7f800000
-; SI-NEXT: v_frexp_mant_f32_e64 v1, |v0|
-; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
-; SI-NEXT: v_cndmask_b32_e64 v1, |v0|, v1, s[4:5]
+; SI-NEXT: v_frexp_mant_f32_e64 v2, |v0|
+; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v1|, s4
+; SI-NEXT: v_cndmask_b32_e64 v1, |v0|, v2, s[4:5]
; SI-NEXT: v_rcp_f32_e32 v1, v1
; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
@@ -700,10 +701,11 @@ define float @v_rcp_neg_fabs_f32_ieee_ulp25(float %x) #3 {
; SI-LABEL: v_rcp_neg_fabs_f32_ieee_ulp25:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
; SI-NEXT: s_mov_b32 s4, 0x7f800000
-; SI-NEXT: v_frexp_mant_f32_e64 v1, -|v0|
-; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
-; SI-NEXT: v_cndmask_b32_e64 v1, -|v0|, v1, s[4:5]
+; SI-NEXT: v_frexp_mant_f32_e64 v2, -|v0|
+; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v1|, s4
+; SI-NEXT: v_cndmask_b32_e64 v1, -|v0|, v2, s[4:5]
; SI-NEXT: v_rcp_f32_e32 v1, v1
; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/roundeven.ll b/llvm/test/CodeGen/AMDGPU/roundeven.ll
index 59a1fe041bf90..5ebc08cf8402c 100644
--- a/llvm/test/CodeGen/AMDGPU/roundeven.ll
+++ b/llvm/test/CodeGen/AMDGPU/roundeven.ll
@@ -1124,14 +1124,14 @@ define double @v_roundeven_f64(double %x) {
; SDAG_GFX6-LABEL: v_roundeven_f64:
; SDAG_GFX6: ; %bb.0:
; SDAG_GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX6-NEXT: s_brev_b32 s6, -2
-; SDAG_GFX6-NEXT: v_mov_b32_e32 v2, 0x43300000
-; SDAG_GFX6-NEXT: v_bfi_b32 v3, s6, v2, v1
-; SDAG_GFX6-NEXT: v_mov_b32_e32 v2, 0
+; SDAG_GFX6-NEXT: s_mov_b32 s6, 0
+; SDAG_GFX6-NEXT: s_mov_b32 s7, 0x43300000
; SDAG_GFX6-NEXT: s_mov_b32 s4, -1
-; SDAG_GFX6-NEXT: v_add_f64 v[4:5], v[0:1], v[2:3]
+; SDAG_GFX6-NEXT: v_add_f64 v[2:3], v[0:1], s[6:7]
+; SDAG_GFX6-NEXT: s_mov_b32 s6, 0
; SDAG_GFX6-NEXT: s_mov_b32 s5, 0x432fffff
-; SDAG_GFX6-NEXT: v_add_f64 v[2:3], v[4:5], -v[2:3]
+; SDAG_GFX6-NEXT: s_mov_b32 s7, 0xc3300000
+; SDAG_GFX6-NEXT: v_add_f64 v[2:3], v[2:3], s[6:7]
; SDAG_GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -1208,18 +1208,18 @@ define double @v_roundeven_f64_fneg(double %x) {
; SDAG_GFX6-LABEL: v_roundeven_f64_fneg:
; SDAG_GFX6: ; %bb.0:
; SDAG_GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX6-NEXT: v_xor_b32_e32 v6, 0x80000000, v1
-; SDAG_GFX6-NEXT: s_brev_b32 s4, -2
-; SDAG_GFX6-NEXT: v_mov_b32_e32 v2, 0x43300000
-; SDAG_GFX6-NEXT: v_bfi_b32 v3, s4, v2, v6
-; SDAG_GFX6-NEXT: v_mov_b32_e32 v2, 0
-; SDAG_GFX6-NEXT: v_add_f64 v[4:5], -v[0:1], v[2:3]
+; SDAG_GFX6-NEXT: s_mov_b32 s6, 0
+; SDAG_GFX6-NEXT: s_mov_b32 s7, 0x43300000
; SDAG_GFX6-NEXT: s_mov_b32 s4, -1
+; SDAG_GFX6-NEXT: v_add_f64 v[2:3], -v[0:1], s[6:7]
+; SDAG_GFX6-NEXT: s_mov_b32 s6, 0
; SDAG_GFX6-NEXT: s_mov_b32 s5, 0x432fffff
-; SDAG_GFX6-NEXT: v_add_f64 v[2:3], v[4:5], -v[2:3]
+; SDAG_GFX6-NEXT: s_mov_b32 s7, 0xc3300000
+; SDAG_GFX6-NEXT: v_add_f64 v[2:3], v[2:3], s[6:7]
; SDAG_GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
+; SDAG_GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v1
; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
+; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; SDAG_GFX6-NEXT: s_setpc_b64 s[30:31]
;
; SDAG_GFX7-LABEL: v_roundeven_f64_fneg:
@@ -1304,20 +1304,19 @@ define <2 x double> @v_roundeven_v2f64(<2 x double> %x) {
; SDAG_GFX6-LABEL: v_roundeven_v2f64:
; SDAG_GFX6: ; %bb.0:
; SDAG_GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX6-NEXT: s_brev_b32 s6, -2
-; SDAG_GFX6-NEXT: v_mov_b32_e32 v8, 0x43300000
-; SDAG_GFX6-NEXT: v_bfi_b32 v5, s6, v8, v1
-; SDAG_GFX6-NEXT: v_mov_b32_e32 v4, 0
-; SDAG_GFX6-NEXT: v_add_f64 v[6:7], v[0:1], v[4:5]
+; SDAG_GFX6-NEXT: s_mov_b32 s6, 0
+; SDAG_GFX6-NEXT: s_mov_b32 s7, 0x43300000
; SDAG_GFX6-NEXT: s_mov_b32 s4, -1
+; SDAG_GFX6-NEXT: v_add_f64 v[4:5], v[0:1], s[6:7]
+; SDAG_GFX6-NEXT: s_mov_b32 s8, 0
; SDAG_GFX6-NEXT: s_mov_b32 s5, 0x432fffff
-; SDAG_GFX6-NEXT: v_add_f64 v[5:6], v[6:7], -v[4:5]
+; SDAG_GFX6-NEXT: s_mov_b32 s9, 0xc3300000
+; SDAG_GFX6-NEXT: v_add_f64 v[4:5], v[4:5], s[8:9]
; SDAG_GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5]
-; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
-; SDAG_GFX6-NEXT: v_bfi_b32 v5, s6, v8, v3
-; SDAG_GFX6-NEXT: v_add_f64 v[7:8], v[2:3], v[4:5]
-; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
-; SDAG_GFX6-NEXT: v_add_f64 v[4:5], v[7:8], -v[4:5]
+; SDAG_GFX6-NEXT: v_add_f64 v[6:7], v[2:3], s[6:7]
+; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; SDAG_GFX6-NEXT: v_add_f64 v[4:5], v[6:7], s[8:9]
; SDAG_GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[2:3]|, s[4:5]
; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
index 7ed27f008083e..8691925797621 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll
@@ -776,20 +776,16 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
; CI-LABEL: add_select_fabs_negk_negk_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc
-; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CI-NEXT: v_cndmask_b32_e64 v1, -1.0, -2.0, vcc
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
+; CI-NEXT: v_cndmask_b32_e64 v1, -1.0, -2.0, vcc
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1|
-; CI-NEXT: v_add_f32_e32 v0, v0, v2
-; CI-NEXT: v_add_f32_e32 v1, v1, v3
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc
+; CI-NEXT: v_sub_f32_e32 v1, v3, v1
+; CI-NEXT: v_sub_f32_e32 v0, v2, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_fabs_negk_negk_v2f16:
@@ -801,8 +797,8 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; VI-NEXT: v_add_f16_sdwa v1, |v1|, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; VI-NEXT: v_add_f16_e64 v0, |v0|, v2
+; VI-NEXT: v_sub_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_sub_f16_e32 v0, v2, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
From 6752a2975929cc5ba2df5be24ee72c2fd7bc3851 Mon Sep 17 00:00:00 2001
From: Iris Shi <0.0 at owo.li>
Date: Sat, 10 May 2025 23:56:28 +0800
Subject: [PATCH 2/4] update
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 ++
.../CodeGen/SelectionDAG/TargetLowering.cpp | 14 +++----
llvm/test/CodeGen/AMDGPU/bf16-conversions.ll | 38 +++++++++----------
llvm/test/CodeGen/AMDGPU/rcp-pattern.ll | 14 +++----
4 files changed, 33 insertions(+), 37 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 39c883148ad02..788f21d7596f7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -19029,6 +19029,10 @@ SDValue DAGCombiner::visitFABS(SDNode *N) {
if (SDValue C = DAG.FoldConstantArithmetic(ISD::FABS, DL, VT, {N0}))
return C;
+ // fold (fabs (fabs x)) -> (fabs x)
+ if (N0.getOpcode() == ISD::FABS)
+ return N->getOperand(0);
+
if (SimplifyDemandedBits(N0,
APInt::getSignedMaxValue(VT.getScalarSizeInBits())))
return SDValue(N, 0);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 971ed5d97d523..6dde51dc16513 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2980,7 +2980,8 @@ bool TargetLowering::SimplifyDemandedBits(
if (Known.isNonNegative())
return TLO.CombineTo(Op, Op0);
if (Known.isNegative())
- return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::FNEG, dl, VT, Op0));
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(ISD::FNEG, dl, VT, Op0, Op->getFlags()));
Known.Zero |= SignMask;
Known.One &= ~SignMask;
@@ -2996,21 +2997,18 @@ bool TargetLowering::SimplifyDemandedBits(
return TLO.CombineTo(Op, Op0);
if (SimplifyDemandedBits(Op0, ~SignMask & DemandedBits, DemandedElts, Known,
- TLO, Depth + 1))
- return true;
- if (SimplifyDemandedBits(Op1, SignMask, DemandedElts, Known2, TLO,
+ TLO, Depth + 1) ||
+ SimplifyDemandedBits(Op1, SignMask, DemandedElts, Known2, TLO,
Depth + 1))
return true;
- if (ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO))
- return true;
-
if ((Known.isNonNegative() && Known2.isNonNegative()) ||
(Known.isNegative() && Known2.isNegative()))
return TLO.CombineTo(Op, Op0);
if (Known2.isNonNegative())
- return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::FABS, dl, VT, Op0));
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(ISD::FABS, dl, VT, Op0, Op->getFlags()));
if (Known2.isNegative()) {
Known.One |= SignMask;
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
index f03958a967328..a597faa028f22 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
@@ -427,18 +427,16 @@ entry:
define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
; GFX-942-LABEL: fptrunc_f64_to_bf16_abs:
; GFX-942: ; %bb.0: ; %entry
-; GFX-942-NEXT: v_cvt_f32_f64_e64 v8, |v[0:1]|
-; GFX-942-NEXT: v_and_b32_e32 v5, 0x7fffffff, v1
-; GFX-942-NEXT: v_mov_b32_e32 v4, v0
-; GFX-942-NEXT: v_cvt_f64_f32_e32 v[6:7], v8
-; GFX-942-NEXT: v_and_b32_e32 v9, 1, v8
-; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[4:5]|, |v[6:7]|
-; GFX-942-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[6:7]
-; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
+; GFX-942-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
+; GFX-942-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; GFX-942-NEXT: v_and_b32_e32 v7, 1, v6
+; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]|
+; GFX-942-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
+; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GFX-942-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3]
-; GFX-942-NEXT: v_add_u32_e32 v4, v8, v4
+; GFX-942-NEXT: v_add_u32_e32 v4, v6, v4
; GFX-942-NEXT: s_or_b64 vcc, s[0:1], vcc
-; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
+; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX-942-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX-942-NEXT: s_movk_i32 s0, 0x7fff
; GFX-942-NEXT: v_add3_u32 v5, v5, v4, s0
@@ -451,18 +449,16 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
;
; GFX-950-LABEL: fptrunc_f64_to_bf16_abs:
; GFX-950: ; %bb.0: ; %entry
-; GFX-950-NEXT: v_cvt_f32_f64_e64 v8, |v[0:1]|
-; GFX-950-NEXT: v_and_b32_e32 v5, 0x7fffffff, v1
-; GFX-950-NEXT: v_mov_b32_e32 v4, v0
-; GFX-950-NEXT: v_cvt_f64_f32_e32 v[6:7], v8
-; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[6:7]
-; GFX-950-NEXT: v_and_b32_e32 v0, 1, v8
-; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[4:5]|, |v[6:7]|
-; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc
+; GFX-950-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
+; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; GFX-950-NEXT: v_and_b32_e32 v7, 1, v6
+; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]|
+; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
+; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3]
-; GFX-950-NEXT: v_add_u32_e32 v0, v8, v0
-; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
+; GFX-950-NEXT: v_add_u32_e32 v0, v6, v0
+; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc
+; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX-950-NEXT: flat_store_short v[2:3], v0
; GFX-950-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
index 0b6baf4b5f504..228420ef0acb0 100644
--- a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
+++ b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
@@ -559,11 +559,10 @@ define float @v_rcp_fabs_f32_ieee_ulp25(float %x) #3 {
; SI-LABEL: v_rcp_fabs_f32_ieee_ulp25:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
; SI-NEXT: s_mov_b32 s4, 0x7f800000
-; SI-NEXT: v_frexp_mant_f32_e64 v2, |v0|
-; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v1|, s4
-; SI-NEXT: v_cndmask_b32_e64 v1, |v0|, v2, s[4:5]
+; SI-NEXT: v_frexp_mant_f32_e64 v1, |v0|
+; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
+; SI-NEXT: v_cndmask_b32_e64 v1, |v0|, v1, s[4:5]
; SI-NEXT: v_rcp_f32_e32 v1, v1
; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
@@ -701,11 +700,10 @@ define float @v_rcp_neg_fabs_f32_ieee_ulp25(float %x) #3 {
; SI-LABEL: v_rcp_neg_fabs_f32_ieee_ulp25:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
; SI-NEXT: s_mov_b32 s4, 0x7f800000
-; SI-NEXT: v_frexp_mant_f32_e64 v2, -|v0|
-; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v1|, s4
-; SI-NEXT: v_cndmask_b32_e64 v1, -|v0|, v2, s[4:5]
+; SI-NEXT: v_frexp_mant_f32_e64 v1, -|v0|
+; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
+; SI-NEXT: v_cndmask_b32_e64 v1, -|v0|, v1, s[4:5]
; SI-NEXT: v_rcp_f32_e32 v1, v1
; SI-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
; SI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
>From d19201ef4671e2ccc5990c00ed362602659a73a0 Mon Sep 17 00:00:00 2001
From: Iris Shi <0.0 at owo.li>
Date: Wed, 11 Jun 2025 21:06:51 +0800
Subject: [PATCH 3/4] fix tests
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 20 ++++++++++++-------
llvm/test/CodeGen/AMDGPU/fabs.bf16.ll | 4 ++--
llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll | 19 +++++++++---------
3 files changed, 24 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 6dde51dc16513..36923aa24e4ea 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2991,14 +2991,20 @@ bool TargetLowering::SimplifyDemandedBits(
case ISD::FCOPYSIGN: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
- APInt SignMask = APInt::getSignMask(BitWidth);
- if (!DemandedBits.intersects(SignMask))
+ unsigned BitWidth0 = Op0.getScalarValueSizeInBits();
+ unsigned BitWidth1 = Op1.getScalarValueSizeInBits();
+ APInt SignMask0 = APInt::getSignMask(BitWidth0);
+ APInt SignMask1 = APInt::getSignMask(BitWidth1);
+
+ if (!DemandedBits.intersects(SignMask0))
return TLO.CombineTo(Op, Op0);
- if (SimplifyDemandedBits(Op0, ~SignMask & DemandedBits, DemandedElts, Known,
- TLO, Depth + 1) ||
- SimplifyDemandedBits(Op1, SignMask, DemandedElts, Known2, TLO,
+ APInt ScalarDemandedBits = DemandedBits.trunc(BitWidth0);
+
+ if (SimplifyDemandedBits(Op0, ~SignMask0 & ScalarDemandedBits, DemandedElts,
+ Known, TLO, Depth + 1) ||
+ SimplifyDemandedBits(Op1, SignMask1, DemandedElts, Known2, TLO,
Depth + 1))
return true;
@@ -3011,8 +3017,8 @@ bool TargetLowering::SimplifyDemandedBits(
Op, TLO.DAG.getNode(ISD::FABS, dl, VT, Op0, Op->getFlags()));
if (Known2.isNegative()) {
- Known.One |= SignMask;
- Known.Zero &= ~SignMask;
+ Known.One |= SignMask0;
+ Known.Zero &= ~SignMask0;
}
break;
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll
index 71b1a16c79e69..1be99af42b8f8 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll
@@ -220,7 +220,7 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s4, s3, 0xffff0000
; CI-NEXT: s_lshl_b32 s3, s3, 16
-; CI-NEXT: s_and_b32 s5, s2, 0xffff0000
+; CI-NEXT: s_and_b32 s5, s2, 0x7fff0000
; CI-NEXT: v_mul_f32_e64 v0, 1.0, |s4|
; CI-NEXT: v_mul_f32_e64 v1, 1.0, |s3|
; CI-NEXT: v_mul_f32_e64 v2, 1.0, |s5|
@@ -944,7 +944,7 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2bf16(ptr addrspace(1) %in) #0 {
; CI-NEXT: flat_load_dword v0, v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; CI-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; CI-NEXT: v_mul_f32_e64 v1, 1.0, |v1|
; CI-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
index d189b6d4c1e83..8e2a187a71384 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
@@ -481,17 +481,16 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out,
; CI-NEXT: s_lshl_b32 s2, s2, 16
; CI-NEXT: v_add_f32_e64 v0, s3, 2.0
; CI-NEXT: v_add_f32_e64 v1, s2, 1.0
-; CI-NEXT: v_readfirstlane_b32 s2, v0
-; CI-NEXT: s_and_b32 s2, s2, 0xffff0000
-; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; CI-NEXT: s_bitset0_b32 s2, 31
-; CI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v1
-; CI-NEXT: s_and_b32 s2, s2, 0xffff0000
-; CI-NEXT: s_xor_b32 s2, s2, 0x80000000
+; CI-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; CI-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
+; CI-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
+; CI-NEXT: v_mul_f32_e64 v1, 1.0, |v1|
; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; CI-NEXT: s_lshr_b32 s2, s2, 16
; CI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; CI-NEXT: v_alignbit_b32 v2, s2, v0, 16
+; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; CI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
@@ -676,7 +675,7 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat>
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshl_b32 s4, s2, 16
-; CI-NEXT: s_and_b32 s2, s2, 0xffff0000
+; CI-NEXT: s_and_b32 s2, s2, 0x7fff0000
; CI-NEXT: v_mul_f32_e64 v2, 1.0, |s2|
; CI-NEXT: s_and_b32 s2, s3, 0xffff0000
; CI-NEXT: s_lshl_b32 s5, s3, 16
>From aff60826bc7b2f90004a051801ec32a18064c4ac Mon Sep 17 00:00:00 2001
From: Iris Shi <0.0 at owo.li>
Date: Wed, 11 Jun 2025 21:52:59 +0800
Subject: [PATCH 4/4] fix incorrect desired bits in `visitFABS`
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 3 +-
llvm/test/CodeGen/AMDGPU/bf16.ll | 36 +++++++++----------
llvm/test/CodeGen/AMDGPU/fabs.bf16.ll | 4 +--
llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll | 19 +++++-----
4 files changed, 31 insertions(+), 31 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 788f21d7596f7..356375bf658ca 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -19033,8 +19033,7 @@ SDValue DAGCombiner::visitFABS(SDNode *N) {
if (N0.getOpcode() == ISD::FABS)
return N->getOperand(0);
- if (SimplifyDemandedBits(N0,
- APInt::getSignedMaxValue(VT.getScalarSizeInBits())))
+ if (SimplifyDemandedBits(N0, APInt::getAllOnes(VT.getScalarSizeInBits())))
return SDValue(N, 0);
if (SDValue Cast = foldSignChangeInBitcast(N))
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 951f103f7d9c4..2bdf994496421 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -18639,8 +18639,8 @@ define bfloat @v_fabs_bf16(bfloat %a) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
-; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
@@ -18648,8 +18648,8 @@ define bfloat @v_fabs_bf16(bfloat %a) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
-; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -18832,8 +18832,8 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
-; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
@@ -18843,8 +18843,8 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
-; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
@@ -18889,23 +18889,23 @@ define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
; GCN-LABEL: s_fneg_fabs_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
-; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GCN-NEXT: s_bitset0_b32 s0, 31
+; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GCN-NEXT: s_xor_b32 s0, s0, 0x80000000
+; GCN-NEXT: s_lshr_b32 s0, s0, 16
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_fneg_fabs_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
-; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
-; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX7-NEXT: s_bitset0_b32 s0, 31
+; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX7-NEXT: s_xor_b32 s0, s0, 0x80000000
+; GFX7-NEXT: s_lshr_b32 s0, s0, 16
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fneg_fabs_bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll
index 1be99af42b8f8..71b1a16c79e69 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll
@@ -220,7 +220,7 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s4, s3, 0xffff0000
; CI-NEXT: s_lshl_b32 s3, s3, 16
-; CI-NEXT: s_and_b32 s5, s2, 0x7fff0000
+; CI-NEXT: s_and_b32 s5, s2, 0xffff0000
; CI-NEXT: v_mul_f32_e64 v0, 1.0, |s4|
; CI-NEXT: v_mul_f32_e64 v1, 1.0, |s3|
; CI-NEXT: v_mul_f32_e64 v2, 1.0, |s5|
@@ -944,7 +944,7 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2bf16(ptr addrspace(1) %in) #0 {
; CI-NEXT: flat_load_dword v0, v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; CI-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; CI-NEXT: v_mul_f32_e64 v1, 1.0, |v1|
; CI-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
index 8e2a187a71384..d189b6d4c1e83 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll
@@ -481,16 +481,17 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out,
; CI-NEXT: s_lshl_b32 s2, s2, 16
; CI-NEXT: v_add_f32_e64 v0, s3, 2.0
; CI-NEXT: v_add_f32_e64 v1, s2, 1.0
-; CI-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
-; CI-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
-; CI-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
-; CI-NEXT: v_mul_f32_e64 v1, 1.0, |v1|
+; CI-NEXT: v_readfirstlane_b32 s2, v0
+; CI-NEXT: s_and_b32 s2, s2, 0xffff0000
+; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; CI-NEXT: s_bitset0_b32 s2, 31
+; CI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v1
+; CI-NEXT: s_and_b32 s2, s2, 0xffff0000
+; CI-NEXT: s_xor_b32 s2, s2, 0x80000000
; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; CI-NEXT: s_lshr_b32 s2, s2, 16
; CI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; CI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16
+; CI-NEXT: v_alignbit_b32 v2, s2, v0, 16
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
@@ -675,7 +676,7 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat>
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshl_b32 s4, s2, 16
-; CI-NEXT: s_and_b32 s2, s2, 0x7fff0000
+; CI-NEXT: s_and_b32 s2, s2, 0xffff0000
; CI-NEXT: v_mul_f32_e64 v2, 1.0, |s2|
; CI-NEXT: s_and_b32 s2, s3, 0xffff0000
; CI-NEXT: s_lshl_b32 s5, s3, 16
More information about the llvm-commits mailing list