[llvm] ddc0f1d - [TargetLowering] Actually add the adjustment to the significand
David Majnemer via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 21 11:35:21 PST 2024
Author: David Majnemer
Date: 2024-02-21T19:34:11Z
New Revision: ddc0f1d8fed4f1a1742598ffd7dc3195bb37a8f1
URL: https://github.com/llvm/llvm-project/commit/ddc0f1d8fed4f1a1742598ffd7dc3195bb37a8f1
DIFF: https://github.com/llvm/llvm-project/commit/ddc0f1d8fed4f1a1742598ffd7dc3195bb37a8f1.diff
LOG: [TargetLowering] Actually add the adjustment to the significand
The logic was supposed to be choosing between {0, 1, -1} as an
adjustment to the FP bit pattern. However, the adjustment itself was
used as the bit pattern instead, which resulted in garbage results.
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/test/CodeGen/AMDGPU/bf16.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index d059dc66d05884..bde1fff4e1ca74 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10895,15 +10895,17 @@ SDValue TargetLowering::expandRoundInexactToOdd(EVT ResultVT, SDValue Op,
EVT ResultIntVTCCVT = getSetCCResultType(
DAG.getDataLayout(), *DAG.getContext(), And.getValueType());
SDValue Zero = DAG.getConstant(0, dl, ResultIntVT);
+ // The result is already odd so we don't need to do anything.
SDValue AlreadyOdd = DAG.getSetCC(dl, ResultIntVTCCVT, And, Zero, ISD::SETNE);
EVT WideSetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
AbsWide.getValueType());
+ // We keep results which are exact, odd or NaN.
SDValue KeepNarrow =
DAG.getSetCC(dl, WideSetCCVT, AbsWide, AbsNarrowAsWide, ISD::SETUEQ);
KeepNarrow = DAG.getNode(ISD::OR, dl, WideSetCCVT, KeepNarrow, AlreadyOdd);
- // We morally performed a round-down if `abs_narrow` is smaller than
- // `abs_wide`.
+ // We morally performed a round-down if AbsNarrow is smaller than
+ // AbsWide.
SDValue NarrowIsRd =
DAG.getSetCC(dl, WideSetCCVT, AbsWide, AbsNarrowAsWide, ISD::SETOGT);
// If the narrow value is odd or exact, pick it.
@@ -10911,12 +10913,13 @@ SDValue TargetLowering::expandRoundInexactToOdd(EVT ResultVT, SDValue Op,
// or rounded-down value. If narrow is the rounded-down value, we want
// the rounded-up value as it will be odd.
SDValue Adjust = DAG.getSelect(dl, ResultIntVT, NarrowIsRd, One, NegativeOne);
- Adjust = DAG.getSelect(dl, ResultIntVT, KeepNarrow, Zero, Adjust);
+ SDValue Adjusted = DAG.getNode(ISD::ADD, dl, ResultIntVT, NarrowBits, Adjust);
+ Op = DAG.getSelect(dl, ResultIntVT, KeepNarrow, NarrowBits, Adjusted);
int ShiftAmount = BitSize - ResultVT.getScalarSizeInBits();
SDValue ShiftCnst = DAG.getShiftAmountConstant(ShiftAmount, WideIntVT, dl);
SignBit = DAG.getNode(ISD::SRL, dl, WideIntVT, SignBit, ShiftCnst);
SignBit = DAG.getNode(ISD::TRUNCATE, dl, ResultIntVT, SignBit);
- Op = DAG.getNode(ISD::OR, dl, ResultIntVT, Adjust, SignBit);
+ Op = DAG.getNode(ISD::OR, dl, ResultIntVT, Op, SignBit);
return DAG.getNode(ISD::BITCAST, dl, ResultVT, Op);
}
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index e841a8867fc522..67538f26c550bd 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -2281,13 +2281,14 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX8-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
; GFX8-NEXT: v_and_b32_e32 v7, 0x80000000, v1
; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
-; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
-; GFX8-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[4:5]
-; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[4:5]
-; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7]
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5]
+; GFX8-NEXT: v_and_b32_e32 v8, 1, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX8-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[0:1]|, v[4:5]
+; GFX8-NEXT: v_cmp_nlg_f64_e64 s[6:7], |v[0:1]|, v[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[4:5]
+; GFX8-NEXT: v_add_u32_e64 v4, s[4:5], v6, v4
+; GFX8-NEXT: s_or_b64 vcc, s[6:7], vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX8-NEXT: v_or_b32_e32 v5, v4, v7
; GFX8-NEXT: v_bfe_u32 v4, v4, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
@@ -2310,14 +2311,15 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
-; GFX9-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
-; GFX9-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[4:5]
+; GFX9-NEXT: v_and_b32_e32 v7, 1, v6
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, v[4:5]
-; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GFX9-NEXT: v_cmp_nlg_f64_e64 s[4:5], |v[0:1]|, v[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7]
-; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5]
+; GFX9-NEXT: v_add_u32_e32 v4, v6, v4
+; GFX9-NEXT: s_or_b64 vcc, s[4:5], vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
; GFX9-NEXT: v_and_or_b32 v5, v1, s8, v4
; GFX9-NEXT: v_bfe_u32 v4, v4, 16, 1
; GFX9-NEXT: v_add3_u32 v4, v4, v5, s9
@@ -2335,15 +2337,16 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
-; GFX10-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
-; GFX10-NEXT: v_cmp_nlg_f64_e64 s4, |v[0:1]|, v[4:5]
+; GFX10-NEXT: v_and_b32_e32 v7, 1, v6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
; GFX10-NEXT: v_cmp_gt_f64_e64 s5, |v[0:1]|, v[4:5]
-; GFX10-NEXT: s_or_b32 s4, s4, vcc_lo
-; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
+; GFX10-NEXT: v_cmp_nlg_f64_e64 s4, |v[0:1]|, v[4:5]
; GFX10-NEXT: v_cndmask_b32_e64 v4, -1, 1, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, 0, s4
+; GFX10-NEXT: s_or_b32 vcc_lo, s4, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0x400000
+; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
; GFX10-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4
; GFX10-NEXT: v_bfe_u32 v4, v4, 16, 1
; GFX10-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
@@ -2360,23 +2363,24 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX11-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
-; GFX11-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX11-NEXT: v_and_b32_e32 v7, 1, v6
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5]
; GFX11-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, v[4:5]
-; GFX11-NEXT: s_or_b32 s0, s0, vcc_lo
-; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5]
; GFX11-NEXT: v_cndmask_b32_e64 v4, -1, 1, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, 0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0x400000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
; GFX11-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4
; GFX11-NEXT: v_bfe_u32 v4, v4, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
; GFX11-NEXT: v_and_or_b32 v5, 0x80000000, v5, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
More information about the llvm-commits
mailing list