[llvm] [SelectionDAG] Support sign tracking through `{S|U}INT_TO_FP` (PR #82808)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 23 10:23:36 PST 2024
https://github.com/goldsteinn created https://github.com/llvm/llvm-project/pull/82808
Just a minimal amount of easily provable tracking.
Proofs: https://alive2.llvm.org/ce/z/RQYbdw
Alive2 has an issue with `(sitofp i1)`, but it can
be verified by hand: https://godbolt.org/z/qKr7hT7s9
>From 1849da88b8886cd7ed9307dfb7b4958d51e0de72 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n at gmail.com>
Date: Fri, 23 Feb 2024 12:18:50 -0600
Subject: [PATCH] [SelectionDAG] Support sign tracking through `{S|U}INT_TO_FP`
Just a minimal amount of easily provable tracking.
Proofs: https://alive2.llvm.org/ce/z/RQYbdw
Alive2 has an issue with `(sitofp i1)`, but it can
be verified by hand: https://godbolt.org/z/qKr7hT7s9
---
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 13 +++
.../fold-int-pow2-with-fmul-or-fdiv.ll | 10 +--
llvm/test/CodeGen/AMDGPU/bf16.ll | 80 ++++++++---------
.../AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll | 85 ++-----------------
.../X86/fold-int-pow2-with-fmul-or-fdiv.ll | 54 +++---------
5 files changed, 76 insertions(+), 166 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index add92cf8b31e44..dbd60fcce01888 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -4004,6 +4004,19 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
break;
}
+ case ISD::UINT_TO_FP: {
+ Known.makeNonNegative();
+ break;
+ }
+ case ISD::SINT_TO_FP: {
+ Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ if (Known2.isNonNegative())
+ Known.makeNonNegative();
+ // For fp -> sint, need to guard -0 case.
+ else if (Known2.isNegative())
+ Known.makeNegative();
+ break;
+ }
case ISD::FP_TO_UINT_SAT: {
// FP_TO_UINT_SAT produces an unsigned value that fits in the saturating VT.
EVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT();
diff --git a/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll
index 8f3100c82772ba..ad062da5491da4 100644
--- a/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -525,12 +525,10 @@ define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind {
define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind {
; CHECK-LABEL: fdiv_pow_shl_cnt:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8 // =0x8
-; CHECK-NEXT: and x9, x0, #0x1f
-; CHECK-NEXT: fmov s1, #-0.50000000
-; CHECK-NEXT: lsl x8, x8, x9
-; CHECK-NEXT: scvtf s0, x8
-; CHECK-NEXT: fdiv s0, s1, s0
+; CHECK-NEXT: mov w8, #-1115684864 // =0xbd800000
+; CHECK-NEXT: and w9, w0, #0x1f
+; CHECK-NEXT: sub w8, w8, w9, lsl #23
+; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: ret
%cnt = and i64 %cnt_in, 31
%shl = shl i64 8, %cnt
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 8ec7dfd93cd098..c0b5818340c826 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -32379,7 +32379,7 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_i16_to_bf16:
@@ -32387,7 +32387,7 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_i16_to_bf16:
@@ -32455,8 +32455,8 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v2i16_to_v2bf16:
@@ -32466,8 +32466,8 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v2i16_to_v2bf16:
@@ -32566,9 +32566,9 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v3i16_to_v3bf16:
@@ -32580,9 +32580,9 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v3i16_to_v3bf16:
@@ -32682,10 +32682,10 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v4i16_to_v4bf16:
@@ -32699,10 +32699,10 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_u32_e32 v3, v3
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v4i16_to_v4bf16:
@@ -32857,14 +32857,14 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_i32_to_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_i32_to_bf16:
@@ -32928,8 +32928,8 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v2i32_to_v2bf16:
@@ -32937,8 +32937,8 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v2i32_to_v2bf16:
@@ -33031,9 +33031,9 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v3i32_to_v3bf16:
@@ -33042,9 +33042,9 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v3i32_to_v3bf16:
@@ -33140,10 +33140,10 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_v4i32_to_v4bf16:
@@ -33153,10 +33153,10 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; GFX7-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_u32_e32 v3, v3
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0x7fff0000, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_v4i32_to_v4bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
index b261b3129f3fe1..7c5f6d5e33efe7 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -2532,58 +2532,16 @@ define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind {
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 31, v0
-; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 8
-; VI-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; VI-NEXT: v_ffbh_i32_e32 v3, v1
-; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2
-; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v3
-; VI-NEXT: v_min_u32_e32 v2, v3, v2
-; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
-; VI-NEXT: v_min_u32_e32 v0, 1, v0
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: v_cvt_f32_i32_e32 v0, v0
-; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v2
-; VI-NEXT: v_ldexp_f32 v0, v0, v1
-; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -0.5
-; VI-NEXT: v_div_scale_f32 v2, vcc, -0.5, v0, -0.5
-; VI-NEXT: v_rcp_f32_e32 v3, v1
-; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
-; VI-NEXT: v_fma_f32 v3, v4, v3, v3
-; VI-NEXT: v_mul_f32_e32 v4, v2, v3
-; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
-; VI-NEXT: v_fma_f32 v4, v5, v3, v4
-; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
-; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
-; VI-NEXT: v_div_fixup_f32 v0, v1, v0, -0.5
+; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
+; VI-NEXT: v_sub_u32_e32 v0, vcc, 0xbd800000, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow_shl_cnt:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 31, v0
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 8
-; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; GFX10-NEXT: v_ffbh_i32_e32 v3, v1
-; GFX10-NEXT: v_add_nc_u32_e32 v2, 32, v2
-; GFX10-NEXT: v_add_nc_u32_e32 v3, -1, v3
-; GFX10-NEXT: v_min_u32_e32 v2, v3, v2
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
-; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2
-; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX10-NEXT: v_div_scale_f32 v1, s4, v0, v0, -0.5
-; GFX10-NEXT: v_rcp_f32_e32 v2, v1
-; GFX10-NEXT: v_fma_f32 v3, -v1, v2, 1.0
-; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v2
-; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, -0.5, v0, -0.5
-; GFX10-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX10-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v2
-; GFX10-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX10-NEXT: v_div_fixup_f32 v0, v1, v0, -0.5
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 23, v0
+; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0xbd800000, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow_shl_cnt:
@@ -2591,39 +2549,8 @@ define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 31, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 8
-; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cls_i32_e32 v3, v1
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 32, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, -1, v3
-; GFX11-NEXT: v_min_u32_e32 v2, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
-; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v2
-; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, -0.5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_rcp_f32_e32 v2, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0
-; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2
-; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, -0.5, v0, -0.5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX11-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v2
-; GFX11-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, -0.5
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 23, v0
+; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0xbd800000, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cnt = and i64 %cnt_in, 31
%shl = shl i64 8, %cnt
diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
index 2001fddfaac401..5f326b6d6998fb 100644
--- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -1368,49 +1368,21 @@ define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind {
define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind {
; CHECK-SSE-LABEL: fdiv_pow_shl_cnt:
; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: movq %rdi, %rcx
-; CHECK-SSE-NEXT: andb $31, %cl
-; CHECK-SSE-NEXT: movl $8, %eax
-; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-SSE-NEXT: shlq %cl, %rax
-; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
-; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [-5.0E-1,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-SSE-NEXT: divss %xmm1, %xmm0
+; CHECK-SSE-NEXT: andl $31, %edi
+; CHECK-SSE-NEXT: shll $23, %edi
+; CHECK-SSE-NEXT: movl $-1115684864, %eax # imm = 0xBD800000
+; CHECK-SSE-NEXT: subl %edi, %eax
+; CHECK-SSE-NEXT: movd %eax, %xmm0
; CHECK-SSE-NEXT: retq
;
-; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: movq %rdi, %rcx
-; CHECK-AVX2-NEXT: andb $31, %cl
-; CHECK-AVX2-NEXT: movl $8, %eax
-; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-AVX2-NEXT: shlq %cl, %rax
-; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [-5.0E-1,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx
-; CHECK-NO-FASTFMA-NEXT: andb $31, %cl
-; CHECK-NO-FASTFMA-NEXT: movl $8, %eax
-; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax
-; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [-5.0E-1,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fdiv_pow_shl_cnt:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: andb $31, %dil
-; CHECK-FMA-NEXT: movl $8, %eax
-; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax
-; CHECK-FMA-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [-5.0E-1,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-AVX-LABEL: fdiv_pow_shl_cnt:
+; CHECK-AVX: # %bb.0:
+; CHECK-AVX-NEXT: andl $31, %edi
+; CHECK-AVX-NEXT: shll $23, %edi
+; CHECK-AVX-NEXT: movl $-1115684864, %eax # imm = 0xBD800000
+; CHECK-AVX-NEXT: subl %edi, %eax
+; CHECK-AVX-NEXT: vmovd %eax, %xmm0
+; CHECK-AVX-NEXT: retq
%cnt = and i64 %cnt_in, 31
%shl = shl i64 8, %cnt
%conv = sitofp i64 %shl to float
More information about the llvm-commits
mailing list