[llvm] [SelectionDAG] Use getAllOnes to evaluate isKnownNeverZero (PR #92923)
via llvm-commits
llvm-commits at lists.llvm.org
Tue May 21 08:13:48 PDT 2024
https://github.com/AtariDreams created https://github.com/llvm/llvm-project/pull/92923
This check was ported from ValueTracking to SelectionDAG.
From 7e2153e3cfc67bd0452e2b008d3ba5dc92184d32 Mon Sep 17 00:00:00 2001
From: Rose <gfunni234 at gmail.com>
Date: Tue, 21 May 2024 10:03:46 -0400
Subject: [PATCH] [SelectionDAG] Use getAllOnes to complete SHL isKnownNeverZero check
This check was ported from ValueTracking to SelectionDAG.
---
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 10 ++-
.../fold-int-pow2-with-fmul-or-fdiv.ll | 8 +-
llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 34 ++++----
llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll | 53 +++++--------
.../AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll | 77 ++-----------------
.../CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll | 4 -
.../X86/fold-int-pow2-with-fmul-or-fdiv.ll | 66 +++-------------
llvm/test/CodeGen/X86/known-pow2.ll | 44 +++++------
8 files changed, 79 insertions(+), 217 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 777bbf071732e..78ba282e20a58 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5485,8 +5485,14 @@ bool SelectionDAG::isKnownNeverZero(SDValue Op, unsigned Depth) const {
return true;
// If max shift cnt of known ones is non-zero, result is non-zero.
APInt MaxCnt = computeKnownBits(Op.getOperand(1), Depth + 1).getMaxValue();
- if (MaxCnt.ult(ValKnown.getBitWidth()) &&
- !ValKnown.One.shl(MaxCnt).isZero())
+ unsigned NumBits = ValKnown.getBitWidth();
+ if (MaxCnt.ult(NumBits) && !ValKnown.One.shl(MaxCnt).isZero())
+ return true;
+
+ // If all of the bits shifted out are known to be zero, and Val is known
+ // non-zero then at least one non-zero bit must remain.
+ if (ValKnown.Zero.lshr(NumBits - MaxCnt)
+ .eq(APInt::getAllOnes(NumBits).lshr(NumBits - MaxCnt)))
return true;
break;
}
diff --git a/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll
index a78addc490086..f34dfbd48a3a6 100644
--- a/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -495,11 +495,9 @@ define <2 x float> @fdiv_pow_shl_cnt_vec_with_expensive_cast(<2 x i64> %cnt) nou
define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind {
; CHECK-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #8 // =0x8
-; CHECK-NEXT: fmov s1, #-9.00000000
-; CHECK-NEXT: lsl x8, x8, x0
-; CHECK-NEXT: ucvtf s0, x8
-; CHECK-NEXT: fdiv s0, s1, s0
+; CHECK-NEXT: mov w8, #-1081081856 // =0xbf900000
+; CHECK-NEXT: sub w8, w8, w0, lsl #23
+; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: ret
%shl = shl i64 8, %cnt
%conv = uitofp i64 %shl to float
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 756b819099682..4314605b2b9f4 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -804,7 +804,6 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_ffbh_u32_e32 v0, v0
-; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -839,8 +838,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: v_ffbh_u32_e32 v0, v0
-; VI-NEXT: v_min_u32_e32 v2, 32, v0
+; VI-NEXT: v_ffbh_u32_e32 v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -850,7 +848,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
-; EG-NEXT: ALU 6, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -863,10 +861,9 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no
; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
-; EG-NEXT: FFBH_UINT * T1.W, PV.W,
-; EG-NEXT: CNDE_INT T0.X, T0.W, literal.x, PV.W,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45)
+; EG-NEXT: FFBH_UINT T0.X, PV.W,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_with_select:
; GFX9-GISEL: ; %bb.0:
@@ -940,7 +937,6 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v1
; SI-NEXT: v_min_u32_e32 v0, v1, v0
-; SI-NEXT: v_min_u32_e32 v0, 64, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -1011,7 +1007,6 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; VI-NEXT: v_min_u32_e32 v0, v0, v3
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_min_u32_e32 v0, 64, v0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -1020,27 +1015,24 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 3 @6
-; EG-NEXT: ALU 15, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 12, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1
; EG-NEXT: VTX_READ_16 T2.X, T0.X, 4, #1
-; EG-NEXT: VTX_READ_16 T3.X, T0.X, 6, #1
-; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
+; EG-NEXT: VTX_READ_16 T3.X, T0.X, 0, #1
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 6, #1
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 15:
-; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
-; EG-NEXT: FFBH_UINT T1.W, PV.W,
-; EG-NEXT: LSHL * T2.W, T3.X, literal.x,
+; EG-NEXT: LSHL T0.W, T1.X, literal.x,
+; EG-NEXT: LSHL * T1.W, T0.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T0.W, T0.W, literal.x, PV.W,
-; EG-NEXT: OR_INT * T1.W, PS, T2.X,
-; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, PV.W, T3.X,
+; EG-NEXT: FFBH_UINT T0.W, PV.W,
+; EG-NEXT: OR_INT * T1.W, T1.W, T2.X,
; EG-NEXT: FFBH_UINT T2.W, PS,
; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 392a44318b0a5..32de87357f86f 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -770,7 +770,6 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_ffbl_b32_e32 v0, v0
-; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -805,8 +804,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: v_ffbl_b32_e32 v0, v0
-; VI-NEXT: v_min_u32_e32 v2, 32, v0
+; VI-NEXT: v_ffbl_b32_e32 v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -816,7 +814,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
-; EG-NEXT: ALU 6, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -829,10 +827,9 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no
; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
-; EG-NEXT: FFBL_INT * T1.W, PV.W,
-; EG-NEXT: CNDE_INT T0.X, T0.W, literal.x, PV.W,
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
-; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45)
+; EG-NEXT: FFBL_INT T0.X, PV.W,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32_with_select:
; GFX9-GISEL: ; %bb.0:
@@ -906,7 +903,6 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
; SI-NEXT: v_ffbl_b32_e32 v0, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; SI-NEXT: v_min_u32_e32 v0, v0, v1
-; SI-NEXT: v_min_u32_e32 v0, 64, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
@@ -977,7 +973,6 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
; VI-NEXT: v_ffbl_b32_e32 v0, v0
; VI-NEXT: v_min_u32_e32 v0, v3, v0
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_min_u32_e32 v0, 64, v0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -986,27 +981,24 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 3 @6
-; EG-NEXT: ALU 15, @15, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 12, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T1.X, T0.X, 6, #1
; EG-NEXT: VTX_READ_16 T2.X, T0.X, 0, #1
-; EG-NEXT: VTX_READ_16 T3.X, T0.X, 2, #1
-; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1
+; EG-NEXT: VTX_READ_16 T3.X, T0.X, 4, #1
+; EG-NEXT: VTX_READ_16 T0.X, T0.X, 2, #1
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 15:
-; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
-; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
-; EG-NEXT: FFBL_INT T1.W, PV.W,
-; EG-NEXT: LSHL * T2.W, T3.X, literal.x,
+; EG-NEXT: LSHL T0.W, T1.X, literal.x,
+; EG-NEXT: LSHL * T1.W, T0.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
-; EG-NEXT: CNDE_INT T0.W, T0.W, literal.x, PV.W,
-; EG-NEXT: OR_INT * T1.W, PS, T2.X,
-; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: OR_INT * T0.W, PV.W, T3.X,
+; EG-NEXT: FFBL_INT T0.W, PV.W,
+; EG-NEXT: OR_INT * T1.W, T1.W, T2.X,
; EG-NEXT: FFBL_INT T2.W, PS,
; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
@@ -1130,7 +1122,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
-; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 6, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -1144,8 +1136,6 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
; EG-NEXT: FFBL_INT * T1.W, PV.W,
-; EG-NEXT: CNDE_INT * T1.W, T0.W, literal.x, PV.W,
-; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, T0.W, literal.x, PV.W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: -1(nan), 2(2.802597e-45)
@@ -1252,7 +1242,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
-; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 6, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -1266,8 +1256,6 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
; EG-NEXT: FFBL_INT * T1.W, PV.W,
-; EG-NEXT: CNDE_INT * T1.W, T0.W, literal.x, PV.W,
-; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, T0.W, literal.x, PV.W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: -1(nan), 2(2.802597e-45)
@@ -1330,7 +1318,6 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_ffbl_b32_e32 v0, v0
-; SI-NEXT: v_min_u32_e32 v0, 32, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
@@ -1368,7 +1355,6 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: v_ffbl_b32_e32 v0, v0
-; VI-NEXT: v_min_u32_e32 v0, 32, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1380,7 +1366,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
-; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -1393,9 +1379,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; EG-NEXT: LSHL * T0.W, T1.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, PV.W, T0.X,
-; EG-NEXT: FFBL_INT * T1.W, PV.W,
-; EG-NEXT: CNDE_INT * T0.W, T0.W, literal.x, PV.W,
-; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
+; EG-NEXT: FFBL_INT * T0.W, PV.W,
; EG-NEXT: SETNE_INT * T1.W, PV.W, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, PV.W, literal.x, T0.W,
@@ -1559,8 +1543,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_e32 v0, v2, v0
-; VI-NEXT: v_or_b32_e32 v2, 0x10000, v0
-; VI-NEXT: v_ffbl_b32_e32 v2, v2
+; VI-NEXT: v_ffbl_b32_e32 v2, v0
; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
index 7c5f6d5e33efe..afa797730294f 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -2249,86 +2249,23 @@ define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind {
; VI-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshlrev_b64 v[0:1], v0, 8
-; VI-NEXT: s_mov_b32 s6, 0xc1100000
-; VI-NEXT: v_ffbh_u32_e32 v2, v1
-; VI-NEXT: v_min_u32_e32 v2, 32, v2
-; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
-; VI-NEXT: v_min_u32_e32 v0, 1, v0
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
-; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v2
-; VI-NEXT: v_ldexp_f32 v0, v0, v1
-; VI-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6
-; VI-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6
-; VI-NEXT: v_rcp_f32_e32 v3, v1
-; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
-; VI-NEXT: v_fma_f32 v3, v4, v3, v3
-; VI-NEXT: v_mul_f32_e32 v4, v2, v3
-; VI-NEXT: v_fma_f32 v5, -v1, v4, v2
-; VI-NEXT: v_fma_f32 v4, v5, v3, v4
-; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
-; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
-; VI-NEXT: v_div_fixup_f32 v0, v1, v0, s6
+; VI-NEXT: v_lshlrev_b32_e32 v0, 23, v0
+; VI-NEXT: v_sub_u32_e32 v0, vcc, 0xbf900000, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], v0, 8
-; GFX10-NEXT: v_ffbh_u32_e32 v2, v1
-; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
-; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
-; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX10-NEXT: v_sub_nc_u32_e32 v1, 32, v2
-; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX10-NEXT: v_div_scale_f32 v1, s4, v0, v0, 0xc1100000
-; GFX10-NEXT: v_rcp_f32_e32 v2, v1
-; GFX10-NEXT: v_fma_f32 v3, -v1, v2, 1.0
-; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v2
-; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, 0xc1100000, v0, 0xc1100000
-; GFX10-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX10-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v2
-; GFX10-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX10-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX10-NEXT: v_div_fixup_f32 v0, v1, v0, 0xc1100000
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 23, v0
+; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0xbf900000, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], v0, 8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_clz_i32_u32_e32 v2, v1
-; GFX11-NEXT: v_min_u32_e32 v2, 32, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
-; GFX11-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX11-NEXT: v_sub_nc_u32_e32 v1, 32, v2
-; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
-; GFX11-NEXT: v_div_scale_f32 v1, null, v0, v0, 0xc1100000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_rcp_f32_e32 v2, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_f32 v3, -v1, v2, 1.0
-; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v2
-; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, 0xc1100000, v0, 0xc1100000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX11-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v2
-; GFX11-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX11-NEXT: v_div_fixup_f32 v0, v1, v0, 0xc1100000
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 23, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_sub_nc_u32_e32 v0, 0xbf900000, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%shl = shl i64 8, %cnt
%conv = uitofp i64 %shl to float
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
index 9d6e0927b0dfd..d86c23e98c19f 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll
@@ -119,7 +119,6 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace
; GFX9-NEXT: v_ffbh_u32_e32 v0, v0
; GFX9-NEXT: v_add_u32_e64 v2, v2, 32 clamp
; GFX9-NEXT: v_min_u32_e32 v0, v2, v0
-; GFX9-NEXT: v_min_u32_e32 v0, 64, v0
; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -156,7 +155,6 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace
; GFX10-NEXT: v_ffbh_u32_e32 v0, v0
; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp
; GFX10-NEXT: v_min_u32_e32 v0, v2, v0
-; GFX10-NEXT: v_min_u32_e32 v0, 64, v0
; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %arrayidx, align 1
@@ -281,7 +279,6 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace
; GFX9-NEXT: v_ffbl_b32_e32 v2, v2
; GFX9-NEXT: v_add_u32_e64 v0, v0, 32 clamp
; GFX9-NEXT: v_min_u32_e32 v0, v0, v2
-; GFX9-NEXT: v_min_u32_e32 v0, 64, v0
; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -320,7 +317,6 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace
; GFX10-NEXT: v_ffbl_b32_e32 v2, v2
; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp
; GFX10-NEXT: v_min_u32_e32 v0, v0, v2
-; GFX10-NEXT: v_min_u32_e32 v0, 64, v0
; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %arrayidx, align 1
diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
index 96b2e1ef98276..ecce5af76398e 100644
--- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -1221,63 +1221,19 @@ define <2 x float> @fdiv_pow_shl_cnt_vec_with_expensive_cast(<2 x i64> %cnt) nou
define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind {
; CHECK-SSE-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
; CHECK-SSE: # %bb.0:
-; CHECK-SSE-NEXT: movq %rdi, %rcx
-; CHECK-SSE-NEXT: movl $8, %eax
-; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-SSE-NEXT: shlq %cl, %rax
-; CHECK-SSE-NEXT: testq %rax, %rax
-; CHECK-SSE-NEXT: js .LBB22_1
-; CHECK-SSE-NEXT: # %bb.2:
-; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
-; CHECK-SSE-NEXT: jmp .LBB22_3
-; CHECK-SSE-NEXT: .LBB22_1:
-; CHECK-SSE-NEXT: shrq %rax
-; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
-; CHECK-SSE-NEXT: addss %xmm1, %xmm1
-; CHECK-SSE-NEXT: .LBB22_3:
-; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-SSE-NEXT: divss %xmm1, %xmm0
+; CHECK-SSE-NEXT: shll $23, %edi
+; CHECK-SSE-NEXT: movl $-1081081856, %eax # imm = 0xBF900000
+; CHECK-SSE-NEXT: subl %edi, %eax
+; CHECK-SSE-NEXT: movd %eax, %xmm0
; CHECK-SSE-NEXT: retq
;
-; CHECK-AVX2-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
-; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: movq %rdi, %rcx
-; CHECK-AVX2-NEXT: movl $8, %eax
-; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-AVX2-NEXT: shlq %cl, %rax
-; CHECK-AVX2-NEXT: testq %rax, %rax
-; CHECK-AVX2-NEXT: js .LBB22_1
-; CHECK-AVX2-NEXT: # %bb.2:
-; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: jmp .LBB22_3
-; CHECK-AVX2-NEXT: .LBB22_1:
-; CHECK-AVX2-NEXT: shrq %rax
-; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: .LBB22_3:
-; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-AVX2-NEXT: retq
-;
-; CHECK-NO-FASTFMA-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
-; CHECK-NO-FASTFMA: # %bb.0:
-; CHECK-NO-FASTFMA-NEXT: movq %rdi, %rcx
-; CHECK-NO-FASTFMA-NEXT: movl $8, %eax
-; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax
-; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %rax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-NO-FASTFMA-NEXT: retq
-;
-; CHECK-FMA-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
-; CHECK-FMA: # %bb.0:
-; CHECK-FMA-NEXT: movl $8, %eax
-; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax
-; CHECK-FMA-NEXT: vcvtusi2ss %rax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
-; CHECK-FMA-NEXT: retq
+; CHECK-AVX-LABEL: fdiv_pow_shl_cnt_fail_maybe_z:
+; CHECK-AVX: # %bb.0:
+; CHECK-AVX-NEXT: shll $23, %edi
+; CHECK-AVX-NEXT: movl $-1081081856, %eax # imm = 0xBF900000
+; CHECK-AVX-NEXT: subl %edi, %eax
+; CHECK-AVX-NEXT: vmovd %eax, %xmm0
+; CHECK-AVX-NEXT: retq
%shl = shl i64 8, %cnt
%conv = uitofp i64 %shl to float
%mul = fdiv float -9.000000e+00, %conv
diff --git a/llvm/test/CodeGen/X86/known-pow2.ll b/llvm/test/CodeGen/X86/known-pow2.ll
index e183bbc15617d..97e90d053101e 100644
--- a/llvm/test/CodeGen/X86/known-pow2.ll
+++ b/llvm/test/CodeGen/X86/known-pow2.ll
@@ -83,11 +83,12 @@ define i1 @pow2_shl_fail1(i32 %x, i32 %y) {
; CHECK-LABEL: pow2_shl_fail1:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %esi, %ecx
-; CHECK-NEXT: notl %edi
+; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: shrl %cl, %edi
-; CHECK-NEXT: testb $4, %dil
-; CHECK-NEXT: sete %al
+; CHECK-NEXT: shrl %cl, %eax
+; CHECK-NEXT: andl $4, %eax
+; CHECK-NEXT: shrl $2, %eax
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
; CHECK-NEXT: retq
%d = shl i32 4, %y
%and = and i32 %x, %d
@@ -282,9 +283,8 @@ define i1 @pow2_umin_fail0(i32 %x, i32 %y) {
; CHECK-NEXT: cmpl $262144, %eax # imm = 0x40000
; CHECK-NEXT: movl $262144, %ecx # imm = 0x40000
; CHECK-NEXT: cmovbl %eax, %ecx
-; CHECK-NEXT: notl %edi
-; CHECK-NEXT: testl %edi, %ecx
-; CHECK-NEXT: sete %al
+; CHECK-NEXT: testl %ecx, %edi
+; CHECK-NEXT: setne %al
; CHECK-NEXT: retq
%yy = shl i32 4, %y
%d = call i32 @llvm.umin.i32(i32 %yy, i32 262144)
@@ -373,9 +373,8 @@ define i1 @pow2_umax_fail1(i32 %x, i32 %y, i32 %z) {
; CHECK-NEXT: shrl %cl, %esi
; CHECK-NEXT: cmpl %esi, %eax
; CHECK-NEXT: cmoval %eax, %esi
-; CHECK-NEXT: notl %edi
-; CHECK-NEXT: testl %edi, %esi
-; CHECK-NEXT: sete %al
+; CHECK-NEXT: testl %esi, %edi
+; CHECK-NEXT: setne %al
; CHECK-NEXT: retq
%yy = shl i32 4, %y
%zz = lshr i32 2147483648, %z
@@ -415,9 +414,8 @@ define i1 @pow2_smin_fail0(i32 %x, i32 %y) {
; CHECK-NEXT: cmpl $262144, %eax # imm = 0x40000
; CHECK-NEXT: movl $262144, %ecx # imm = 0x40000
; CHECK-NEXT: cmovll %eax, %ecx
-; CHECK-NEXT: notl %edi
-; CHECK-NEXT: testl %edi, %ecx
-; CHECK-NEXT: sete %al
+; CHECK-NEXT: testl %ecx, %edi
+; CHECK-NEXT: setne %al
; CHECK-NEXT: retq
%yy = shl i32 4, %y
%d = call i32 @llvm.smin.i32(i32 %yy, i32 262144)
@@ -506,9 +504,8 @@ define i1 @pow2_smax_fail1(i32 %x, i32 %y, i32 %z) {
; CHECK-NEXT: shrl %cl, %esi
; CHECK-NEXT: cmpl %esi, %eax
; CHECK-NEXT: cmovgl %eax, %esi
-; CHECK-NEXT: notl %edi
-; CHECK-NEXT: testl %edi, %esi
-; CHECK-NEXT: sete %al
+; CHECK-NEXT: testl %esi, %edi
+; CHECK-NEXT: setne %al
; CHECK-NEXT: retq
%yy = shl i32 4, %y
%zz = lshr i32 2147483648, %z
@@ -580,9 +577,8 @@ define i1 @pow2_select_fail2(i1 %c, i32 %x, i32 %y, i32 %z) {
; CHECK-NEXT: shrl %cl, %r8d
; CHECK-NEXT: testb $1, %dil
; CHECK-NEXT: cmovnel %edx, %r8d
-; CHECK-NEXT: notl %esi
-; CHECK-NEXT: testl %esi, %r8d
-; CHECK-NEXT: sete %al
+; CHECK-NEXT: testl %r8d, %esi
+; CHECK-NEXT: setne %al
; CHECK-NEXT: retq
%yy = shl i32 4, %y
%zz = lshr i32 2147483648, %z
@@ -743,10 +739,9 @@ define <4 x i1> @pow2_vselect_fail2_ne(<4 x i1> %c, <4 x i32> %x, <4 x i32> %y,
; CHECK-NEXT: pand %xmm0, %xmm2
; CHECK-NEXT: pandn %xmm7, %xmm0
; CHECK-NEXT: por %xmm2, %xmm0
-; CHECK-NEXT: pcmpeqd %xmm2, %xmm2
-; CHECK-NEXT: pand %xmm0, %xmm1
+; CHECK-NEXT: pand %xmm1, %xmm0
+; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-NEXT: pxor %xmm2, %xmm0
; CHECK-NEXT: retq
%yy = shl <4 x i32> <i32 4, i32 4, i32 4, i32 4>, %y
%zz = lshr <4 x i32> <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648>, %z
@@ -787,9 +782,8 @@ define i1 @pow2_and_fail0(i32 %x, i32 %y) {
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: negl %ecx
; CHECK-NEXT: andl %eax, %ecx
-; CHECK-NEXT: notl %edi
-; CHECK-NEXT: testl %edi, %ecx
-; CHECK-NEXT: sete %al
+; CHECK-NEXT: testl %ecx, %edi
+; CHECK-NEXT: setne %al
; CHECK-NEXT: retq
%yy = shl i32 4, %y
%nyy = sub i32 0, %yy
More information about the llvm-commits mailing list