[llvm] [SDAG] Handle insert_subvector in isKnownNeverNaN (PR #131989)
Jim Lin via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 21 06:25:02 PDT 2025
https://github.com/tclin914 updated https://github.com/llvm/llvm-project/pull/131989
>From 7e13154d68088443d196963f4d0fe3aed92f3018 Mon Sep 17 00:00:00 2001
From: Jim Lin <jim at andestech.com>
Date: Wed, 19 Mar 2025 14:45:30 +0800
Subject: [PATCH 1/7] [SDAG] Pre-commit tests for insert_subvector in isKnownNeverNaN
---
.../RISCV/rvv/fixed-vectors-fmaximum.ll | 51 +++++++++++++++++++
.../RISCV/rvv/fixed-vectors-fminimum.ll | 51 +++++++++++++++++++
2 files changed, 102 insertions(+)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
index 04e73ac1ea956..ef81f0d18c8cc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
@@ -357,3 +357,54 @@ define <2 x half> @vfmax_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) {
%v = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %c)
ret <2 x half> %v
}
+
+declare <4 x half> @llvm.vector.insert.v2f32.v4f32(<4 x half>, <2 x half>, i64)
+
+define <4 x half> @vfmax_v2f16_vv_nnan_insert_subvector(<2 x half> %a, <2 x half> %b, <4 x half> %c) {
+; ZVFH-LABEL: vfmax_v2f16_vv_nnan_insert_subvector:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFH-NEXT: vfadd.vv v8, v8, v8
+; ZVFH-NEXT: vfadd.vv v9, v9, v9
+; ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFH-NEXT: vslideup.vi v8, v9, 2
+; ZVFH-NEXT: vmfeq.vv v0, v8, v8
+; ZVFH-NEXT: vmerge.vvm v9, v8, v10, v0
+; ZVFH-NEXT: vmfeq.vv v0, v10, v10
+; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0
+; ZVFH-NEXT: vfmax.vv v8, v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfmax_v2f16_vv_nnan_insert_subvector:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v11, v11
+; ZVFHMIN-NEXT: vfadd.vv v8, v8, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v11, v9
+; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8
+; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vslideup.vi v11, v9, 2
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v11
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v8, v0
+; ZVFHMIN-NEXT: vfmax.vv v9, v8, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+ %d = fadd nnan <2 x half> %a, %a
+ %e = fadd nnan <2 x half> %b, %b
+ %f = call <4 x half> @llvm.vector.insert.v2f32.v4f32(<4 x half> undef, <2 x half> %d, i64 0)
+ %g = call <4 x half> @llvm.vector.insert.v2f32.v4f32(<4 x half> %f, <2 x half> %e, i64 2)
+ %v = call <4 x half> @llvm.maximum.v4f16(<4 x half> %g, <4 x half> %c)
+ ret <4 x half> %v
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
index a0334a9a5d20a..8504c51901a77 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
@@ -357,3 +357,54 @@ define <2 x half> @vfmin_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) {
%v = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %c)
ret <2 x half> %v
}
+
+declare <4 x half> @llvm.vector.insert.v2f32.v4f32(<4 x half>, <2 x half>, i64)
+
+define <4 x half> @vfmin_v2f16_vv_nnan_insert_subvector(<2 x half> %a, <2 x half> %b, <4 x half> %c) {
+; ZVFH-LABEL: vfmin_v2f16_vv_nnan_insert_subvector:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFH-NEXT: vfadd.vv v8, v8, v8
+; ZVFH-NEXT: vfadd.vv v9, v9, v9
+; ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFH-NEXT: vslideup.vi v8, v9, 2
+; ZVFH-NEXT: vmfeq.vv v0, v8, v8
+; ZVFH-NEXT: vmerge.vvm v9, v8, v10, v0
+; ZVFH-NEXT: vmfeq.vv v0, v10, v10
+; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0
+; ZVFH-NEXT: vfmin.vv v8, v8, v9
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfmin_v2f16_vv_nnan_insert_subvector:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vfadd.vv v9, v11, v11
+; ZVFHMIN-NEXT: vfadd.vv v8, v8, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v11, v9
+; ZVFHMIN-NEXT: vfncvt.f.f.w v9, v8
+; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vslideup.vi v11, v9, 2
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v11
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vmerge.vvm v10, v8, v9, v0
+; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9
+; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v8, v0
+; ZVFHMIN-NEXT: vfmin.vv v9, v8, v10
+; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+ %d = fadd nnan <2 x half> %a, %a
+ %e = fadd nnan <2 x half> %b, %b
+ %f = call <4 x half> @llvm.vector.insert.v2f32.v4f32(<4 x half> undef, <2 x half> %d, i64 0)
+ %g = call <4 x half> @llvm.vector.insert.v2f32.v4f32(<4 x half> %f, <2 x half> %e, i64 2)
+ %v = call <4 x half> @llvm.minimum.v4f16(<4 x half> %g, <4 x half> %c)
+ ret <4 x half> %v
+}
>From 68466618ff6aad1982aa8b3f9d53e0017b51a504 Mon Sep 17 00:00:00 2001
From: Jim Lin <jim at andestech.com>
Date: Wed, 19 Mar 2025 14:30:55 +0800
Subject: [PATCH 2/7] [SDAG] Handle insert_subvector in isKnownNeverNaN
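Propagate the known-never-NaN property across insert_subvector: the result is
known never NaN when both the base vector and the inserted subvector are known
never NaN. Undef operands are also treated as never NaN.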
---
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 6 ++
llvm/test/CodeGen/AMDGPU/clamp.ll | 34 ++++++------
llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll | 55 -------------------
.../RISCV/rvv/fixed-vectors-fmaximum.ll | 6 +-
.../RISCV/rvv/fixed-vectors-fminimum.ll | 6 +-
5 files changed, 27 insertions(+), 80 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index f502a536d43b1..f75a08c514f4b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5632,6 +5632,9 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, const APInt &DemandedElts,
(SNaN && !C->getValueAPF().isSignaling());
}
+ if (Op.isUndef())
+ return true;
+
unsigned Opcode = Op.getOpcode();
switch (Opcode) {
case ISD::FADD:
@@ -5752,6 +5755,9 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, const APInt &DemandedElts,
}
return isKnownNeverNaN(Src, SNaN, Depth + 1);
}
+ case ISD::INSERT_SUBVECTOR:
+ return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
+ isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
case ISD::BUILD_VECTOR: {
unsigned NumElts = Op.getNumOperands();
for (unsigned I = 0; I != NumElts; ++I)
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index 17c84d7371de1..a08228fc919b8 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -2986,14 +2986,14 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_max_f32_e32 v2, 0x7fc00000, v2
-; GFX6-NEXT: v_med3_f32 v3, v3, 0, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_max_f32_e32 v3, 0, v3
+; GFX6-NEXT: v_max_f32_e32 v2, s0, v2
+; GFX6-NEXT: v_min_f32_e32 v3, s0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -3006,20 +3006,20 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
; GFX8-NEXT: v_max_f16_e32 v2, 0, v2
-; GFX8-NEXT: v_max_f16_e32 v3, 0x7e00, v3
+; GFX8-NEXT: v_max_f16_e32 v3, s0, v3
; GFX8-NEXT: v_min_f16_e32 v3, 1.0, v3
; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
@@ -3747,16 +3747,16 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3
+; GFX6-NEXT: v_max_f32_e32 v3, s0, v3
+; GFX6-NEXT: v_max_f32_e32 v2, 0, v2
; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_med3_f32 v2, v2, 0, v4
+; GFX6-NEXT: v_min_f32_e32 v2, s0, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
@@ -3779,9 +3779,9 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_max_f16_e32 v2, 0x7e00, v2
+; GFX8-NEXT: v_max_f16_e32 v2, s0, v2
; GFX8-NEXT: v_max_f16_e32 v3, 0, v3
-; GFX8-NEXT: v_min_f16_e32 v3, 0x7e00, v3
+; GFX8-NEXT: v_min_f16_e32 v3, s0, v3
; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -3845,14 +3845,14 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_max_f32_e32 v2, 0x7fc00000, v2
-; GFX6-NEXT: v_med3_f32 v3, v3, 0, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_max_f32_e32 v3, 0, v3
+; GFX6-NEXT: v_max_f32_e32 v2, s0, v2
+; GFX6-NEXT: v_min_f32_e32 v3, s0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -3865,20 +3865,20 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
; GFX8-NEXT: v_max_f16_e32 v2, 0, v2
-; GFX8-NEXT: v_max_f16_e32 v3, 0x7e00, v3
+; GFX8-NEXT: v_max_f16_e32 v3, s0, v3
; GFX8-NEXT: v_min_f16_e32 v3, 1.0, v3
; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index ef325da272005..8ba215c864841 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -1057,13 +1057,6 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
-; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, 0
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v5, v6 op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; SDAG-GFX1100-TRUE16-NEXT: v_pk_max_f16 v1, v1, v1 clamp
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
@@ -1077,34 +1070,6 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-FAKE16-NEXT: v_pack_b32_f16 v1, v1, 0
-; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-FAKE16-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v3
-; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
-;
-; SDAG-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt:
-; SDAG-GFX900: ; %bb.0:
-; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX900-NEXT: v_pack_b32_f16 v1, v1, 0
-; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; SDAG-GFX900-NEXT: v_mov_b32_e32 v0, v3
-; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; SDAG-GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt:
-; SDAG-GFX906: ; %bb.0:
-; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; SDAG-GFX906-NEXT: v_pack_b32_f16 v1, v1, 0
-; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; SDAG-GFX906-NEXT: v_mov_b32_e32 v0, v3
-; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-VI-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; SDAG-VI: ; %bb.0:
@@ -1178,26 +1143,6 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, v6
; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt:
-; GISEL-GFX900: ; %bb.0:
-; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX900-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31]
-;
-; GISEL-GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt:
-; GISEL-GFX906: ; %bb.0:
-; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GISEL-GFX906-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
-;
; GISEL-VI-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
index ef81f0d18c8cc..c6cd366497218 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll
@@ -367,12 +367,10 @@ define <4 x half> @vfmax_v2f16_vv_nnan_insert_subvector(<2 x half> %a, <2 x half
; ZVFH-NEXT: vfadd.vv v8, v8, v8
; ZVFH-NEXT: vfadd.vv v9, v9, v9
; ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; ZVFH-NEXT: vslideup.vi v8, v9, 2
-; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmerge.vvm v9, v8, v10, v0
; ZVFH-NEXT: vmfeq.vv v0, v10, v10
+; ZVFH-NEXT: vslideup.vi v8, v9, 2
; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0
-; ZVFH-NEXT: vfmax.vv v8, v8, v9
+; ZVFH-NEXT: vfmax.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmax_v2f16_vv_nnan_insert_subvector:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
index 8504c51901a77..568923db83591 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll
@@ -367,12 +367,10 @@ define <4 x half> @vfmin_v2f16_vv_nnan_insert_subvector(<2 x half> %a, <2 x half
; ZVFH-NEXT: vfadd.vv v8, v8, v8
; ZVFH-NEXT: vfadd.vv v9, v9, v9
; ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; ZVFH-NEXT: vslideup.vi v8, v9, 2
-; ZVFH-NEXT: vmfeq.vv v0, v8, v8
-; ZVFH-NEXT: vmerge.vvm v9, v8, v10, v0
; ZVFH-NEXT: vmfeq.vv v0, v10, v10
+; ZVFH-NEXT: vslideup.vi v8, v9, 2
; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0
-; ZVFH-NEXT: vfmin.vv v8, v8, v9
+; ZVFH-NEXT: vfmin.vv v8, v8, v10
; ZVFH-NEXT: ret
;
; ZVFHMIN-LABEL: vfmin_v2f16_vv_nnan_insert_subvector:
>From 620794bf3e104ede8b368e23cbc86e30b25158b4 Mon Sep 17 00:00:00 2001
From: Jim Lin <jim at andestech.com>
Date: Mon, 21 Apr 2025 10:29:27 +0800
Subject: [PATCH 3/7] Adjust DemandedElts
---
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 33 +++++++++--
llvm/test/CodeGen/AMDGPU/clamp.ll | 34 ++++++------
llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll | 55 +++++++++++++++++++
3 files changed, 99 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index f75a08c514f4b..71aef0a982497 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5632,9 +5632,6 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, const APInt &DemandedElts,
(SNaN && !C->getValueAPF().isSignaling());
}
- if (Op.isUndef())
- return true;
-
unsigned Opcode = Op.getOpcode();
switch (Opcode) {
case ISD::FADD:
@@ -5755,9 +5752,33 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, const APInt &DemandedElts,
}
return isKnownNeverNaN(Src, SNaN, Depth + 1);
}
- case ISD::INSERT_SUBVECTOR:
- return isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
- isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
+ case ISD::INSERT_SUBVECTOR: {
+ SDValue BaseVector = Op.getOperand(0);
+ SDValue SubVector = Op.getOperand(1);
+ EVT BaseVectorVT = BaseVector.getValueType();
+ if (BaseVectorVT.isFixedLengthVector()) {
+ unsigned Idx = Op.getConstantOperandVal(2);
+ unsigned NumBaseVectorElts = BaseVectorVT.getVectorNumElements();
+ unsigned NumSubVectorElts =
+ SubVector.getValueType().getVectorNumElements();
+
+ // Clear the bits at the position where the subvector will be inserted.
+ APInt DemandedMask = APInt::getAllOnes(NumSubVectorElts)
+ .zext(NumBaseVectorElts)
+ .shl(Idx)
+ .reverseBits();
+ APInt DemandedSrcElts = DemandedElts & DemandedMask;
+
+ // If DemandedSrcElts is zero, we only need to check that the subvector is
+ // never NaN.
+ if (DemandedSrcElts.isZero())
+ return isKnownNeverNaN(SubVector, SNaN, Depth + 1);
+ return isKnownNeverNaN(BaseVector, DemandedSrcElts, SNaN, Depth + 1) &&
+ isKnownNeverNaN(SubVector, SNaN, Depth + 1);
+ }
+ return isKnownNeverNaN(BaseVector, SNaN, Depth + 1) &&
+ isKnownNeverNaN(SubVector, SNaN, Depth + 1);
+ }
case ISD::BUILD_VECTOR: {
unsigned NumElts = Op.getNumOperands();
for (unsigned I = 0; I != NumElts; ++I)
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index a08228fc919b8..17c84d7371de1 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -2986,14 +2986,14 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_max_f32_e32 v3, 0, v3
-; GFX6-NEXT: v_max_f32_e32 v2, s0, v2
-; GFX6-NEXT: v_min_f32_e32 v3, s0, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_max_f32_e32 v2, 0x7fc00000, v2
+; GFX6-NEXT: v_med3_f32 v3, v3, 0, v4
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -3006,20 +3006,20 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
; GFX8-NEXT: v_max_f16_e32 v2, 0, v2
-; GFX8-NEXT: v_max_f16_e32 v3, s0, v3
+; GFX8-NEXT: v_max_f16_e32 v3, 0x7e00, v3
; GFX8-NEXT: v_min_f16_e32 v3, 1.0, v3
; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
@@ -3747,16 +3747,16 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_max_f32_e32 v3, s0, v3
-; GFX6-NEXT: v_max_f32_e32 v2, 0, v2
+; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3
; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_min_f32_e32 v2, s0, v2
+; GFX6-NEXT: v_med3_f32 v2, v2, 0, v4
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
@@ -3779,9 +3779,9 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
-; GFX8-NEXT: v_max_f16_e32 v2, s0, v2
+; GFX8-NEXT: v_max_f16_e32 v2, 0x7e00, v2
; GFX8-NEXT: v_max_f16_e32 v3, 0, v3
-; GFX8-NEXT: v_min_f16_e32 v3, s0, v3
+; GFX8-NEXT: v_min_f16_e32 v3, 0x7e00, v3
; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -3845,14 +3845,14 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_max_f32_e32 v3, 0, v3
-; GFX6-NEXT: v_max_f32_e32 v2, s0, v2
-; GFX6-NEXT: v_min_f32_e32 v3, s0, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_max_f32_e32 v2, 0x7fc00000, v2
+; GFX6-NEXT: v_med3_f32 v3, v3, 0, v4
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -3865,20 +3865,20 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT: v_max_f16_e32 v3, v3, v3
; GFX8-NEXT: v_max_f16_e32 v2, 0, v2
-; GFX8-NEXT: v_max_f16_e32 v3, s0, v3
+; GFX8-NEXT: v_max_f16_e32 v3, 0x7e00, v3
; GFX8-NEXT: v_min_f16_e32 v3, 1.0, v3
; GFX8-NEXT: v_min_f16_sdwa v2, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index 8ba215c864841..ef325da272005 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -1057,6 +1057,13 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; SDAG-GFX1100-TRUE16: ; %bb.0:
; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
+; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, 0
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v5, v6 op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; SDAG-GFX1100-TRUE16-NEXT: v_pk_max_f16 v1, v1, v1 clamp
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
@@ -1070,6 +1077,34 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-FAKE16-NEXT: v_pack_b32_f16 v1, v1, 0
+; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-FAKE16-NEXT: v_pk_max_f16 v1, v1, v1 clamp
+; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt:
+; SDAG-GFX900: ; %bb.0:
+; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX900-NEXT: v_pack_b32_f16 v1, v1, 0
+; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v1, v1 clamp
+; SDAG-GFX900-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt:
+; SDAG-GFX906: ; %bb.0:
+; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX906-NEXT: v_pack_b32_f16 v1, v1, 0
+; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v1, v1 clamp
+; SDAG-GFX906-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-VI-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; SDAG-VI: ; %bb.0:
@@ -1143,6 +1178,26 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, v6
; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt:
+; GISEL-GFX900: ; %bb.0:
+; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX900-NEXT: v_pk_max_f16 v1, v1, v1 clamp
+; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, v3
+; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt:
+; GISEL-GFX906: ; %bb.0:
+; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX906-NEXT: v_pk_max_f16 v1, v1, v1 clamp
+; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v3
+; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-VI-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
>From b74ac567b7a61a8fdd35ab7d8d5c4a1f5cebde1d Mon Sep 17 00:00:00 2001
From: Jim Lin <jim at andestech.com>
Date: Mon, 21 Apr 2025 17:17:56 +0800
Subject: [PATCH 4/7] Address RKSimon's comments
---
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 36 ++++++++++---------
1 file changed, 19 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 71aef0a982497..49110aee2b205 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5758,23 +5758,25 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, const APInt &DemandedElts,
EVT BaseVectorVT = BaseVector.getValueType();
if (BaseVectorVT.isFixedLengthVector()) {
unsigned Idx = Op.getConstantOperandVal(2);
- unsigned NumBaseVectorElts = BaseVectorVT.getVectorNumElements();
- unsigned NumSubVectorElts =
- SubVector.getValueType().getVectorNumElements();
-
- // Clear the bits at the position where the subvector will be inserted.
- APInt DemandedMask = APInt::getAllOnes(NumSubVectorElts)
- .zext(NumBaseVectorElts)
- .shl(Idx)
- .reverseBits();
- APInt DemandedSrcElts = DemandedElts & DemandedMask;
-
- // If DemandedSrcElts is zero, we only need to check that the subvector is
- // never NaN.
- if (DemandedSrcElts.isZero())
- return isKnownNeverNaN(SubVector, SNaN, Depth + 1);
- return isKnownNeverNaN(BaseVector, DemandedSrcElts, SNaN, Depth + 1) &&
- isKnownNeverNaN(SubVector, SNaN, Depth + 1);
+ unsigned NumBaseElts = BaseVectorVT.getVectorNumElements();
+ unsigned NumSubElts = SubVector.getValueType().getVectorNumElements();
+
+ // Clear/Extract the bits at the position where the subvector will be
+ // inserted.
+ APInt DemandedMask =
+ APInt::getBitsSet(NumBaseElts, Idx, Idx + NumSubElts);
+ APInt DemandedSrcElts = DemandedElts & ~DemandedMask;
+ APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
+
+ if (!DemandedSrcElts.isZero() && !DemandedSubElts.isZero())
+ return isKnownNeverNaN(BaseVector, DemandedSrcElts, SNaN, Depth + 1) &&
+ isKnownNeverNaN(SubVector, DemandedSubElts, SNaN, Depth + 1);
+ else if (!DemandedSrcElts.isZero())
+ return isKnownNeverNaN(BaseVector, DemandedSrcElts, SNaN, Depth + 1);
+ else if (!DemandedSubElts.isZero())
+ return isKnownNeverNaN(SubVector, DemandedSubElts, SNaN, Depth + 1);
+ else
+ return true;
}
return isKnownNeverNaN(BaseVector, SNaN, Depth + 1) &&
isKnownNeverNaN(SubVector, SNaN, Depth + 1);
>From 89759f479d4d217cb2410e88e4a420f58eeaf30b Mon Sep 17 00:00:00 2001
From: Jim Lin <jim at andestech.com>
Date: Mon, 21 Apr 2025 18:06:29 +0800
Subject: [PATCH 5/7] Add braces and remove else
---
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 49110aee2b205..39896b1694976 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5768,15 +5768,14 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, const APInt &DemandedElts,
APInt DemandedSrcElts = DemandedElts & ~DemandedMask;
APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
- if (!DemandedSrcElts.isZero() && !DemandedSubElts.isZero())
+ if (!DemandedSrcElts.isZero() && !DemandedSubElts.isZero()) {
return isKnownNeverNaN(BaseVector, DemandedSrcElts, SNaN, Depth + 1) &&
isKnownNeverNaN(SubVector, DemandedSubElts, SNaN, Depth + 1);
- else if (!DemandedSrcElts.isZero())
+ } else if (!DemandedSrcElts.isZero())
return isKnownNeverNaN(BaseVector, DemandedSrcElts, SNaN, Depth + 1);
else if (!DemandedSubElts.isZero())
return isKnownNeverNaN(SubVector, DemandedSubElts, SNaN, Depth + 1);
- else
- return true;
+ return true;
}
return isKnownNeverNaN(BaseVector, SNaN, Depth + 1) &&
isKnownNeverNaN(SubVector, SNaN, Depth + 1);
>From d6ffb12eaa9d3869b79a15b570e784035bd88fc1 Mon Sep 17 00:00:00 2001
From: Jim Lin <jim at andestech.com>
Date: Mon, 21 Apr 2025 18:22:30 +0800
Subject: [PATCH 6/7] Remove else
---
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 39896b1694976..e1b9f267bb524 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5771,9 +5771,10 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, const APInt &DemandedElts,
if (!DemandedSrcElts.isZero() && !DemandedSubElts.isZero()) {
return isKnownNeverNaN(BaseVector, DemandedSrcElts, SNaN, Depth + 1) &&
isKnownNeverNaN(SubVector, DemandedSubElts, SNaN, Depth + 1);
- } else if (!DemandedSrcElts.isZero())
+ }
+ if (!DemandedSrcElts.isZero())
return isKnownNeverNaN(BaseVector, DemandedSrcElts, SNaN, Depth + 1);
- else if (!DemandedSubElts.isZero())
+ if (!DemandedSubElts.isZero())
return isKnownNeverNaN(SubVector, DemandedSubElts, SNaN, Depth + 1);
return true;
}
>From 58854e62db604ea23dbddc048d2b1c6a1548fe58 Mon Sep 17 00:00:00 2001
From: Jim Lin <jim at andestech.com>
Date: Mon, 21 Apr 2025 21:03:00 +0800
Subject: [PATCH 7/7] Refine the style to avoid if-else chain.
---
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 15 +++++++--------
1 file changed, 7 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index e1b9f267bb524..09d73633462b6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5768,15 +5768,14 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, const APInt &DemandedElts,
APInt DemandedSrcElts = DemandedElts & ~DemandedMask;
APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
- if (!DemandedSrcElts.isZero() && !DemandedSubElts.isZero()) {
- return isKnownNeverNaN(BaseVector, DemandedSrcElts, SNaN, Depth + 1) &&
- isKnownNeverNaN(SubVector, DemandedSubElts, SNaN, Depth + 1);
- }
+ bool NeverNaN = true;
if (!DemandedSrcElts.isZero())
- return isKnownNeverNaN(BaseVector, DemandedSrcElts, SNaN, Depth + 1);
- if (!DemandedSubElts.isZero())
- return isKnownNeverNaN(SubVector, DemandedSubElts, SNaN, Depth + 1);
- return true;
+ NeverNaN &=
+ isKnownNeverNaN(BaseVector, DemandedSrcElts, SNaN, Depth + 1);
+ if (NeverNaN && !DemandedSubElts.isZero())
+ NeverNaN &=
+ isKnownNeverNaN(SubVector, DemandedSubElts, SNaN, Depth + 1);
+ return NeverNaN;
}
return isKnownNeverNaN(BaseVector, SNaN, Depth + 1) &&
isKnownNeverNaN(SubVector, SNaN, Depth + 1);
More information about the llvm-commits mailing list