[llvm] AMDGPU: Add baseline tests for fcopysign with 0 magnitude (PR #172698)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 17 09:31:41 PST 2025
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/172698
None
>From cef6dbfb42bae8559c23fcda8fc3d6ca3546302e Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 17 Dec 2025 13:57:04 +0100
Subject: [PATCH] AMDGPU: Add baseline tests for fcopysign with 0 magnitude
---
llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 873 +++++++++++++++++++++
llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 692 ++++++++++++++++
llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll | 129 +++
llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll | 131 ++++
4 files changed, 1825 insertions(+)
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 4ff8bf23638f1..1fb87bf2c80a0 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -8051,4 +8051,877 @@ define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4bf16_sign_v4f64(<4 x bfloat> %m
ret <4 x bfloat> %out
}
+define amdgpu_ps i32 @s_copysign_bf16_0_bf16(bfloat inreg %sign) { ; baseline: copysign(0.0, %sign) with an inreg bfloat sign operand, result returned as i32
+; GCN-LABEL: s_copysign_bf16_0_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: s_copysign_bf16_0_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_copysign_bf16_0_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_and_b32 s0, s0, 0x8000
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_copysign_bf16_0_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_and_b32 s0, s0, 0x8000
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_copysign_bf16_0_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_and_b32 s0, s0, 0x8000
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_copysign_bf16_0_bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_and_b32 s0, s0, 0x8000
+; GFX11-NEXT: ; return to shader part epilog
+ %op = call bfloat @llvm.copysign.bf16(bfloat 0.0, bfloat %sign) ; magnitude is the constant 0.0; %sign supplies the sign
+ %cast = bitcast bfloat %op to i16 ; reinterpret the bfloat result as its 16-bit pattern
+ %zext = zext i16 %cast to i32 ; widen to the i32 return type with zeroed upper bits
+ ret i32 %zext
+}
+
+define bfloat @v_copysign_bf16_0_bf16(bfloat %sign) { ; baseline: copysign(0.0, %sign) with a plain bfloat argument and bfloat return
+; GCN-LABEL: v_copysign_bf16_0_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_copysign_bf16_0_bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v0, 0x80000000, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_copysign_bf16_0_bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_bfi_b32 v0, s4, 0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_bf16_0_bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_copysign_bf16_0_bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, 0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_copysign_bf16_0_bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_copysign_bf16_0_bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, v0
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %op = call bfloat @llvm.copysign.bf16(bfloat 0.0, bfloat %sign) ; magnitude is the constant 0.0; %sign supplies the sign
+ ret bfloat %op
+}
+
+define amdgpu_ps i32 @s_copysign_bf16_0_f32(float inreg %sign) { ; baseline: inreg float sign operand is narrowed to bfloat, then copysign(0.0, ...), result returned as i32
+; GCN-LABEL: s_copysign_bf16_0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: s_copysign_bf16_0_f32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_copysign_bf16_0_f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_bfe_u32 s1, s0, 0x10010
+; GFX8-NEXT: s_add_i32 s1, s1, s0
+; GFX8-NEXT: s_addk_i32 s1, 0x7fff
+; GFX8-NEXT: v_cmp_u_f32_e64 s[2:3], s0, s0
+; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GFX8-NEXT: s_cselect_b32 s0, s0, s1
+; GFX8-NEXT: s_lshr_b32 s0, s0, 16
+; GFX8-NEXT: s_and_b32 s0, s0, 0x8000
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_copysign_bf16_0_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_bfe_u32 s1, s0, 0x10010
+; GFX9-NEXT: s_add_i32 s1, s1, s0
+; GFX9-NEXT: s_addk_i32 s1, 0x7fff
+; GFX9-NEXT: v_cmp_u_f32_e64 s[2:3], s0, s0
+; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GFX9-NEXT: s_cselect_b32 s0, s0, s1
+; GFX9-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-NEXT: s_and_b32 s0, s0, 0x8000
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_copysign_bf16_0_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_bfe_u32 s1, s0, 0x10010
+; GFX10-NEXT: v_cmp_u_f32_e64 s2, s0, s0
+; GFX10-NEXT: s_add_i32 s1, s1, s0
+; GFX10-NEXT: s_addk_i32 s1, 0x7fff
+; GFX10-NEXT: s_and_b32 s2, s2, exec_lo
+; GFX10-NEXT: s_cselect_b32 s0, s0, s1
+; GFX10-NEXT: s_lshr_b32 s0, s0, 16
+; GFX10-NEXT: s_and_b32 s0, s0, 0x8000
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_copysign_bf16_0_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_bfe_u32 s1, s0, 0x10010
+; GFX11-NEXT: v_cmp_u_f32_e64 s2, s0, s0
+; GFX11-NEXT: s_add_i32 s1, s1, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_addk_i32 s1, 0x7fff
+; GFX11-NEXT: s_and_b32 s2, s2, exec_lo
+; GFX11-NEXT: s_cselect_b32 s0, s0, s1
+; GFX11-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, s0, 0x8000
+; GFX11-NEXT: ; return to shader part epilog
+ %sign.trunc = fptrunc float %sign to bfloat ; NOTE(review): the gfx8+ check blocks keep a bfe/add/compare/select sequence ahead of the final 0x8000 mask although only the sign is consumed - presumably the inefficiency this baseline records; confirm
+ %op = call bfloat @llvm.copysign.bf16(bfloat 0.0, bfloat %sign.trunc) ; magnitude is the constant 0.0; the narrowed value supplies the sign
+ %cast = bitcast bfloat %op to i16 ; reinterpret the bfloat result as its 16-bit pattern
+ %zext = zext i16 %cast to i32 ; widen to the i32 return type with zeroed upper bits
+ ret i32 %zext
+}
+
+define bfloat @v_copysign_bf16_0_f32(float %sign) { ; baseline: float sign operand is narrowed to bfloat, then copysign(0.0, ...), bfloat return
+; GCN-LABEL: v_copysign_bf16_0_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_copysign_bf16_0_f32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v0, 0x80000000, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_copysign_bf16_0_f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_bfi_b32 v0, s4, 0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_bf16_0_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_copysign_bf16_0_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, 0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_copysign_bf16_0_f32:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_copysign_bf16_0_f32:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, v0
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %sign.trunc = fptrunc float %sign to bfloat ; narrow the sign source to the bfloat type the intrinsic takes
+ %op = call bfloat @llvm.copysign.bf16(bfloat 0.0, bfloat %sign.trunc) ; magnitude is the constant 0.0; the narrowed value supplies the sign
+ ret bfloat %op
+}
+
+define amdgpu_ps i32 @s_copysign_bf16_0_f64(double inreg %sign) { ; baseline: inreg double sign operand is narrowed to bfloat, then copysign(0.0, ...), result returned as i32
+; GCN-LABEL: s_copysign_bf16_0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: s_copysign_bf16_0_f64:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_copysign_bf16_0_f64:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_cvt_f32_f64_e32 v2, s[0:1]
+; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
+; GFX8-NEXT: v_readfirstlane_b32 s6, v2
+; GFX8-NEXT: s_bitcmp1_b32 s6, 0
+; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, s[0:1], v[0:1]
+; GFX8-NEXT: v_cmp_gt_f64_e64 s[2:3], |s[0:1]|, |v[0:1]|
+; GFX8-NEXT: v_cmp_u_f64_e64 s[0:1], s[0:1], s[0:1]
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GFX8-NEXT: s_cselect_b32 s2, 1, -1
+; GFX8-NEXT: s_add_i32 s7, s6, s2
+; GFX8-NEXT: s_and_b64 s[2:3], s[4:5], exec
+; GFX8-NEXT: s_cselect_b32 s2, s6, s7
+; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10010
+; GFX8-NEXT: s_add_i32 s3, s3, s2
+; GFX8-NEXT: s_addk_i32 s3, 0x7fff
+; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX8-NEXT: s_cselect_b32 s0, s2, s3
+; GFX8-NEXT: s_lshr_b32 s0, s0, 16
+; GFX8-NEXT: s_and_b32 s0, s0, 0x8000
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_copysign_bf16_0_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_cvt_f32_f64_e32 v2, s[0:1]
+; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
+; GFX9-NEXT: v_readfirstlane_b32 s6, v2
+; GFX9-NEXT: s_bitcmp1_b32 s6, 0
+; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_cmp_gt_f64_e64 s[2:3], |s[0:1]|, |v[0:1]|
+; GFX9-NEXT: v_cmp_u_f64_e64 s[0:1], s[0:1], s[0:1]
+; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GFX9-NEXT: s_cselect_b32 s2, 1, -1
+; GFX9-NEXT: s_add_i32 s7, s6, s2
+; GFX9-NEXT: s_and_b64 s[2:3], s[4:5], exec
+; GFX9-NEXT: s_cselect_b32 s2, s6, s7
+; GFX9-NEXT: s_bfe_u32 s3, s2, 0x10010
+; GFX9-NEXT: s_add_i32 s3, s3, s2
+; GFX9-NEXT: s_addk_i32 s3, 0x7fff
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, s2, s3
+; GFX9-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-NEXT: s_and_b32 s0, s0, 0x8000
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_copysign_bf16_0_f64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cvt_f32_f64_e32 v2, s[0:1]
+; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
+; GFX10-NEXT: v_readfirstlane_b32 s3, v2
+; GFX10-NEXT: s_bitcmp1_b32 s3, 0
+; GFX10-NEXT: v_cmp_nlg_f64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX10-NEXT: v_cmp_gt_f64_e64 s2, |s[0:1]|, |v[0:1]|
+; GFX10-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[0:1]
+; GFX10-NEXT: s_cselect_b32 s1, -1, 0
+; GFX10-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX10-NEXT: s_and_b32 s2, s2, exec_lo
+; GFX10-NEXT: s_cselect_b32 s2, 1, -1
+; GFX10-NEXT: s_add_i32 s2, s3, s2
+; GFX10-NEXT: s_and_b32 s1, s1, exec_lo
+; GFX10-NEXT: s_cselect_b32 s1, s3, s2
+; GFX10-NEXT: s_bfe_u32 s2, s1, 0x10010
+; GFX10-NEXT: s_add_i32 s2, s2, s1
+; GFX10-NEXT: s_addk_i32 s2, 0x7fff
+; GFX10-NEXT: s_and_b32 s0, s0, exec_lo
+; GFX10-NEXT: s_cselect_b32 s0, s1, s2
+; GFX10-NEXT: s_lshr_b32 s0, s0, 16
+; GFX10-NEXT: s_and_b32 s0, s0, 0x8000
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_copysign_bf16_0_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cvt_f32_f64_e32 v2, s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
+; GFX11-NEXT: v_cmp_nlg_f64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX11-NEXT: v_cmp_gt_f64_e64 s2, |s[0:1]|, |v[0:1]|
+; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[0:1]
+; GFX11-NEXT: v_readfirstlane_b32 s1, v2
+; GFX11-NEXT: s_bitcmp1_b32 s1, 0
+; GFX11-NEXT: s_cselect_b32 s3, -1, 0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s3, vcc_lo, s3
+; GFX11-NEXT: s_and_b32 s2, s2, exec_lo
+; GFX11-NEXT: s_cselect_b32 s2, 1, -1
+; GFX11-NEXT: s_add_i32 s2, s1, s2
+; GFX11-NEXT: s_and_b32 s3, s3, exec_lo
+; GFX11-NEXT: s_cselect_b32 s1, s1, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_bfe_u32 s2, s1, 0x10010
+; GFX11-NEXT: s_add_i32 s2, s2, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_addk_i32 s2, 0x7fff
+; GFX11-NEXT: s_and_b32 s0, s0, exec_lo
+; GFX11-NEXT: s_cselect_b32 s0, s1, s2
+; GFX11-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, s0, 0x8000
+; GFX11-NEXT: ; return to shader part epilog
+ %sign.trunc = fptrunc double %sign to bfloat ; NOTE(review): the gfx8+ check blocks keep two conversions plus a compare/select chain ahead of the final 0x8000 mask although only the sign is consumed - presumably the inefficiency this baseline records; confirm
+ %op = call bfloat @llvm.copysign.bf16(bfloat 0.0, bfloat %sign.trunc) ; magnitude is the constant 0.0; the narrowed value supplies the sign
+ %cast = bitcast bfloat %op to i16 ; reinterpret the bfloat result as its 16-bit pattern
+ %zext = zext i16 %cast to i32 ; widen to the i32 return type with zeroed upper bits
+ ret i32 %zext
+}
+
+define bfloat @v_copysign_bf16_0_f64(double %sign) { ; baseline: double sign operand is narrowed to bfloat, then copysign(0.0, ...), bfloat return
+; GCN-LABEL: v_copysign_bf16_0_f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_copysign_bf16_0_f64:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v0, 0x80000000, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_copysign_bf16_0_f64:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_bfi_b32 v0, s4, 0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_bf16_0_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_copysign_bf16_0_f64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, 0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_copysign_bf16_0_f64:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_copysign_bf16_0_f64:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, v0
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %sign.trunc = fptrunc double %sign to bfloat ; narrow the sign source to the bfloat type the intrinsic takes
+ %op = call bfloat @llvm.copysign.bf16(bfloat 0.0, bfloat %sign.trunc) ; magnitude is the constant 0.0; the narrowed value supplies the sign
+ ret bfloat %op
+}
+
+define amdgpu_ps i32 @s_copysign_v2bf16_0_v2bf16(<2 x bfloat> inreg %sign) { ; baseline: two-element copysign(zeroinitializer, %sign) with an inreg sign operand, result returned as i32
+; GCN-LABEL: s_copysign_v2bf16_0_v2bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
+; GCN-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; GCN-NEXT: v_and_b32_e32 v0, 0x80008000, v0
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: s_copysign_v2bf16_0_v2bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
+; GFX7-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; GFX7-NEXT: v_and_b32_e32 v0, 0x80008000, v0
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_copysign_v2bf16_0_v2bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_mov_b32 s1, 0x7fff7fff
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_bfi_b32 v0, s1, 0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_copysign_v2bf16_0_v2bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s1, 0x7fff7fff
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_bfi_b32 v0, s1, 0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_copysign_v2bf16_0_v2bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, s0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_copysign_v2bf16_0_v2bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %op = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> %sign) ; both magnitude lanes are zero; %sign supplies each lane's sign
+ %cast = bitcast <2 x bfloat> %op to i32 ; the two 16-bit lanes fill the i32 return value exactly
+ ret i32 %cast
+}
+
+define <2 x bfloat> @v_copysign_v2bf16_0_v2bf16(<2 x bfloat> %sign) { ; baseline: two-element copysign(zeroinitializer, %sign) with plain vector argument and return
+; GCN-LABEL: v_copysign_v2bf16_0_v2bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_copysign_v2bf16_0_v2bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0x80000000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_copysign_v2bf16_0_v2bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff
+; GFX8-NEXT: v_bfi_b32 v0, s4, 0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_v2bf16_0_v2bf16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
+; GFX9-NEXT: v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_copysign_v2bf16_0_v2bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_copysign_v2bf16_0_v2bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> %sign) ; both magnitude lanes are zero; %sign supplies each lane's sign
+ ret <2 x bfloat> %op
+}
+
+define amdgpu_ps i32 @s_copysign_v2bf16_0_v2f32(<2 x float> inreg %sign) { ; baseline: inreg <2 x float> sign operand is narrowed to <2 x bfloat>, then copysign(zeroinitializer, ...), result returned as i32
+; GCN-LABEL: s_copysign_v2bf16_0_v2f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s1
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
+; GCN-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; GCN-NEXT: v_and_b32_e32 v0, 0x80008000, v0
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: s_copysign_v2bf16_0_v2f32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
+; GFX7-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
+; GFX7-NEXT: v_and_b32_e32 v0, 0x80008000, v0
+; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_copysign_v2bf16_0_v2f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_bfe_u32 s2, s0, 0x10010
+; GFX8-NEXT: s_add_i32 s2, s2, s0
+; GFX8-NEXT: s_add_i32 s4, s2, 0x7fff
+; GFX8-NEXT: v_cmp_u_f32_e64 s[2:3], s0, s0
+; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GFX8-NEXT: s_cselect_b32 s0, s0, s4
+; GFX8-NEXT: s_bfe_u32 s2, s1, 0x10010
+; GFX8-NEXT: s_add_i32 s2, s2, s1
+; GFX8-NEXT: s_add_i32 s4, s2, 0x7fff
+; GFX8-NEXT: v_cmp_u_f32_e64 s[2:3], s1, s1
+; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GFX8-NEXT: s_cselect_b32 s1, s1, s4
+; GFX8-NEXT: s_lshr_b32 s1, s1, 16
+; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
+; GFX8-NEXT: s_mov_b32 s1, 0x7fff7fff
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_bfi_b32 v0, s1, 0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_copysign_v2bf16_0_v2f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_bfe_u32 s2, s1, 0x10010
+; GFX9-NEXT: s_add_i32 s2, s2, s1
+; GFX9-NEXT: s_or_b32 s4, s1, 0x400000
+; GFX9-NEXT: s_add_i32 s5, s2, 0x7fff
+; GFX9-NEXT: v_cmp_u_f32_e64 s[2:3], s1, s1
+; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GFX9-NEXT: s_cselect_b32 s1, s4, s5
+; GFX9-NEXT: s_lshr_b32 s2, s1, 16
+; GFX9-NEXT: s_bfe_u32 s1, s0, 0x10010
+; GFX9-NEXT: s_add_i32 s1, s1, s0
+; GFX9-NEXT: s_or_b32 s3, s0, 0x400000
+; GFX9-NEXT: s_add_i32 s4, s1, 0x7fff
+; GFX9-NEXT: v_cmp_u_f32_e64 s[0:1], s0, s0
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, s3, s4
+; GFX9-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GFX9-NEXT: s_mov_b32 s1, 0x7fff7fff
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_bfi_b32 v0, s1, 0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_copysign_v2bf16_0_v2f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_bfe_u32 s2, s1, 0x10010
+; GFX10-NEXT: v_cmp_u_f32_e64 s3, s1, s1
+; GFX10-NEXT: s_add_i32 s2, s2, s1
+; GFX10-NEXT: s_bitset1_b32 s1, 22
+; GFX10-NEXT: s_addk_i32 s2, 0x7fff
+; GFX10-NEXT: s_and_b32 s3, s3, exec_lo
+; GFX10-NEXT: s_cselect_b32 s1, s1, s2
+; GFX10-NEXT: s_bfe_u32 s2, s0, 0x10010
+; GFX10-NEXT: v_cmp_u_f32_e64 s3, s0, s0
+; GFX10-NEXT: s_add_i32 s2, s2, s0
+; GFX10-NEXT: s_lshr_b32 s1, s1, 16
+; GFX10-NEXT: s_bitset1_b32 s0, 22
+; GFX10-NEXT: s_addk_i32 s2, 0x7fff
+; GFX10-NEXT: s_and_b32 s3, s3, exec_lo
+; GFX10-NEXT: s_cselect_b32 s0, s0, s2
+; GFX10-NEXT: s_lshr_b32 s0, s0, 16
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, s0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_copysign_v2bf16_0_v2f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_bfe_u32 s2, s1, 0x10010
+; GFX11-NEXT: v_cmp_u_f32_e64 s3, s1, s1
+; GFX11-NEXT: s_add_i32 s2, s2, s1
+; GFX11-NEXT: s_bitset1_b32 s1, 22
+; GFX11-NEXT: s_addk_i32 s2, 0x7fff
+; GFX11-NEXT: s_and_b32 s3, s3, exec_lo
+; GFX11-NEXT: s_cselect_b32 s1, s1, s2
+; GFX11-NEXT: s_bfe_u32 s2, s0, 0x10010
+; GFX11-NEXT: v_cmp_u_f32_e64 s3, s0, s0
+; GFX11-NEXT: s_add_i32 s2, s2, s0
+; GFX11-NEXT: s_lshr_b32 s1, s1, 16
+; GFX11-NEXT: s_bitset1_b32 s0, 22
+; GFX11-NEXT: s_addk_i32 s2, 0x7fff
+; GFX11-NEXT: s_and_b32 s3, s3, exec_lo
+; GFX11-NEXT: s_cselect_b32 s0, s0, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, s0
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %sign.trunc = fptrunc <2 x float> %sign to <2 x bfloat> ; NOTE(review): the gfx8+ check blocks keep a per-lane bfe/add/compare/select sequence ahead of the final v_bfi_b32 although only the signs are consumed - presumably the inefficiency this baseline records; confirm
+ %op = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> %sign.trunc) ; both magnitude lanes are zero; the narrowed vector supplies each lane's sign
+ %cast = bitcast <2 x bfloat> %op to i32 ; the two 16-bit lanes fill the i32 return value exactly
+ ret i32 %cast
+}
+
+define <2 x bfloat> @v_copysign_v2bf16_0_v2bf32(<2 x float> %sign) { ; baseline: <2 x float> sign operand narrowed to <2 x bfloat>, then copysign(zeroinitializer, ...); NOTE(review): name says v2bf32 but the operand is <2 x float> (the amdgpu_ps sibling is spelled ..._v2f32) - looks like a typo, confirm before renaming
+; GCN-LABEL: v_copysign_v2bf16_0_v2bf32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_copysign_v2bf16_0_v2bf32:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0x80000000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_copysign_v2bf16_0_v2bf32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff
+; GFX8-NEXT: v_bfi_b32 v0, s4, 0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_v2bf16_0_v2bf32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
+; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4
+; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX9-NEXT: s_mov_b32 s4, 0x7060302
+; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
+; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
+; GFX9-NEXT: v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_copysign_v2bf16_0_v2bf32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_copysign_v2bf16_0_v2bf32:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
+; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, v1
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_copysign_v2bf16_0_v2bf32:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, v0
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %sign.trunc = fptrunc <2 x float> %sign to <2 x bfloat> ; NOTE(review): the gfx8+ check blocks keep a per-lane bfe/add/compare/select sequence ahead of the final v_bfi_b32 although only the signs are consumed - presumably the inefficiency this baseline records; confirm
+ %op = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> %sign.trunc) ; both magnitude lanes are zero; the narrowed vector supplies each lane's sign
+ ret <2 x bfloat> %op
+}
+
+define amdgpu_ps i32 @s_copysign_v2bf16_0_v2f64(<2 x double> inreg %sign) { ; baseline: inreg <2 x double> sign operand is narrowed to <2 x bfloat>, then copysign(zeroinitializer, ...), result returned as i32
+; GCN-LABEL: s_copysign_v2bf16_0_v2f64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_and_b32 s0, s1, 0x80000000
+; GCN-NEXT: s_and_b32 s1, s3, 0x80000000
+; GCN-NEXT: s_lshr_b32 s1, s1, 16
+; GCN-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
+; GCN-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: s_copysign_v2bf16_0_v2f64:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_and_b32 s0, s1, 0x80000000
+; GFX7-NEXT: s_and_b32 s1, s3, 0x80000000
+; GFX7-NEXT: s_lshr_b32 s1, s1, 16
+; GFX7-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
+; GFX7-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_copysign_v2bf16_0_v2f64:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_lshl_b32 s0, s3, 16
+; GFX8-NEXT: s_and_b32 s1, s1, 0x8000
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: s_mov_b32 s1, 0x7fff7fff
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_bfi_b32 v0, s1, 0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_copysign_v2bf16_0_v2f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s1, s3
+; GFX9-NEXT: s_mov_b32 s1, 0x7fff7fff
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_bfi_b32 v0, s1, 0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_copysign_v2bf16_0_v2f64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_pack_ll_b32_b16 s0, s1, s3
+; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, s0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_copysign_v2bf16_0_v2f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_pack_ll_b32_b16 s0, s1, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, s0
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %sign.trunc = fptrunc <2 x double> %sign to <2 x bfloat> ; NOTE(review): the gfx8+ check blocks (s_lshl_b32 16 / s_and_b32 0x8000, or s_pack_ll_b32_b16 of s1 and s3) appear to keep the low 16 bits of each high dword, whereas the pre-gfx8 blocks mask bit 31 (0x80000000) - confirm this recorded output is intended
+ %op = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> %sign.trunc) ; both magnitude lanes are zero; the narrowed vector supplies each lane's sign
+ %cast = bitcast <2 x bfloat> %op to i32 ; the two 16-bit lanes fill the i32 return value exactly
+ ret i32 %cast
+}
+
+define <2 x bfloat> @v_copysign_v2bf16_0_v2bf64(<2 x double> %sign) { ; baseline: <2 x double> sign operand narrowed to <2 x bfloat>, then copysign(zeroinitializer, ...); NOTE(review): name says v2bf64 but the operand is <2 x double> (the amdgpu_ps sibling is spelled ..._v2f64) - looks like a typo, confirm before renaming
+; GCN-LABEL: v_copysign_v2bf16_0_v2bf64:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v3
+; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_copysign_v2bf16_0_v2bf64:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v0, 0x80000000, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v1
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_copysign_v2bf16_0_v2bf64:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v1, 0x8000, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff
+; GFX8-NEXT: v_bfi_b32 v0, s4, 0, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_v2bf16_0_v2bf64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v0, v3, v1, s4
+; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
+; GFX9-NEXT: v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_copysign_v2bf16_0_v2bf64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_perm_b32 v0, v3, v1, 0x5040100
+; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11TRUE16-LABEL: v_copysign_v2bf16_0_v2bf64:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, v1
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_copysign_v2bf16_0_v2bf64:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_perm_b32 v0, v3, v1, 0x5040100
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, v0
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %sign.trunc = fptrunc <2 x double> %sign to <2 x bfloat> ; NOTE(review): the gfx8+ check blocks (v_and_b32 0x8000 on v1, v_perm_b32 with 0x5040100, v_mov_b16 v1.h from v3.l) appear to keep the low 16 bits of v1/v3, whereas the pre-gfx8 blocks mask bit 31 (0x80000000) - confirm this recorded output is intended
+ %op = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> %sign.trunc) ; both magnitude lanes are zero; the narrowed vector supplies each lane's sign
+ ret <2 x bfloat> %op
+}
+
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index a773bf256bd0a..3e7b7f3878bd8 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -6890,4 +6890,696 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f16_sign_v4f64(<4 x half> %mag, <4
ret <4 x half> %out
}
+; copysign with a constant 0.0 half magnitude and a half sign passed inreg to an
+; amdgpu_ps function. The half result is bitcast to i16 and zero-extended into
+; the i32 return value.
+define amdgpu_ps i32 @s_copysign_f16_0_f16(half inreg %sign) {
+; SI-LABEL: s_copysign_f16_0_f16:
+; SI: ; %bb.0:
+; SI-NEXT: v_cvt_f16_f32_e32 v0, s0
+; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0
+; SI-NEXT: v_readfirstlane_b32 s0, v0
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: s_copysign_f16_0_f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_and_b32 s0, s0, 0x8000
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_copysign_f16_0_f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_and_b32 s0, s0, 0x8000
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_copysign_f16_0_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_and_b32 s0, s0, 0x8000
+; GFX11-NEXT: ; return to shader part epilog
+ %op = call half @llvm.copysign.f16(half 0.0, half %sign)
+ %cast = bitcast half %op to i16
+ %zext = zext i16 %cast to i32
+ ret i32 %zext
+}
+
+; copysign with a constant 0.0 half magnitude and a half sign argument in the
+; default calling convention; the half result is returned directly.
+define half @v_copysign_f16_0_f16(half %sign) {
+; SI-LABEL: v_copysign_f16_0_f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_brev_b32 s4, -2
+; SI-NEXT: v_bfi_b32 v0, s4, 0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_copysign_f16_0_f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: s_movk_i32 s4, 0x7fff
+; VI-NEXT: v_bfi_b32 v0, s4, 0, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_f16_0_f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_copysign_f16_0_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_copysign_f16_0_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %op = call half @llvm.copysign.f16(half 0.0, half %sign)
+ ret half %op
+}
+
+; copysign with a constant 0.0 half magnitude where the sign is an inreg float
+; that is first truncated to half with fptrunc. The half result is bitcast to
+; i16 and zero-extended into the i32 return value of the amdgpu_ps function.
+define amdgpu_ps i32 @s_copysign_f16_0_f32(float inreg %sign) {
+; SI-LABEL: s_copysign_f16_0_f32:
+; SI: ; %bb.0:
+; SI-NEXT: v_cvt_f16_f32_e32 v0, s0
+; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0
+; SI-NEXT: v_readfirstlane_b32 s0, v0
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: s_copysign_f16_0_f32:
+; VI: ; %bb.0:
+; VI-NEXT: v_cvt_f16_f32_e32 v0, s0
+; VI-NEXT: v_and_b32_e32 v0, 0x8000, v0
+; VI-NEXT: v_readfirstlane_b32 s0, v0
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_copysign_f16_0_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, s0
+; GFX9-NEXT: v_and_b32_e32 v0, 0x8000, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: s_copysign_f16_0_f32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x8000, v0
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: s_copysign_f16_0_f32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x8000, v0
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
+ %sign.trunc = fptrunc float %sign to half
+ %op = call half @llvm.copysign.f16(half 0.0, half %sign.trunc)
+ %cast = bitcast half %op to i16
+ %zext = zext i16 %cast to i32
+ ret i32 %zext
+}
+
+; copysign with a constant 0.0 half magnitude where the sign is a float argument
+; truncated to half with fptrunc; the half result is returned directly.
+define half @v_copysign_f16_0_f32(float %sign) {
+; SI-LABEL: v_copysign_f16_0_f32:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_brev_b32 s4, -2
+; SI-NEXT: v_bfi_b32 v0, s4, 0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_copysign_f16_0_f32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; VI-NEXT: s_movk_i32 s4, 0x7fff
+; VI-NEXT: v_bfi_b32 v0, s4, 0, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_f16_0_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_copysign_f16_0_f32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_copysign_f16_0_f32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %sign.trunc = fptrunc float %sign to half
+ %op = call half @llvm.copysign.f16(half 0.0, half %sign.trunc)
+ ret half %op
+}
+
+; copysign with a constant 0.0 half magnitude where the sign is an inreg double
+; truncated to half with fptrunc. The half result is bitcast to i16 and
+; zero-extended into the i32 return value of the amdgpu_ps function.
+; In this baseline every checked prefix emits a long scalar sequence (presumably
+; the lowered f64-to-f16 fptrunc) before the final s_and_b32 with 0x8000.
+define amdgpu_ps i32 @s_copysign_f16_0_f64(double inreg %sign) {
+; SI-LABEL: s_copysign_f16_0_f64:
+; SI: ; %bb.0:
+; SI-NEXT: s_and_b32 s2, s1, 0x1ff
+; SI-NEXT: s_or_b32 s0, s2, s0
+; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; SI-NEXT: s_lshr_b32 s0, s1, 8
+; SI-NEXT: s_bfe_u32 s3, s1, 0xb0014
+; SI-NEXT: s_and_b32 s0, s0, 0xffe
+; SI-NEXT: v_readfirstlane_b32 s2, v0
+; SI-NEXT: s_sub_i32 s4, 0x3f1, s3
+; SI-NEXT: s_or_b32 s0, s0, s2
+; SI-NEXT: v_med3_i32 v0, s4, 0, 13
+; SI-NEXT: s_or_b32 s2, s0, 0x1000
+; SI-NEXT: v_readfirstlane_b32 s4, v0
+; SI-NEXT: s_lshr_b32 s5, s2, s4
+; SI-NEXT: s_lshl_b32 s4, s5, s4
+; SI-NEXT: s_cmp_lg_u32 s4, s2
+; SI-NEXT: s_cselect_b32 s2, 1, 0
+; SI-NEXT: s_add_i32 s4, s3, 0xfffffc10
+; SI-NEXT: s_lshl_b32 s3, s4, 12
+; SI-NEXT: s_or_b32 s2, s5, s2
+; SI-NEXT: s_or_b32 s0, s0, s3
+; SI-NEXT: s_cmp_lt_i32 s4, 1
+; SI-NEXT: s_cselect_b32 s0, s2, s0
+; SI-NEXT: s_and_b32 s2, s0, 7
+; SI-NEXT: s_cmp_gt_i32 s2, 5
+; SI-NEXT: s_cselect_b32 s3, 1, 0
+; SI-NEXT: s_cmp_eq_u32 s2, 3
+; SI-NEXT: s_cselect_b32 s2, 1, 0
+; SI-NEXT: s_or_b32 s2, s2, s3
+; SI-NEXT: s_lshr_b32 s0, s0, 2
+; SI-NEXT: s_add_i32 s0, s0, s2
+; SI-NEXT: s_cmp_lt_i32 s4, 31
+; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; SI-NEXT: s_cmpk_lg_i32 s4, 0x40f
+; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
+; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; SI-NEXT: s_cselect_b32 s0, s0, 0x7c00
+; SI-NEXT: s_lshr_b32 s1, s1, 16
+; SI-NEXT: s_or_b32 s0, s1, s0
+; SI-NEXT: s_and_b32 s0, s0, 0x8000
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: s_copysign_f16_0_f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_and_b32 s2, s1, 0x1ff
+; VI-NEXT: s_or_b32 s0, s2, s0
+; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; VI-NEXT: s_lshr_b32 s0, s1, 8
+; VI-NEXT: s_bfe_u32 s3, s1, 0xb0014
+; VI-NEXT: s_and_b32 s0, s0, 0xffe
+; VI-NEXT: v_readfirstlane_b32 s2, v0
+; VI-NEXT: s_sub_i32 s4, 0x3f1, s3
+; VI-NEXT: s_or_b32 s0, s0, s2
+; VI-NEXT: v_med3_i32 v0, s4, 0, 13
+; VI-NEXT: s_or_b32 s2, s0, 0x1000
+; VI-NEXT: v_readfirstlane_b32 s4, v0
+; VI-NEXT: s_lshr_b32 s5, s2, s4
+; VI-NEXT: s_lshl_b32 s4, s5, s4
+; VI-NEXT: s_cmp_lg_u32 s4, s2
+; VI-NEXT: s_cselect_b32 s2, 1, 0
+; VI-NEXT: s_add_i32 s4, s3, 0xfffffc10
+; VI-NEXT: s_lshl_b32 s3, s4, 12
+; VI-NEXT: s_or_b32 s2, s5, s2
+; VI-NEXT: s_or_b32 s0, s0, s3
+; VI-NEXT: s_cmp_lt_i32 s4, 1
+; VI-NEXT: s_cselect_b32 s0, s2, s0
+; VI-NEXT: s_and_b32 s2, s0, 7
+; VI-NEXT: s_cmp_gt_i32 s2, 5
+; VI-NEXT: s_cselect_b32 s3, 1, 0
+; VI-NEXT: s_cmp_eq_u32 s2, 3
+; VI-NEXT: s_cselect_b32 s2, 1, 0
+; VI-NEXT: s_or_b32 s2, s2, s3
+; VI-NEXT: s_lshr_b32 s0, s0, 2
+; VI-NEXT: s_add_i32 s0, s0, s2
+; VI-NEXT: s_cmp_lt_i32 s4, 31
+; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT: s_cmpk_lg_i32 s4, 0x40f
+; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
+; VI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
+; VI-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; VI-NEXT: s_cselect_b32 s0, s0, 0x7c00
+; VI-NEXT: s_lshr_b32 s1, s1, 16
+; VI-NEXT: s_or_b32 s0, s1, s0
+; VI-NEXT: s_and_b32 s0, s0, 0x8000
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_copysign_f16_0_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_and_b32 s2, s1, 0x1ff
+; GFX9-NEXT: s_or_b32 s0, s2, s0
+; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX9-NEXT: s_lshr_b32 s0, s1, 8
+; GFX9-NEXT: s_bfe_u32 s3, s1, 0xb0014
+; GFX9-NEXT: s_and_b32 s0, s0, 0xffe
+; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: s_sub_i32 s4, 0x3f1, s3
+; GFX9-NEXT: s_or_b32 s0, s0, s2
+; GFX9-NEXT: v_med3_i32 v0, s4, 0, 13
+; GFX9-NEXT: s_or_b32 s2, s0, 0x1000
+; GFX9-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9-NEXT: s_lshr_b32 s5, s2, s4
+; GFX9-NEXT: s_lshl_b32 s4, s5, s4
+; GFX9-NEXT: s_cmp_lg_u32 s4, s2
+; GFX9-NEXT: s_cselect_b32 s2, 1, 0
+; GFX9-NEXT: s_add_i32 s4, s3, 0xfffffc10
+; GFX9-NEXT: s_lshl_b32 s3, s4, 12
+; GFX9-NEXT: s_or_b32 s2, s5, s2
+; GFX9-NEXT: s_or_b32 s0, s0, s3
+; GFX9-NEXT: s_cmp_lt_i32 s4, 1
+; GFX9-NEXT: s_cselect_b32 s0, s2, s0
+; GFX9-NEXT: s_and_b32 s2, s0, 7
+; GFX9-NEXT: s_cmp_gt_i32 s2, 5
+; GFX9-NEXT: s_cselect_b32 s3, 1, 0
+; GFX9-NEXT: s_cmp_eq_u32 s2, 3
+; GFX9-NEXT: s_cselect_b32 s2, 1, 0
+; GFX9-NEXT: s_or_b32 s2, s2, s3
+; GFX9-NEXT: s_lshr_b32 s0, s0, 2
+; GFX9-NEXT: s_add_i32 s0, s0, s2
+; GFX9-NEXT: s_cmp_lt_i32 s4, 31
+; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX9-NEXT: s_cmpk_lg_i32 s4, 0x40f
+; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX9-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
+; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GFX9-NEXT: s_cselect_b32 s0, s0, 0x7c00
+; GFX9-NEXT: s_lshr_b32 s1, s1, 16
+; GFX9-NEXT: s_or_b32 s0, s1, s0
+; GFX9-NEXT: s_and_b32 s0, s0, 0x8000
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_copysign_f16_0_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_and_b32 s2, s1, 0x1ff
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s0, s2, s0
+; GFX11-NEXT: s_cselect_b32 s0, -1, 0
+; GFX11-NEXT: s_lshr_b32 s3, s1, 8
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: s_bfe_u32 s0, s1, 0xb0014
+; GFX11-NEXT: s_and_b32 s3, s3, 0xffe
+; GFX11-NEXT: s_sub_i32 s2, 0x3f1, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_med3_i32 v1, s2, 0, 13
+; GFX11-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s3, s2, 0x1000
+; GFX11-NEXT: s_lshr_b32 s5, s3, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s4, s5, s4
+; GFX11-NEXT: s_cmp_lg_u32 s4, s3
+; GFX11-NEXT: s_cselect_b32 s3, 1, 0
+; GFX11-NEXT: s_addk_i32 s0, 0xfc10
+; GFX11-NEXT: s_or_b32 s3, s5, s3
+; GFX11-NEXT: s_lshl_b32 s4, s0, 12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s2, s2, s4
+; GFX11-NEXT: s_cmp_lt_i32 s0, 1
+; GFX11-NEXT: s_cselect_b32 s2, s3, s2
+; GFX11-NEXT: s_and_b32 s3, s2, 7
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_cmp_gt_i32 s3, 5
+; GFX11-NEXT: s_cselect_b32 s4, 1, 0
+; GFX11-NEXT: s_cmp_eq_u32 s3, 3
+; GFX11-NEXT: s_cselect_b32 s3, 1, 0
+; GFX11-NEXT: s_lshr_b32 s2, s2, 2
+; GFX11-NEXT: s_or_b32 s3, s3, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s2, s2, s3
+; GFX11-NEXT: s_cmp_lt_i32 s0, 31
+; GFX11-NEXT: s_cselect_b32 s3, -1, 0
+; GFX11-NEXT: s_cmpk_lg_i32 s0, 0x40f
+; GFX11-NEXT: s_cselect_b32 s0, -1, 0
+; GFX11-NEXT: s_and_b32 s0, s0, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, s0, exec_lo
+; GFX11-NEXT: s_cselect_b32 s0, s2, 0x7c00
+; GFX11-NEXT: s_lshr_b32 s1, s1, 16
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, s0, 0x8000
+; GFX11-NEXT: ; return to shader part epilog
+ %sign.trunc = fptrunc double %sign to half
+ %op = call half @llvm.copysign.f16(half 0.0, half %sign.trunc)
+ %cast = bitcast half %op to i16
+ %zext = zext i16 %cast to i32
+ ret i32 %zext
+}
+
+; copysign with a constant 0.0 half magnitude where the sign is a double argument
+; truncated to half with fptrunc; the half result is returned directly.
+define half @v_copysign_f16_0_f64(double %sign) {
+; SI-LABEL: v_copysign_f16_0_f64:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_brev_b32 s4, -2
+; SI-NEXT: v_bfi_b32 v0, s4, 0, v1
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_copysign_f16_0_f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; VI-NEXT: s_movk_i32 s4, 0x7fff
+; VI-NEXT: v_bfi_b32 v0, s4, 0, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_f16_0_f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_copysign_f16_0_f64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_copysign_f16_0_f64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %sign.trunc = fptrunc double %sign to half
+ %op = call half @llvm.copysign.f16(half 0.0, half %sign.trunc)
+ ret half %op
+}
+
+; Vector copysign with a zeroinitializer <2 x half> magnitude and an inreg
+; <2 x half> sign. The result is bitcast to i32 for the amdgpu_ps return value.
+define amdgpu_ps i32 @s_copysign_v2f16_0_v2f16(<2 x half> inreg %sign) {
+; SI-LABEL: s_copysign_v2f16_0_v2f16:
+; SI: ; %bb.0:
+; SI-NEXT: v_cvt_f16_f32_e32 v0, s1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, s0
+; SI-NEXT: s_brev_b32 s0, -2
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_bfi_b32 v0, s0, 0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_bfi_b32 v1, s0, 0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: v_readfirstlane_b32 s0, v0
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: s_copysign_v2f16_0_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_mov_b32 s1, 0x7fff7fff
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_bfi_b32 v0, s1, 0, v0
+; VI-NEXT: v_readfirstlane_b32 s0, v0
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_copysign_v2f16_0_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s1, 0x7fff7fff
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_bfi_b32 v0, s1, 0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_copysign_v2f16_0_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %op = call <2 x half> @llvm.copysign.v2f16(<2 x half> zeroinitializer, <2 x half> %sign)
+ %cast = bitcast <2 x half> %op to i32
+ ret i32 %cast
+}
+
+; Vector copysign with a zeroinitializer <2 x half> magnitude and a <2 x half>
+; sign argument in the default calling convention; the vector is returned directly.
+define <2 x half> @v_copysign_v2f16_0_v2f16(<2 x half> %sign) {
+; SI-LABEL: v_copysign_v2f16_0_v2f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: s_brev_b32 s4, -2
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_bfi_b32 v0, s4, 0, v0
+; SI-NEXT: v_bfi_b32 v1, s4, 0, v1
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_copysign_v2f16_0_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: s_mov_b32 s4, 0x7fff7fff
+; VI-NEXT: v_bfi_b32 v0, s4, 0, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_v2f16_0_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
+; GFX9-NEXT: v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_copysign_v2f16_0_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %op = call <2 x half> @llvm.copysign.v2f16(<2 x half> zeroinitializer, <2 x half> %sign)
+ ret <2 x half> %op
+}
+
+; Vector copysign with a zeroinitializer <2 x half> magnitude where the sign is
+; an inreg <2 x float> truncated to <2 x half> with fptrunc. The result is
+; bitcast to i32 for the amdgpu_ps return value.
+define amdgpu_ps i32 @s_copysign_v2f16_0_v2f32(<2 x float> inreg %sign) {
+; SI-LABEL: s_copysign_v2f16_0_v2f32:
+; SI: ; %bb.0:
+; SI-NEXT: v_cvt_f16_f32_e32 v0, s1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, s0
+; SI-NEXT: s_brev_b32 s0, -2
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_bfi_b32 v0, s0, 0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_bfi_b32 v1, s0, 0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: v_readfirstlane_b32 s0, v0
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: s_copysign_v2f16_0_v2f32:
+; VI: ; %bb.0:
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_cvt_f16_f32_e32 v1, s0
+; VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI-NEXT: s_mov_b32 s0, 0x7fff7fff
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: v_bfi_b32 v0, s0, 0, v0
+; VI-NEXT: v_readfirstlane_b32 s0, v0
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_copysign_v2f16_0_v2f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, s1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, s0
+; GFX9-NEXT: s_mov_b32 s0, 0x7fff7fff
+; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT: v_bfi_b32 v0, s0, 0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX11-TRUE16-LABEL: s_copysign_v2f16_0_v2f32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, s1
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, v0
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-FAKE16-LABEL: s_copysign_v2f16_0_v2f32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, s1
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, s0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-FAKE16-NEXT: ; return to shader part epilog
+ %sign.trunc = fptrunc <2 x float> %sign to <2 x half>
+ %op = call <2 x half> @llvm.copysign.v2f16(<2 x half> zeroinitializer, <2 x half> %sign.trunc)
+ %cast = bitcast <2 x half> %op to i32
+ ret i32 %cast
+}
+
+; Vector copysign with a zeroinitializer <2 x half> magnitude where the sign is
+; a <2 x float> argument truncated to <2 x half> with fptrunc; the vector is
+; returned directly.
+; NOTE(review): the name ends in "v2bf32" although the sign operand is
+; <2 x float>; the inreg sibling test is named s_copysign_v2f16_0_v2f32, so this
+; looks like a typo for "v2f32" - confirm before renaming.
+define <2 x half> @v_copysign_v2f16_0_v2bf32(<2 x float> %sign) {
+; SI-LABEL: v_copysign_v2f16_0_v2bf32:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: s_brev_b32 s4, -2
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_bfi_b32 v0, s4, 0, v0
+; SI-NEXT: v_bfi_b32 v1, s4, 0, v1
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_copysign_v2f16_0_v2bf32:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT: s_mov_b32 s4, 0x7fff7fff
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: v_bfi_b32 v0, s4, 0, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_v2f16_0_v2bf32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_copysign_v2f16_0_v2bf32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.h, v1
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, v1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_copysign_v2f16_0_v2bf32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %sign.trunc = fptrunc <2 x float> %sign to <2 x half>
+ %op = call <2 x half> @llvm.copysign.v2f16(<2 x half> zeroinitializer, <2 x half> %sign.trunc)
+ ret <2 x half> %op
+}
+
+; Vector copysign with a zeroinitializer <2 x half> magnitude where the sign is
+; an inreg <2 x double> truncated to <2 x half> with fptrunc. The result is
+; bitcast to i32 for the amdgpu_ps return value.
+define amdgpu_ps i32 @s_copysign_v2f16_0_v2f64(<2 x double> inreg %sign) {
+; SI-LABEL: s_copysign_v2f16_0_v2f64:
+; SI: ; %bb.0:
+; SI-NEXT: s_brev_b32 s0, -2
+; SI-NEXT: v_mov_b32_e32 v0, s3
+; SI-NEXT: v_bfi_b32 v0, s0, 0, v0
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_bfi_b32 v1, s0, 0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: v_readfirstlane_b32 s0, v0
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: s_copysign_v2f16_0_v2f64:
+; VI: ; %bb.0:
+; VI-NEXT: s_lshl_b32 s0, s3, 16
+; VI-NEXT: s_and_b32 s1, s1, 0x8000
+; VI-NEXT: s_or_b32 s0, s1, s0
+; VI-NEXT: s_mov_b32 s1, 0x7fff7fff
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_bfi_b32 v0, s1, 0, v0
+; VI-NEXT: v_readfirstlane_b32 s0, v0
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_copysign_v2f16_0_v2f64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s1, s3
+; GFX9-NEXT: s_mov_b32 s1, 0x7fff7fff
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_bfi_b32 v0, s1, 0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_copysign_v2f16_0_v2f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_pack_ll_b32_b16 s0, s1, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, s0
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %sign.trunc = fptrunc <2 x double> %sign to <2 x half>
+ %op = call <2 x half> @llvm.copysign.v2f16(<2 x half> zeroinitializer, <2 x half> %sign.trunc)
+ %cast = bitcast <2 x half> %op to i32
+ ret i32 %cast
+}
+
+; Vector copysign with a zeroinitializer <2 x half> magnitude where the sign is
+; a <2 x double> argument truncated to <2 x half> with fptrunc; the vector is
+; returned directly.
+; NOTE(review): the name ends in "v2bf64" although the sign operand is
+; <2 x double>; the inreg sibling test is named s_copysign_v2f16_0_v2f64, so this
+; looks like a typo for "v2f64" - confirm before renaming.
+define <2 x half> @v_copysign_v2f16_0_v2bf64(<2 x double> %sign) {
+; SI-LABEL: v_copysign_v2f16_0_v2bf64:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_brev_b32 s4, -2
+; SI-NEXT: v_bfi_b32 v0, s4, 0, v1
+; SI-NEXT: v_bfi_b32 v1, s4, 0, v3
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_copysign_v2f16_0_v2bf64:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; VI-NEXT: v_and_b32_e32 v1, 0x8000, v1
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: s_mov_b32 s4, 0x7fff7fff
+; VI-NEXT: v_bfi_b32 v0, s4, 0, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_copysign_v2f16_0_v2bf64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x5040100
+; GFX9-NEXT: v_perm_b32 v0, v3, v1, s4
+; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
+; GFX9-NEXT: v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-TRUE16-LABEL: v_copysign_v2f16_0_v2bf64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, v1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_copysign_v2f16_0_v2bf64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v3, v1, 0x5040100
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+ %sign.trunc = fptrunc <2 x double> %sign to <2 x half>
+ %op = call <2 x half> @llvm.copysign.v2f16(<2 x half> zeroinitializer, <2 x half> %sign.trunc)
+ ret <2 x half> %op
+}
+
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
index 0a2e758f7cf21..83d57e4e6e48c 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
@@ -1107,6 +1107,131 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out,
ret void
}
+; copysign with a constant 0.0 float magnitude and an inreg float sign. The
+; float result is bitcast to i32 for the amdgpu_ps return value.
+define amdgpu_ps i32 @s_copysign_f32_0_f32(float inreg %sign) {
+; SIVI-LABEL: s_copysign_f32_0_f32:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_and_b32 s0, s0, 0x80000000
+; SIVI-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_copysign_f32_0_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_and_b32 s0, s0, 0x80000000
+; GFX11-NEXT: ; return to shader part epilog
+ %result = call float @llvm.copysign.f32(float 0.0, float %sign)
+ %cast = bitcast float %result to i32
+ ret i32 %cast
+}
+
+; copysign with a constant 0.0 float magnitude and a float sign argument in the
+; default calling convention; the float result is returned directly.
+define float @v_copysign_f32_0_f32(float %sign) {
+; SIVI-LABEL: v_copysign_f32_0_f32:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT: s_brev_b32 s4, -2
+; SIVI-NEXT: v_bfi_b32 v0, s4, 0, v0
+; SIVI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_copysign_f32_0_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, 0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %result = call float @llvm.copysign.f32(float 0.0, float %sign)
+ ret float %result
+}
+
+; copysign with a constant 0.0 float magnitude where the sign is an inreg double
+; truncated to float with fptrunc. The float result is bitcast to i32 for the
+; amdgpu_ps return value.
+define amdgpu_ps i32 @s_copysign_f32_0_f64(double inreg %sign) {
+; SIVI-LABEL: s_copysign_f32_0_f64:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; SIVI-NEXT: v_and_b32_e32 v0, 0x80000000, v0
+; SIVI-NEXT: v_readfirstlane_b32 s0, v0
+; SIVI-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_copysign_f32_0_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0x80000000, v0
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %sign.trunc = fptrunc double %sign to float
+ %result = call float @llvm.copysign.f32(float 0.0, float %sign.trunc)
+ %cast = bitcast float %result to i32
+ ret i32 %cast
+}
+
+; copysign with a constant 0.0 float magnitude where the sign is a double
+; argument truncated to float with fptrunc; the float result is returned directly.
+define float @v_copysign_f32_0_f64(double %sign) {
+; SIVI-LABEL: v_copysign_f32_0_f64:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT: s_brev_b32 s4, -2
+; SIVI-NEXT: v_bfi_b32 v0, s4, 0, v1
+; SIVI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_copysign_f32_0_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, 0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %sign.trunc = fptrunc double %sign to float
+ %result = call float @llvm.copysign.f32(float 0.0, float %sign.trunc)
+ ret float %result
+}
+
+; copysign with a constant 0.0 float magnitude where the sign is an inreg half
+; extended to float with fpext. The float result is bitcast to i32 for the
+; amdgpu_ps return value.
+define amdgpu_ps i32 @s_copysign_f32_0_f16(half inreg %sign) {
+; SI-LABEL: s_copysign_f32_0_f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_and_b32 s0, s0, 0x80000000
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: s_copysign_f32_0_f16:
+; VI: ; %bb.0:
+; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
+; VI-NEXT: v_and_b32_e32 v0, 0x80000000, v0
+; VI-NEXT: v_readfirstlane_b32 s0, v0
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_copysign_f32_0_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, 0x80000000, v0
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %sign.ext = fpext half %sign to float
+ %result = call float @llvm.copysign.f32(float 0.0, float %sign.ext)
+ %cast = bitcast float %result to i32
+ ret i32 %cast
+}
+
+; copysign with a constant 0.0 float magnitude where the sign is a half argument
+; extended to float with fpext; the float result is returned directly.
+define float @v_copysign_f32_0_f16(half %sign) {
+; SI-LABEL: v_copysign_f32_0_f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_brev_b32 s4, -2
+; SI-NEXT: v_bfi_b32 v0, s4, 0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_copysign_f32_0_f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; VI-NEXT: s_brev_b32 s4, -2
+; VI-NEXT: v_bfi_b32 v0, s4, 0, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_copysign_f32_0_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, 0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %sign.ext = fpext half %sign to float
+ %result = call float @llvm.copysign.f32(float 0.0, float %sign.ext)
+ ret float %result
+}
+
declare float @llvm.copysign.f32(float, float) #0
declare <2 x float> @llvm.copysign.v2f32(<2 x float>, <2 x float>) #0
declare <3 x float> @llvm.copysign.v3f32(<3 x float>, <3 x float>) #0
@@ -1114,3 +1239,7 @@ declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) #0
declare <5 x float> @llvm.copysign.v5f32(<5 x float>, <5 x float>) #0
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+
+
+
+
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
index 8b5c34d97e50e..b72eb5c5cf588 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
@@ -1063,4 +1063,135 @@ define <4 x double> @v_test_copysign_v4f64(ptr addrspace(1) %out, <4 x double> %
ret <4 x double> %result
}
+define amdgpu_ps <2 x i32> @s_copysign_f64_0_f64(double inreg %sign) { ; baseline test - copysign of a constant 0.0 double magnitude with an inreg double sign source
+; SIVI-LABEL: s_copysign_f64_0_f64:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_and_b32 s1, s1, 0x80000000
+; SIVI-NEXT: s_mov_b32 s0, 0
+; SIVI-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_copysign_f64_0_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_and_b32 s1, s1, 0x80000000
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: ; return to shader part epilog
+ %result = call double @llvm.copysign.f64(double 0.0, double %sign) ; magnitude is the constant 0.0, sign is taken from %sign
+ %cast = bitcast double %result to <2 x i32> ; reinterpret the 64-bit result as two i32 elements to match the return type
+ ret <2 x i32> %cast
+}
+
+define double @v_copysign_f64_0_f64(double %sign) { ; baseline test - copysign of a constant 0.0 double magnitude with a double sign source
+; SIVI-LABEL: v_copysign_f64_0_f64:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT: v_and_b32_e32 v1, 0x80000000, v1
+; SIVI-NEXT: v_mov_b32_e32 v0, 0
+; SIVI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_copysign_f64_0_f64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0x80000000, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %result = call double @llvm.copysign.f64(double 0.0, double %sign) ; magnitude is the constant 0.0, sign is taken from %sign
+ ret double %result
+}
+
+define amdgpu_ps <2 x i32> @s_copysign_f64_0_f32(float inreg %sign) { ; baseline test - copysign of a constant 0.0 double magnitude with an inreg float sign source
+; SIVI-LABEL: s_copysign_f64_0_f32:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_and_b32 s1, s0, 0x80000000
+; SIVI-NEXT: s_mov_b32 s0, 0
+; SIVI-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_copysign_f64_0_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_and_b32 s1, s0, 0x80000000
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: ; return to shader part epilog
+ %sign.ext = fpext float %sign to double ; widen the float sign source to double so it matches the intrinsic's operand type
+ %result = call double @llvm.copysign.f64(double 0.0, double %sign.ext) ; magnitude is the constant 0.0, sign is taken from %sign.ext
+ %cast = bitcast double %result to <2 x i32> ; reinterpret the 64-bit result as two i32 elements to match the return type
+ ret <2 x i32> %cast
+}
+
+define double @v_copysign_f64_0_f32(float %sign) { ; baseline test - copysign of a constant 0.0 double magnitude with a float sign source
+; SIVI-LABEL: v_copysign_f64_0_f32:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; SIVI-NEXT: v_mov_b32_e32 v0, 0
+; SIVI-NEXT: v_and_b32_e32 v1, 0x80000000, v1
+; SIVI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_copysign_f64_0_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0x80000000, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %sign.ext = fpext float %sign to double ; widen the float sign source to double so it matches the intrinsic's operand type
+ %result = call double @llvm.copysign.f64(double 0.0, double %sign.ext) ; magnitude is the constant 0.0, sign is taken from %sign.ext
+ ret double %result
+}
+
+define amdgpu_ps <2 x i32> @s_copysign_f64_0_f16(half inreg %sign) { ; baseline test - copysign of a constant 0.0 double magnitude with an inreg half sign source
+; SI-LABEL: s_copysign_f64_0_f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_and_b32 s1, s0, 0x80000000
+; SI-NEXT: s_mov_b32 s0, 0
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: s_copysign_f64_0_f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_sext_i32_i16 s0, s0
+; VI-NEXT: s_and_b32 s1, s0, 0x80000000
+; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_copysign_f64_0_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_sext_i32_i16 s0, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s1, s0, 0x80000000
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: ; return to shader part epilog
+ %sign.ext = fpext half %sign to double ; widen the half sign source to double so it matches the intrinsic's operand type
+ %result = call double @llvm.copysign.f64(double 0.0, double %sign.ext) ; magnitude is the constant 0.0, sign is taken from %sign.ext
+ %cast = bitcast double %result to <2 x i32> ; reinterpret the 64-bit result as two i32 elements to match the return type
+ ret <2 x i32> %cast
+}
+
+define double @v_copysign_f64_0_f16(half %sign) { ; baseline test - copysign of a constant 0.0 double magnitude with a half sign source
+; SI-LABEL: v_copysign_f64_0_f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; SI-NEXT: v_mov_b32_e32 v0, 0
+; SI-NEXT: v_and_b32_e32 v1, 0x80000000, v1
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_copysign_f64_0_f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: v_and_b32_e32 v1, 0x80000000, v1
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_copysign_f64_0_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0x80000000, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %sign.ext = fpext half %sign to double ; widen the half sign source to double so it matches the intrinsic's operand type
+ %result = call double @llvm.copysign.f64(double 0.0, double %sign.ext) ; magnitude is the constant 0.0, sign is taken from %sign.ext
+ ret double %result
+}
+
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
More information about the llvm-commits
mailing list