[llvm-branch-commits] [llvm] AMDGPU: Improve v4f16/v4bf16 copysign handling (PR #142174)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri May 30 08:54:44 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
<details>
<summary>Changes</summary>
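Marks `ISD::FCOPYSIGN` as Custom for `v4f16` and `v4bf16` (in addition to the existing `v2f16`/`v2bf16`), and makes `lowerFCOPYSIGN` hand any vector with more than two elements to `splitBinaryVectorOp`. In the updated tests, the 3- and 4-element cases now emit one `v_bfi_b32` with a `0x7fff7fff` mask per 32-bit register instead of per-element shift/BFI/repack sequences.

---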
Patch is 284.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142174.diff
3 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+13-11)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+938-1162)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+1059-1305)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ab3c316f76deb..1c30d3f3bd883 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -757,7 +757,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FABS, MVT::v2f16, Legal);
// Can do this in one BFI plus a constant materialize.
- setOperationAction(ISD::FCOPYSIGN, {MVT::v2f16, MVT::v2bf16}, Custom);
+ setOperationAction(ISD::FCOPYSIGN,
+ {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16},
+ Custom);
setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
@@ -5936,10 +5938,11 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
SelectionDAG &DAG) const {
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
- assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
- VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
- VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
- VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
+ assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
+ VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
+ VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v8f32 ||
+ VT == MVT::v16f32 || VT == MVT::v32f32 || VT == MVT::v32i16 ||
+ VT == MVT::v32f16);
auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
@@ -7122,18 +7125,17 @@ SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
SDValue Mag = Op.getOperand(0);
- SDValue Sign = Op.getOperand(1);
-
EVT MagVT = Mag.getValueType();
- EVT SignVT = Sign.getValueType();
- assert(MagVT.isVector());
+ if (MagVT.getVectorNumElements() > 2)
+ return splitBinaryVectorOp(Op, DAG);
+
+ SDValue Sign = Op.getOperand(1);
+ EVT SignVT = Sign.getValueType();
if (MagVT == SignVT)
return Op;
- assert(MagVT.getVectorNumElements() == 2);
-
// fcopysign v2f16:mag, v2f32:sign ->
// fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index a5a36d7122f68..3bc1232ce3ed1 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -1090,40 +1090,26 @@ define amdgpu_ps <3 x i16> @s_copysign_v3bf16(<3 x bfloat> inreg %arg_mag, <3 x
;
; GFX8-LABEL: s_copysign_v3bf16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX8-NEXT: v_mov_b32_e32 v0, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_lshr_b32 s1, s2, 16
-; GFX8-NEXT: s_lshr_b32 s3, s0, 16
; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
-; GFX8-NEXT: v_bfi_b32 v2, s4, v2, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_readfirstlane_b32 s0, v1
; GFX8-NEXT: v_readfirstlane_b32 s1, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_v3bf16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: s_lshr_b32 s1, s2, 16
-; GFX9-NEXT: s_lshr_b32 s0, s0, 16
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_readfirstlane_b32 s1, v0
; GFX9-NEXT: ; return to shader part epilog
@@ -1131,33 +1117,19 @@ define amdgpu_ps <3 x i16> @s_copysign_v3bf16(<3 x bfloat> inreg %arg_mag, <3 x
; GFX10-LABEL: s_copysign_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: s_lshr_b32 s2, s2, 16
-; GFX10-NEXT: v_mov_b32_e32 v2, s3
-; GFX10-NEXT: v_mov_b32_e32 v1, s2
-; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
-; GFX10-NEXT: s_lshr_b32 s0, s0, 16
-; GFX10-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX10-NEXT: v_bfi_b32 v1, 0x7fff, s1, v2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0
+; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, s1, v1
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_copysign_v3bf16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: s_lshr_b32 s2, s2, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
-; GFX11-NEXT: s_lshr_b32 s0, s0, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s1, v2
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, s1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
@@ -1238,101 +1210,49 @@ define amdgpu_ps <2 x i32> @s_copysign_v4bf16(<4 x bfloat> inreg %arg_mag, <4 x
;
; GFX8-LABEL: s_copysign_v4bf16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX8-NEXT: v_mov_b32_e32 v0, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_lshr_b32 s3, s3, 16
-; GFX8-NEXT: s_lshr_b32 s1, s1, 16
; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: s_lshr_b32 s1, s2, 16
-; GFX8-NEXT: s_lshr_b32 s0, s0, 16
; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_bfi_b32 v2, s4, v2, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_readfirstlane_b32 s0, v1
; GFX8-NEXT: v_readfirstlane_b32 s1, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_v4bf16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: s_lshr_b32 s3, s3, 16
-; GFX9-NEXT: s_lshr_b32 s1, s1, 16
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: s_lshr_b32 s1, s2, 16
-; GFX9-NEXT: s_lshr_b32 s0, s0, 16
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_readfirstlane_b32 s1, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_copysign_v4bf16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_mov_b32_e32 v0, s3
-; GFX10-NEXT: v_mov_b32_e32 v1, s2
-; GFX10-NEXT: s_lshr_b32 s3, s3, 16
-; GFX10-NEXT: s_lshr_b32 s2, s2, 16
-; GFX10-NEXT: v_mov_b32_e32 v2, s3
-; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s1, v0
-; GFX10-NEXT: v_mov_b32_e32 v3, s2
-; GFX10-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1
-; GFX10-NEXT: s_lshr_b32 s1, s1, 16
-; GFX10-NEXT: s_lshr_b32 s0, s0, 16
-; GFX10-NEXT: v_bfi_b32 v2, 0x7fff, s1, v2
-; GFX10-NEXT: v_bfi_b32 v3, 0x7fff, s0, v3
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1
-; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0
-; GFX10-NEXT: v_readfirstlane_b32 s0, v1
-; GFX10-NEXT: v_readfirstlane_b32 s1, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0
+; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, s1, v1
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_copysign_v4bf16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: s_lshr_b32 s3, s3, 16
-; GFX11-NEXT: s_lshr_b32 s2, s2, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s2
-; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s1, v0
-; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1
-; GFX11-NEXT: s_lshr_b32 s1, s1, 16
-; GFX11-NEXT: s_lshr_b32 s0, s0, 16
-; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s1, v2
-; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, s0, v3
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1
-; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, s1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_readfirstlane_b32 s0, v1
-; GFX11-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s1, v1
; GFX11-NEXT: ; return to shader part epilog
%out = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> %arg_mag, <4 x bfloat> %arg_sign)
%cast = bitcast <4 x bfloat> %out to <2 x i32>
@@ -2366,67 +2286,32 @@ define <3 x bfloat> @v_copysign_v3bf16(<3 x bfloat> %mag, <3 x bfloat> %sign) {
; GFX8-LABEL: v_copysign_v3bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_movk_i32 s4, 0x7fff
-; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX8-NEXT: v_bfi_b32 v3, s4, v4, v3
+; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_v3bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v3
-; GFX9-NEXT: v_bfi_b32 v3, s4, v0, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v2
-; GFX9-NEXT: s_mov_b32 s4, 0x5040100
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
+; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_copysign_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
-; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v2
-; GFX10-NEXT: v_bfi_b32 v1, 0x7fff, v1, v3
-; GFX10-NEXT: v_bfi_b32 v2, 0x7fff, v5, v4
-; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2
+; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11TRUE16-LABEL: v_copysign_v3bf16:
-; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
-; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, v3
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v4, v5
-; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v0, v2
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11FAKE16-LABEL: v_copysign_v3bf16:
-; GFX11FAKE16: ; %bb.0:
-; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
-; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
-; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v2
-; GFX11FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, v3
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, v5, v4
-; GFX11FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: v_copysign_v3bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> %mag, <3 x bfloat> %sign)
ret <3 x bfloat> %result
}
@@ -2501,93 +2386,32 @@ define <4 x bfloat> @v_copysign_v4bf16(<4 x bfloat> %mag, <4 x bfloat> %sign) {
; GFX8-LABEL: v_copysign_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX8-NEXT: s_movk_i32 s4, 0x7fff
-; GFX8-NEXT: v_bfi_b32 v4, s4, v5, v4
-; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0
-; GFX8-NEXT: v_bfi_b32 v3, s4, v5, v3
+; GFX8-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4
-; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_bfi_b32 v4, s4, v1, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v3
-; GFX9-NEXT: v_bfi_b32 v3, s4, v0, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v2
-; GFX9-NEXT: s_mov_b32 s4, 0x5040100
-; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4
+; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_copysign_v4bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1
-; GFX10-NEXT: v_bfi_b32 v1, 0x7fff, v1, v3
-; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v2
-; GFX10-NEXT: v_bfi_b32 v2, 0x7fff, v6, v5
-; GFX10-NEXT: v_bfi_b32 v3, 0x7fff, v7, v4
-; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
-; GFX10-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
+; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2
+; GFX10-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11TRUE16-LABEL: v_copysign_v4bf16:
-; GFX11TRUE16: ; %bb.0:
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h
-; GFX11TRUE16-NEXT: v_bfi_b32 v6, 0x7fff, v6, v7
-; GFX11TRUE16-NEXT: v_bfi_b32 v4, 0x7fff, v4, v5
-; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v0, v2
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0x7fff, v1, v3
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
-; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11FAKE16-LABEL: v_copysign_v4bf16:
-; GFX11FAKE16: ; %bb.0:
-; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
-; GFX11FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, v3
-; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v2
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, v6, v5
-; GFX11FAKE16-NEXT: v_bfi_b32 v3, 0x7fff, v7, v4
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
-; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-LABEL: v_copysign_v4bf16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x bfloat...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/142174
More information about the llvm-branch-commits
mailing list