[llvm] AMDGPU: Handle vectors in copysign magnitude sign case (PR #142156)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Fri May 30 10:56:33 PDT 2025
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/142156
From 3fe89574d9e946ed644b242c000b045f396d148a Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 30 May 2025 12:03:35 +0200
Subject: [PATCH] AMDGPU: Handle vectors in copysign magnitude sign case
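
The combine previously only fired for a scalar f64 magnitude, so vector
cases fell back to materializing a full FP conversion of the sign operand.
Handle vector magnitudes by splitting per element: bitcast the magnitude to
pairs of f32, apply the f32 copysign to each high word, and rebuild the
vector. Also peek through fp_extend/fp_round on the sign operand, since the
generic fcopysign + fp cast combine is too conservative with vectors and is
confused by this splitting.

A minimal IR sketch of the pattern this targets, mirroring the updated
tests (names are illustrative):

  %sign.ext = fpext <2 x bfloat> %sign to <2 x double>
  %out = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign.ext)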
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 58 +++-
.../AMDGPU/copysign-simplify-demanded-bits.ll | 2 +-
llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 294 +++++++-----------
llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 179 +++++------
4 files changed, 242 insertions(+), 291 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 74ca3e43fce3a..af85c6bef273d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11721,29 +11721,63 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SDValue MagnitudeOp = N->getOperand(0);
SDValue SignOp = N->getOperand(1);
+
+ // The generic combine for fcopysign + fp cast is too conservative with
+ // vectors, and also gets confused by the splitting we will perform here, so
+ // peek through FP casts.
+ if (SignOp.getOpcode() == ISD::FP_EXTEND ||
+ SignOp.getOpcode() == ISD::FP_ROUND)
+ SignOp = SignOp.getOperand(0);
+
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
+ EVT SignVT = SignOp.getValueType();
// f64 fcopysign is really an f32 copysign on the high bits, so replace the
// lower half with a copy.
// fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
- if (MagnitudeOp.getValueType() == MVT::f64) {
- SDValue MagAsVector =
- DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
- SDValue MagLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
- MagAsVector, DAG.getConstant(0, DL, MVT::i32));
- SDValue MagHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
- MagAsVector, DAG.getConstant(1, DL, MVT::i32));
+ EVT MagVT = MagnitudeOp.getValueType();
+ if (MagVT.getScalarType() == MVT::f64) {
+ unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
+
+ EVT F32VT = MagVT.isVector()
+ ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
+ : MVT::v2f32;
+
+ SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
+
+ SmallVector<SDValue, 8> NewElts;
+ for (unsigned I = 0; I != NumElts; ++I) {
+ SDValue MagLo =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
+ DAG.getConstant(2 * I, DL, MVT::i32));
+ SDValue MagHi =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
+ DAG.getConstant(2 * I + 1, DL, MVT::i32));
- SDValue HiOp = DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
+ SDValue SignOpElt =
+ MagVT.isVector()
+ ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(),
+ SignOp, DAG.getConstant(I, DL, MVT::i32))
+ : SignOp;
+
+ SDValue HiOp =
+ DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
+
+ SDValue Vector =
+ DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
+
+ SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
+ NewElts.push_back(NewElt);
+ }
- SDValue Vector =
- DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
+ if (NewElts.size() == 1)
+ return NewElts[0];
- return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
+ return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
}
- if (SignOp.getValueType() != MVT::f64)
+ if (SignVT != MVT::f64)
return SDValue();
// Reduce width of sign operand, we only need the highest bit.
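For reference, a standalone C++ sketch (not part of this patch; the helper
name is hypothetical) of the bit-level identity the combine relies on: the
sign of an IEEE-754 double is the top bit of its high 32-bit word, so
fcopysign on f64 only needs an f32-style copysign on that word while the
low word is copied through unchanged. The bit mixing below matches the
v_bfi_b32 with the 0x7fffffff mask seen in the updated tests.

#include <bit>
#include <cassert>
#include <cmath>
#include <cstdint>

static double copysignViaHighWord(double Mag, float Sign) {
  uint64_t MagBits = std::bit_cast<uint64_t>(Mag);
  uint32_t Hi = static_cast<uint32_t>(MagBits >> 32); // x.hi32
  uint32_t SignBits = std::bit_cast<uint32_t>(Sign);
  // bfi(0x7fffffff, Hi, SignBits): keep the magnitude bits of Hi, take only
  // the sign bit from SignBits.
  uint32_t NewHi = (Hi & 0x7fffffffu) | (SignBits & 0x80000000u);
  return std::bit_cast<double>((uint64_t(NewHi) << 32) | (MagBits & 0xffffffffu));
}

int main() {
  assert(copysignViaHighWord(3.5, -0.0f) == std::copysign(3.5, -0.0));
  assert(copysignViaHighWord(-2.25, 1.0f) == std::copysign(-2.25, 1.0));
  return 0;
}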
diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
index a01c2fa152ab3..15b049d4d7563 100644
--- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
+++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll
@@ -131,8 +131,8 @@ define <2 x double> @test_pown_reduced_fast_v2f64_known_odd(<2 x double> %x, <2
; GFX9-LABEL: test_pown_reduced_fast_v2f64_known_odd:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_or_b32_e32 v6, 1, v5
; GFX9-NEXT: v_or_b32_e32 v4, 1, v4
+; GFX9-NEXT: v_or_b32_e32 v6, 1, v5
; GFX9-NEXT: v_cvt_f64_i32_e32 v[4:5], v4
; GFX9-NEXT: v_cvt_f64_i32_e32 v[6:7], v6
; GFX9-NEXT: s_brev_b32 s4, -2
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 32e3f72af516f..3bd068362410b 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -4055,50 +4055,38 @@ define <2 x double> @v_copysign_out_v2f64_mag_v2f64_sign_v2bf16(<2 x double> %ma
; GCN-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: s_brev_b32 s4, -2
-; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v4
-; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
-; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; GCN-NEXT: v_bfi_b32 v1, s4, v1, v7
+; GCN-NEXT: v_bfi_b32 v1, s4, v1, v4
; GCN-NEXT: v_bfi_b32 v3, s4, v3, v5
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; GFX7-NEXT: s_brev_b32 s4, -2
-; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v5
-; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v7
+; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v4
+; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX8-NEXT: s_brev_b32 s4, -2
-; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v5
; GFX8-NEXT: v_bfi_b32 v3, s4, v3, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v5
; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -4969,71 +4957,63 @@ define amdgpu_ps <2 x i32> @s_copysign_out_v2f32_mag_v2f32_sign_v2bf16(<2 x floa
define amdgpu_ps <4 x i32> @s_copysign_out_v2f64_mag_v2f64_sign_v2bf16(<2 x double> inreg %mag, <2 x bfloat> inreg %sign) {
; GCN-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s5
-; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s4
-; GCN-NEXT: s_brev_b32 s4, -2
-; GCN-NEXT: v_mov_b32_e32 v4, s3
-; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
-; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; GCN-NEXT: v_bfi_b32 v0, s4, v4, v3
-; GCN-NEXT: v_bfi_b32 v1, s4, v5, v1
+; GCN-NEXT: s_brev_b32 s6, -2
+; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: v_mov_b32_e32 v3, s4
+; GCN-NEXT: v_bfi_b32 v0, s6, v0, v1
+; GCN-NEXT: v_bfi_b32 v1, s6, v2, v3
; GCN-NEXT: v_readfirstlane_b32 s1, v1
; GCN-NEXT: v_readfirstlane_b32 s3, v0
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
; GFX7: ; %bb.0:
-; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s5
-; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s4
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; GFX7-NEXT: s_brev_b32 s4, -2
+; GFX7-NEXT: s_brev_b32 s6, -2
; GFX7-NEXT: v_mov_b32_e32 v0, s3
-; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1
+; GFX7-NEXT: v_mov_b32_e32 v1, s5
+; GFX7-NEXT: v_bfi_b32 v0, s6, v0, v1
; GFX7-NEXT: v_mov_b32_e32 v1, s1
-; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v3
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: v_bfi_b32 v1, s6, v1, v2
; GFX7-NEXT: v_readfirstlane_b32 s1, v1
; GFX7-NEXT: v_readfirstlane_b32 s3, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_lshr_b32 s5, s4, 16
; GFX8-NEXT: v_lshlrev_b32_e64 v0, 16, s4
-; GFX8-NEXT: s_brev_b32 s4, -2
+; GFX8-NEXT: s_brev_b32 s5, -2
; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_bfi_b32 v0, s4, v1, v0
-; GFX8-NEXT: v_lshlrev_b32_e64 v1, 16, s5
+; GFX8-NEXT: s_lshr_b32 s1, s4, 16
+; GFX8-NEXT: v_bfi_b32 v0, s5, v1, v0
+; GFX8-NEXT: v_lshlrev_b32_e64 v1, 16, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_bfi_b32 v1, s4, v2, v1
+; GFX8-NEXT: v_bfi_b32 v1, s5, v2, v1
; GFX8-NEXT: v_readfirstlane_b32 s1, v0
; GFX8-NEXT: v_readfirstlane_b32 s3, v1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshr_b32 s5, s4, 16
; GFX9-NEXT: v_lshlrev_b32_e64 v0, 16, s4
-; GFX9-NEXT: s_brev_b32 s4, -2
+; GFX9-NEXT: s_brev_b32 s5, -2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_bfi_b32 v0, s4, v1, v0
-; GFX9-NEXT: v_lshlrev_b32_e64 v1, 16, s5
+; GFX9-NEXT: s_lshr_b32 s1, s4, 16
+; GFX9-NEXT: v_bfi_b32 v0, s5, v1, v0
+; GFX9-NEXT: v_lshlrev_b32_e64 v1, 16, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NEXT: v_bfi_b32 v1, s4, v2, v1
+; GFX9-NEXT: v_bfi_b32 v1, s5, v2, v1
; GFX9-NEXT: v_readfirstlane_b32 s1, v0
; GFX9-NEXT: v_readfirstlane_b32 s3, v1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshr_b32 s5, s4, 16
; GFX10-NEXT: v_lshlrev_b32_e64 v0, 16, s4
-; GFX10-NEXT: v_lshlrev_b32_e64 v1, 16, s5
+; GFX10-NEXT: s_lshr_b32 s4, s4, 16
+; GFX10-NEXT: v_lshlrev_b32_e64 v1, 16, s4
; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0
; GFX10-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v1
; GFX10-NEXT: v_readfirstlane_b32 s1, v0
@@ -5042,14 +5022,15 @@ define amdgpu_ps <4 x i32> @s_copysign_out_v2f64_mag_v2f64_sign_v2bf16(<2 x doub
;
; GFX11-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2bf16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_lshr_b32 s5, s4, 16
; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s4
-; GFX11-NEXT: v_lshlrev_b32_e64 v1, 16, s5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_lshr_b32 s4, s4, 16
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e64 v1, 16, s4
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0
-; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v1
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s3, v1
; GFX11-NEXT: ; return to shader part epilog
%sign.ext = fpext <2 x bfloat> %sign to <2 x double>
@@ -5886,99 +5867,88 @@ define <3 x double> @v_copysign_out_v3f64_mag_v3f64_sign_v3bf16(<3 x double> %ma
; GCN-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: s_brev_b32 s4, -2
-; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v7
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v6
-; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v8
-; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v9
-; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
-; GCN-NEXT: v_bfi_b32 v1, s4, v1, v11
-; GCN-NEXT: v_bfi_b32 v3, s4, v3, v9
-; GCN-NEXT: v_bfi_b32 v5, s4, v5, v7
+; GCN-NEXT: v_bfi_b32 v1, s4, v1, v6
+; GCN-NEXT: v_bfi_b32 v3, s4, v3, v7
+; GCN-NEXT: v_bfi_b32 v5, s4, v5, v8
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v8
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
; GFX7-NEXT: s_brev_b32 s4, -2
-; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v7
-; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v9
-; GFX7-NEXT: v_bfi_b32 v5, s4, v5, v11
+; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v6
+; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v7
+; GFX7-NEXT: v_bfi_b32 v5, s4, v5, v8
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v7
-; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
-; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX8-NEXT: s_brev_b32 s4, -2
-; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v7
-; GFX8-NEXT: v_bfi_b32 v3, s4, v3, v9
-; GFX8-NEXT: v_bfi_b32 v5, s4, v5, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v8
+; GFX8-NEXT: v_bfi_b32 v5, s4, v5, v7
+; GFX8-NEXT: v_bfi_b32 v3, s4, v3, v6
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v7
-; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v6
-; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v7
-; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v9
-; GFX9-NEXT: v_bfi_b32 v5, s4, v5, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v8
+; GFX9-NEXT: v_bfi_b32 v5, s4, v5, v7
+; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v6
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v7
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v8
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v9
-; GFX10-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
-; GFX10-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v7
-; GFX10-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v9
-; GFX10-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v11
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX10-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v6
+; GFX10-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v7
+; GFX10-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v6
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v8
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v9
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
-; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v9
-; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v11
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
+; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v7
+; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v8
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v6
+; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v6
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v6
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v7
+; GFX11FAKE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v8
+; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%sign.ext = fpext <3 x bfloat> %sign to <3 x double>
%out = call <3 x double> @llvm.copysign.v3f64(<3 x double> %mag, <3 x double> %sign.ext)
ret <3 x double> %out
@@ -7060,76 +7030,52 @@ define <4 x double> @v_copysign_out_v4f64_mag_v4f64_sign_v4bf16(<4 x double> %ma
; GCN-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: s_brev_b32 s4, -2
-; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v9
-; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v8
-; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v11
-; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
-; GCN-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
-; GCN-NEXT: v_cvt_f64_f32_e32 v[14:15], v14
-; GCN-NEXT: v_bfi_b32 v1, s4, v1, v15
-; GCN-NEXT: v_bfi_b32 v3, s4, v3, v13
-; GCN-NEXT: v_bfi_b32 v5, s4, v5, v11
-; GCN-NEXT: v_bfi_b32 v7, s4, v7, v9
+; GCN-NEXT: v_bfi_b32 v1, s4, v1, v8
+; GCN-NEXT: v_bfi_b32 v3, s4, v3, v9
+; GCN-NEXT: v_bfi_b32 v5, s4, v5, v10
+; GCN-NEXT: v_bfi_b32 v7, s4, v7, v11
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v11
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v10
-; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v12
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v13
-; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v8
; GFX7-NEXT: s_brev_b32 s4, -2
-; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v11
-; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v13
-; GFX7-NEXT: v_bfi_b32 v5, s4, v5, v15
-; GFX7-NEXT: v_bfi_b32 v7, s4, v7, v9
+; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v8
+; GFX7-NEXT: v_bfi_b32 v3, s4, v3, v9
+; GFX7-NEXT: v_bfi_b32 v5, s4, v5, v10
+; GFX7-NEXT: v_bfi_b32 v7, s4, v7, v11
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v8
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX8-NEXT: s_brev_b32 s4, -2
-; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v8
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v9
-; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v9
-; GFX8-NEXT: v_bfi_b32 v5, s4, v5, v8
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX8-NEXT: v_bfi_b32 v3, s4, v3, v8
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v10
+; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_bfi_b32 v5, s4, v5, v10
; GFX8-NEXT: v_bfi_b32 v7, s4, v7, v8
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v8
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v8
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v9
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v9
-; GFX9-NEXT: v_bfi_b32 v5, s4, v5, v8
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v8
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_bfi_b32 v5, s4, v5, v10
; GFX9-NEXT: v_bfi_b32 v7, s4, v7, v8
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index c8de7bc9d9de6..9d031a879d938 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -3365,11 +3365,7 @@ define <2 x double> @v_copysign_out_v2f64_mag_v2f64_sign_v2f16(<2 x double> %mag
; SI-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: s_brev_b32 s4, -2
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_bfi_b32 v1, s4, v1, v4
; SI-NEXT: v_bfi_b32 v3, s4, v3, v5
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3377,22 +3373,22 @@ define <2 x double> @v_copysign_out_v2f64_mag_v2f64_sign_v2f16(<2 x double> %mag
; VI-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; VI-NEXT: s_brev_b32 s4, -2
-; VI-NEXT: v_bfi_b32 v1, s4, v1, v4
-; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; VI-NEXT: v_bfi_b32 v1, s4, v1, v5
; VI-NEXT: v_bfi_b32 v3, s4, v3, v4
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v4
-; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v5
; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -4294,57 +4290,56 @@ define amdgpu_ps <2 x i32> @s_copysign_out_v2f32_mag_v2f32_sign_v2f16(<2 x float
define amdgpu_ps <4 x i32> @s_copysign_out_v2f64_mag_v2f64_sign_v2f16(<2 x double> inreg %mag, <2 x half> inreg %sign) {
; SI-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2f16:
; SI: ; %bb.0:
-; SI-NEXT: v_cvt_f16_f32_e32 v0, s5
-; SI-NEXT: v_cvt_f16_f32_e32 v1, s4
-; SI-NEXT: s_brev_b32 s4, -2
-; SI-NEXT: v_mov_b32_e32 v2, s3
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_bfi_b32 v0, s4, v2, v0
-; SI-NEXT: v_mov_b32_e32 v2, s1
-; SI-NEXT: v_bfi_b32 v1, s4, v2, v1
+; SI-NEXT: s_brev_b32 s6, -2
+; SI-NEXT: v_mov_b32_e32 v0, s3
+; SI-NEXT: v_mov_b32_e32 v1, s5
+; SI-NEXT: v_bfi_b32 v0, s6, v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_mov_b32_e32 v2, s4
+; SI-NEXT: v_bfi_b32 v1, s6, v1, v2
; SI-NEXT: v_readfirstlane_b32 s1, v1
; SI-NEXT: v_readfirstlane_b32 s3, v0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_lshr_b32 s5, s4, 16
; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s4
-; VI-NEXT: s_brev_b32 s4, -2
+; VI-NEXT: s_brev_b32 s5, -2
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_bfi_b32 v0, s4, v1, v0
-; VI-NEXT: v_lshlrev_b32_e64 v1, 16, s5
+; VI-NEXT: s_lshr_b32 s1, s4, 16
+; VI-NEXT: v_bfi_b32 v0, s5, v1, v0
+; VI-NEXT: v_lshlrev_b32_e64 v1, 16, s1
; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_bfi_b32 v1, s4, v2, v1
+; VI-NEXT: v_bfi_b32 v1, s5, v2, v1
; VI-NEXT: v_readfirstlane_b32 s1, v0
; VI-NEXT: v_readfirstlane_b32 s3, v1
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_lshr_b32 s5, s4, 16
; GFX9-NEXT: v_lshlrev_b32_e64 v0, 16, s4
-; GFX9-NEXT: s_brev_b32 s4, -2
+; GFX9-NEXT: s_brev_b32 s5, -2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_bfi_b32 v0, s4, v1, v0
-; GFX9-NEXT: v_lshlrev_b32_e64 v1, 16, s5
+; GFX9-NEXT: s_lshr_b32 s1, s4, 16
+; GFX9-NEXT: v_bfi_b32 v0, s5, v1, v0
+; GFX9-NEXT: v_lshlrev_b32_e64 v1, 16, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NEXT: v_bfi_b32 v1, s4, v2, v1
+; GFX9-NEXT: v_bfi_b32 v1, s5, v2, v1
; GFX9-NEXT: v_readfirstlane_b32 s1, v0
; GFX9-NEXT: v_readfirstlane_b32 s3, v1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_lshr_b32 s5, s4, 16
; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s4
-; GFX11-NEXT: v_lshlrev_b32_e64 v1, 16, s5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: s_lshr_b32 s4, s4, 16
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e64 v1, 16, s4
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0
-; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v1
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s3, v1
; GFX11-NEXT: ; return to shader part epilog
%sign.ext = fpext <2 x half> %sign to <2 x double>
@@ -5206,13 +5201,7 @@ define <3 x double> @v_copysign_out_v3f64_mag_v3f64_sign_v3f16(<3 x double> %mag
; SI-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
-; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: s_brev_b32 s4, -2
-; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
; SI-NEXT: v_bfi_b32 v1, s4, v1, v6
; SI-NEXT: v_bfi_b32 v3, s4, v3, v7
; SI-NEXT: v_bfi_b32 v5, s4, v5, v8
@@ -5221,67 +5210,57 @@ define <3 x double> @v_copysign_out_v3f64_mag_v3f64_sign_v3f16(<3 x double> %mag
; VI-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v8, v6
-; VI-NEXT: v_cvt_f32_f16_e32 v9, v7
-; VI-NEXT: v_cvt_f32_f16_sdwa v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; VI-NEXT: s_brev_b32 s4, -2
-; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v8
-; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v9
-; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
-; VI-NEXT: v_bfi_b32 v1, s4, v1, v7
-; VI-NEXT: v_bfi_b32 v5, s4, v5, v9
-; VI-NEXT: v_bfi_b32 v3, s4, v3, v11
+; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; VI-NEXT: v_bfi_b32 v1, s4, v1, v8
+; VI-NEXT: v_bfi_b32 v5, s4, v5, v7
+; VI-NEXT: v_bfi_b32 v3, s4, v3, v6
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e32 v8, v6
-; GFX9-NEXT: v_cvt_f32_f16_e32 v9, v7
-; GFX9-NEXT: v_cvt_f32_f16_sdwa v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v6
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v8
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v9
-; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
-; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v7
-; GFX9-NEXT: v_bfi_b32 v5, s4, v5, v9
-; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v8
+; GFX9-NEXT: v_bfi_b32 v5, s4, v5, v7
+; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v6
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v8, v6.l
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v9, v7.l
-; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v10, v6.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[6:7], v8
-; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[8:9], v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
-; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v7
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v9
-; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v11
+; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v7
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v6
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v6
-; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v6
-; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v9, v7
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v10, v8
-; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[8:9], v9
-; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v6
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v7
-; GFX11-FAKE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v9
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v11
+; GFX11-FAKE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v7
+; GFX11-FAKE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v8
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%sign.ext = fpext <3 x half> %sign to <3 x double>
%out = call <3 x double> @llvm.copysign.v3f64(<3 x double> %mag, <3 x double> %sign.ext)
@@ -6483,14 +6462,6 @@ define <4 x double> @v_copysign_out_v4f64_mag_v4f64_sign_v4f16(<4 x double> %mag
; SI-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
-; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
-; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
-; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
-; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: v_bfi_b32 v1, s4, v1, v8
; SI-NEXT: v_bfi_b32 v3, s4, v3, v9
@@ -6501,32 +6472,32 @@ define <4 x double> @v_copysign_out_v4f64_mag_v4f64_sign_v4f16(<4 x double> %mag
; VI-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v8
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; VI-NEXT: s_brev_b32 s4, -2
-; VI-NEXT: v_bfi_b32 v1, s4, v1, v8
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v9
-; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v9
-; VI-NEXT: v_bfi_b32 v5, s4, v5, v8
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v11
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; VI-NEXT: v_bfi_b32 v3, s4, v3, v8
-; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v10
+; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; VI-NEXT: v_bfi_b32 v1, s4, v1, v10
+; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v9
+; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; VI-NEXT: v_bfi_b32 v5, s4, v5, v10
; VI-NEXT: v_bfi_b32 v7, s4, v7, v8
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v8
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX9-NEXT: s_brev_b32 s4, -2
-; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v8
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v9
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v9
-; GFX9-NEXT: v_bfi_b32 v5, s4, v5, v8
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v11
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v8
-; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v10
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v9
+; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v10
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
+; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX9-NEXT: v_bfi_b32 v5, s4, v5, v10
; GFX9-NEXT: v_bfi_b32 v7, s4, v7, v8
; GFX9-NEXT: s_setpc_b64 s[30:31]
;