[llvm] [AMDGPU] Make fneg/fabs/copysign legal for bf16 (PR #91676)
via llvm-commits
llvm-commits at lists.llvm.org
Thu May 9 15:55:28 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Stanislav Mekhanoshin (rampitec)
<details>
<summary>Changes</summary>
These are just bit operations, exactly the same as with f16.
---
Patch is 36.34 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/91676.diff
4 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (+2-2)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+3-4)
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+29)
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+154-224)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 1e9132bcfaf93..d35a022ad6806 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -946,14 +946,14 @@ bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
// Packed operations do not have a fabs modifier.
return VT == MVT::f32 || VT == MVT::f64 ||
- (Subtarget->has16BitInsts() && VT == MVT::f16);
+ (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
}
bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
assert(VT.isFloatingPoint());
// Report this based on the end legalized type.
VT = VT.getScalarType();
- return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
+ return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
}
bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 33bdd6195a040..0a3a56e9b3a0b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -225,10 +225,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::bf16, Promote);
AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
- // TODO: Could make these legal
- setOperationAction(ISD::FABS, MVT::bf16, Expand);
- setOperationAction(ISD::FNEG, MVT::bf16, Expand);
- setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand);
+ setOperationAction(ISD::FABS, MVT::bf16, Legal);
+ setOperationAction(ISD::FNEG, MVT::bf16, Legal);
+ setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Legal);
// We only need to custom lower because we can't specify an action for bf16
// sources.
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index cca8d96f29c0f..484ed62bfd7ca 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1947,6 +1947,20 @@ def : GCNPat <
(S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
>;
+def : GCNPat <
+ (UniformUnaryFrag<fneg> (bf16 SReg_32:$src)),
+ (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000)))
+>;
+
+def : GCNPat <
+ (UniformUnaryFrag<fabs> (bf16 SReg_32:$src)),
+ (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00007fff)))
+>;
+
+def : GCNPat <
+ (UniformUnaryFrag<fneg> (fabs (bf16 SReg_32:$src))),
+ (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
+>;
// COPY_TO_REGCLASS is needed to avoid using SCC from S_XOR_B32 instead
// of the real value.
@@ -2045,6 +2059,21 @@ def : GCNPat <
(V_OR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit
>;
+def : GCNPat <
+ (fabs (bf16 VGPR_32:$src)),
+ (V_AND_B32_e64 (S_MOV_B32 (i32 0x00007fff)), VGPR_32:$src)
+>;
+
+def : GCNPat <
+ (fneg (bf16 VGPR_32:$src)),
+ (V_XOR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src)
+>;
+
+def : GCNPat <
+ (fneg (fabs (bf16 VGPR_32:$src))),
+ (V_OR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit
+>;
+
def : GCNPat <
(fneg (v2f16 VGPR_32:$src)),
(V_XOR_B32_e64 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src)
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 4c9c34de7194c..a86a3f6f279d7 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -17008,33 +17008,27 @@ define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) {
;
; GFX8-LABEL: s_fabs_bf16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v0, 0x7fff
-; GFX8-NEXT: v_and_b32_e32 v0, s0, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fabs_bf16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v0, s0, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fabs_bf16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_and_b32_e64 v0, 0x7fff, s0
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fabs_bf16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_and_b32_e64 v0, 0x7fff, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
; GFX11-NEXT: ; return to shader part epilog
%op = call bfloat @llvm.fabs.bf16(bfloat %a)
%cast = bitcast bfloat %op to i16
@@ -17059,25 +17053,25 @@ define bfloat @v_fneg_bf16(bfloat %a) {
; GFX8-LABEL: v_fneg_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fneg_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX9-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fneg_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX10-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fneg_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fneg bfloat %a
ret bfloat %op
@@ -17089,49 +17083,41 @@ declare i32 @llvm.amdgcn.readfirstlane(i32)
define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) {
; GCN-LABEL: s_fneg_bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
+; GCN-NEXT: v_mul_f32_e64 v0, -1.0, s0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_fneg_bf16:
; GFX7: ; %bb.0:
-; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
+; GFX7-NEXT: v_mul_f32_e64 v0, -1.0, s0
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fneg_bf16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff8000
-; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_xor_b32 s0, s0, 0x8000
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fneg_bf16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff8000
-; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_xor_b32 s0, s0, 0x8000
+; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fneg_bf16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_xor_b32_e64 v0, 0xffff8000, s0
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: s_xor_b32 s0, s0, 0x8000
+; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fneg_bf16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_xor_b32_e64 v0, 0xffff8000, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_xor_b32 s0, s0, 0x8000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
; GFX11-NEXT: ; return to shader part epilog
%op = fneg bfloat %a
%cast = bitcast bfloat %op to i16
@@ -17166,25 +17152,25 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
; GFX8-LABEL: v_fneg_fabs_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_or_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fneg_fabs_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_or_b32_e32 v0, 0xffff8000, v0
+; GFX9-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fneg_fabs_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_or_b32_e32 v0, 0xffff8000, v0
+; GFX10-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fneg_fabs_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_or_b32_e32 v0, 0xffff8000, v0
+; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
%op = fneg bfloat %fabs
@@ -17196,48 +17182,48 @@ define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
; GCN-LABEL: s_fneg_fabs_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GCN-NEXT: s_bitset0_b32 s0, 31
+; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GCN-NEXT: s_xor_b32 s0, s0, 0x80000000
+; GCN-NEXT: s_lshr_b32 s0, s0, 16
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_fneg_fabs_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX7-NEXT: s_bitset0_b32 s0, 31
+; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX7-NEXT: s_xor_b32 s0, s0, 0x80000000
+; GFX7-NEXT: s_lshr_b32 s0, s0, 16
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fneg_fabs_bf16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff8000
-; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_bitset1_b32 s0, 15
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fneg_fabs_bf16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff8000
-; GFX9-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_bitset1_b32 s0, 15
+; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fneg_fabs_bf16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_or_b32_e64 v0, 0xffff8000, s0
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: s_bitset1_b32 s0, 15
+; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fneg_fabs_bf16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_or_b32_e64 v0, 0xffff8000, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_bitset1_b32 s0, 15
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
; GFX11-NEXT: ; return to shader part epilog
%fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
%op = fneg bfloat %fabs
@@ -27280,34 +27266,27 @@ define bfloat @v_copysign_bf16_bf16(bfloat %mag, bfloat %sign) {
; GFX8-LABEL: v_copysign_bf16_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_bf16_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_copysign_bf16_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
-; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_copysign_bf16_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
ret bfloat %op
@@ -27339,36 +27318,29 @@ define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) {
; GFX8-LABEL: v_copysign_bf16_s_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff8000
-; GFX8-NEXT: v_and_b32_e32 v1, s4, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_movk_i32 s5, 0x7fff
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
+; GFX8-NEXT: v_bfi_b32 v0, s5, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_bf16_s_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000
-; GFX9-NEXT: v_and_b32_e32 v1, s4, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT: s_movk_i32 s5, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_bfi_b32 v0, s5, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_copysign_bf16_s_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e64 v1, 0xffff8000, s4
-; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_copysign_bf16_s_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e64 v1, 0xffff8000, s0
-; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
ret bfloat %op
@@ -27400,36 +27372,29 @@ define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) {
; GFX8-LABEL: v_copysign_s_bf16_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, 0x7fff
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff8000, v0
-; GFX8-NEXT: v_and_b32_e32 v1, s4, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: s_movk_i32 s5, 0x7fff
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
+; GFX8-NEXT: v_bfi_b32 v0, s5, v1, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_s_bf16_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff8000, v0
-; GFX9-NEXT: v_and_b32_e32 v1, s4, v1
-; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT: s_movk_i32 s5, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_bfi_b32 v0, s5, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_copysign_s_bf16_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff8000, v0
-; GFX10-NEXT: v_and_b32_e64 v1, 0x7fff, s4
-; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s4, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_copysign_s_bf16_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff8000, v0
-; GFX11-NEXT: v_and_b32_e64 v1, 0x7fff, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
ret bfloat %op
@@ -27461,35 +27426,32 @@ define bfloat @v_copysign_bf16_f32(bfloat %mag, float %sign.f32) {
; GFX8-LABEL: v_copysign_bf16_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX8-NEXT: v_and_b32_e32 v1, 0x80000000, v1
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_bf16_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX9-NEXT: v_and_b32_e32 v1, 0x80000000, v1
-; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_copysign_bf16_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0x80000000, v1
-; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_copysign_bf16_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v1, 0x80000000, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sign = fptrunc float %sign.f32 to bfloat
%op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
@@ -27522,35 +27484,32 @@ define bfloat @v_copysign_bf16_f64(bfloat %mag, double %sign.f64) {
; GFX8-LABEL: v_copysign_bf16_f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX8-NEXT: v_and_b32_e32 v1, 0x80000000, v2
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DW...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/91676
More information about the llvm-commits
mailing list