[llvm] [AMDGPU] Make fneg/fabs/copysign legal for bf16 (PR #91676)
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Fri May 10 10:30:26 PDT 2024
https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/91676
From 9faa0108030f303130663886d91163f1651afce3 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Thu, 9 May 2024 15:33:54 -0700
Subject: [PATCH 1/3] [AMDGPU] Make fneg/fabs/copysign legal for bf16
These are just bit operations, exactly the same as with f16.
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 4 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 7 +-
llvm/lib/Target/AMDGPU/SIInstructions.td | 29 ++
llvm/test/CodeGen/AMDGPU/bf16.ll | 378 +++++++-----------
4 files changed, 188 insertions(+), 230 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 1e9132bcfaf93..d35a022ad6806 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -946,14 +946,14 @@ bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
// Packed operations do not have a fabs modifier.
return VT == MVT::f32 || VT == MVT::f64 ||
- (Subtarget->has16BitInsts() && VT == MVT::f16);
+ (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
}
bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
assert(VT.isFloatingPoint());
// Report this based on the end legalized type.
VT = VT.getScalarType();
- return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
+ return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
}
bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 33bdd6195a040..0a3a56e9b3a0b 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -225,10 +225,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::bf16, Promote);
AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
- // TODO: Could make these legal
- setOperationAction(ISD::FABS, MVT::bf16, Expand);
- setOperationAction(ISD::FNEG, MVT::bf16, Expand);
- setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand);
+ setOperationAction(ISD::FABS, MVT::bf16, Legal);
+ setOperationAction(ISD::FNEG, MVT::bf16, Legal);
+ setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Legal);
// We only need to custom lower because we can't specify an action for bf16
// sources.
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index cca8d96f29c0f..484ed62bfd7ca 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1947,6 +1947,20 @@ def : GCNPat <
(S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
>;
+def : GCNPat <
+ (UniformUnaryFrag<fneg> (bf16 SReg_32:$src)),
+ (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000)))
+>;
+
+def : GCNPat <
+ (UniformUnaryFrag<fabs> (bf16 SReg_32:$src)),
+ (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00007fff)))
+>;
+
+def : GCNPat <
+ (UniformUnaryFrag<fneg> (fabs (bf16 SReg_32:$src))),
+ (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
+>;
// COPY_TO_REGCLASS is needed to avoid using SCC from S_XOR_B32 instead
// of the real value.
@@ -2045,6 +2059,21 @@ def : GCNPat <
(V_OR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit
>;
+def : GCNPat <
+ (fabs (bf16 VGPR_32:$src)),
+ (V_AND_B32_e64 (S_MOV_B32 (i32 0x00007fff)), VGPR_32:$src)
+>;
+
+def : GCNPat <
+ (fneg (bf16 VGPR_32:$src)),
+ (V_XOR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src)
+>;
+
+def : GCNPat <
+ (fneg (fabs (bf16 VGPR_32:$src))),
+ (V_OR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit
+>;
+
def : GCNPat <
(fneg (v2f16 VGPR_32:$src)),
(V_XOR_B32_e64 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src)
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 4c9c34de7194c..a86a3f6f279d7 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -17008,33 +17008,27 @@ define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) {
;
; GFX8-LABEL: s_fabs_bf16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v0, 0x7fff
-; GFX8-NEXT: v_and_b32_e32 v0, s0, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fabs_bf16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v0, s0, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fabs_bf16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_and_b32_e64 v0, 0x7fff, s0
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fabs_bf16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_and_b32_e64 v0, 0x7fff, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
; GFX11-NEXT: ; return to shader part epilog
%op = call bfloat @llvm.fabs.bf16(bfloat %a)
%cast = bitcast bfloat %op to i16
@@ -17059,25 +17053,25 @@ define bfloat @v_fneg_bf16(bfloat %a) {
; GFX8-LABEL: v_fneg_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fneg_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX9-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fneg_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX10-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fneg_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
+; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fneg bfloat %a
ret bfloat %op
@@ -17089,49 +17083,41 @@ declare i32 @llvm.amdgcn.readfirstlane(i32)
define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) {
; GCN-LABEL: s_fneg_bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
+; GCN-NEXT: v_mul_f32_e64 v0, -1.0, s0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_fneg_bf16:
; GFX7: ; %bb.0:
-; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
+; GFX7-NEXT: v_mul_f32_e64 v0, -1.0, s0
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fneg_bf16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff8000
-; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_xor_b32 s0, s0, 0x8000
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fneg_bf16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff8000
-; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_xor_b32 s0, s0, 0x8000
+; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fneg_bf16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_xor_b32_e64 v0, 0xffff8000, s0
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: s_xor_b32 s0, s0, 0x8000
+; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fneg_bf16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_xor_b32_e64 v0, 0xffff8000, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_xor_b32 s0, s0, 0x8000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
; GFX11-NEXT: ; return to shader part epilog
%op = fneg bfloat %a
%cast = bitcast bfloat %op to i16
@@ -17166,25 +17152,25 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
; GFX8-LABEL: v_fneg_fabs_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_or_b32_e32 v0, 0xffff8000, v0
+; GFX8-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fneg_fabs_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_or_b32_e32 v0, 0xffff8000, v0
+; GFX9-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fneg_fabs_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_or_b32_e32 v0, 0xffff8000, v0
+; GFX10-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fneg_fabs_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_or_b32_e32 v0, 0xffff8000, v0
+; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
%op = fneg bfloat %fabs
@@ -17196,48 +17182,48 @@ define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
; GCN-LABEL: s_fneg_fabs_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GCN-NEXT: s_bitset0_b32 s0, 31
+; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GCN-NEXT: s_xor_b32 s0, s0, 0x80000000
+; GCN-NEXT: s_lshr_b32 s0, s0, 16
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_fneg_fabs_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX7-NEXT: s_bitset0_b32 s0, 31
+; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000
+; GFX7-NEXT: s_xor_b32 s0, s0, 0x80000000
+; GFX7-NEXT: s_lshr_b32 s0, s0, 16
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fneg_fabs_bf16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff8000
-; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_bitset1_b32 s0, 15
+; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_fneg_fabs_bf16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff8000
-; GFX9-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_bitset1_b32 s0, 15
+; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fneg_fabs_bf16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_or_b32_e64 v0, 0xffff8000, s0
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: s_bitset1_b32 s0, 15
+; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fneg_fabs_bf16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_or_b32_e64 v0, 0xffff8000, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_bitset1_b32 s0, 15
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
; GFX11-NEXT: ; return to shader part epilog
%fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
%op = fneg bfloat %fabs
@@ -27280,34 +27266,27 @@ define bfloat @v_copysign_bf16_bf16(bfloat %mag, bfloat %sign) {
; GFX8-LABEL: v_copysign_bf16_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_bf16_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_copysign_bf16_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
-; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_copysign_bf16_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
ret bfloat %op
@@ -27339,36 +27318,29 @@ define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) {
; GFX8-LABEL: v_copysign_bf16_s_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, 0xffff8000
-; GFX8-NEXT: v_and_b32_e32 v1, s4, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_movk_i32 s5, 0x7fff
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
+; GFX8-NEXT: v_bfi_b32 v0, s5, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_bf16_s_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000
-; GFX9-NEXT: v_and_b32_e32 v1, s4, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT: s_movk_i32 s5, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_bfi_b32 v0, s5, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_copysign_bf16_s_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e64 v1, 0xffff8000, s4
-; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_copysign_bf16_s_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e64 v1, 0xffff8000, s0
-; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
ret bfloat %op
@@ -27400,36 +27372,29 @@ define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) {
; GFX8-LABEL: v_copysign_s_bf16_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, 0x7fff
-; GFX8-NEXT: v_and_b32_e32 v0, 0xffff8000, v0
-; GFX8-NEXT: v_and_b32_e32 v1, s4, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: s_movk_i32 s5, 0x7fff
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
+; GFX8-NEXT: v_bfi_b32 v0, s5, v1, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_s_bf16_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff8000, v0
-; GFX9-NEXT: v_and_b32_e32 v1, s4, v1
-; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT: s_movk_i32 s5, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_bfi_b32 v0, s5, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_copysign_s_bf16_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff8000, v0
-; GFX10-NEXT: v_and_b32_e64 v1, 0x7fff, s4
-; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s4, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_copysign_s_bf16_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff8000, v0
-; GFX11-NEXT: v_and_b32_e64 v1, 0x7fff, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
ret bfloat %op
@@ -27461,35 +27426,32 @@ define bfloat @v_copysign_bf16_f32(bfloat %mag, float %sign.f32) {
; GFX8-LABEL: v_copysign_bf16_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX8-NEXT: v_and_b32_e32 v1, 0x80000000, v1
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_bf16_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX9-NEXT: v_and_b32_e32 v1, 0x80000000, v1
-; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_copysign_bf16_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0x80000000, v1
-; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_copysign_bf16_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v1, 0x80000000, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sign = fptrunc float %sign.f32 to bfloat
%op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
@@ -27522,35 +27484,32 @@ define bfloat @v_copysign_bf16_f64(bfloat %mag, double %sign.f64) {
; GFX8-LABEL: v_copysign_bf16_f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX8-NEXT: v_and_b32_e32 v1, 0x80000000, v2
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_bf16_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX9-NEXT: v_and_b32_e32 v1, 0x80000000, v2
-; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_copysign_bf16_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX10-NEXT: v_and_b32_e32 v1, 0x80000000, v2
-; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_copysign_bf16_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v1, 0x80000000, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sign = fptrunc double %sign.f64 to bfloat
%op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
@@ -27583,34 +27542,27 @@ define bfloat @v_copysign_bf16_f16(bfloat %mag, half %sign.f16) {
; GFX8-LABEL: v_copysign_bf16_f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
-; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_bf16_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
-; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_copysign_bf16_f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
-; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_copysign_bf16_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff8000, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sign = bitcast half %sign.f16 to bfloat
%op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign)
@@ -27640,41 +27592,37 @@ define amdgpu_ps i32 @s_copysign_bf16_bf16(bfloat inreg %mag, bfloat inreg %sign
;
; GFX8-LABEL: s_copysign_bf16_bf16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff8000
-; GFX8-NEXT: v_mov_b32_e32 v1, 0x7fff
-; GFX8-NEXT: v_and_b32_e32 v0, s1, v0
-; GFX8-NEXT: v_and_b32_e32 v1, s0, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: s_movk_i32 s2, 0x7fff
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_bfi_b32 v0, s2, v0, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_bf16_bf16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff8000
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v0, s1, v0
-; GFX9-NEXT: v_and_b32_e32 v1, s0, v1
-; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT: s_movk_i32 s2, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_copysign_bf16_bf16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_and_b32_e64 v0, 0xffff8000, s1
-; GFX10-NEXT: v_and_b32_e64 v1, 0x7fff, s0
-; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s1
+; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_copysign_bf16_bf16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s1
-; GFX11-NEXT: v_and_b32_e64 v1, 0x7fff, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
@@ -27709,46 +27657,39 @@ define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f
;
; GFX8-LABEL: s_copysign_bf16_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v0, 0x7fff
-; GFX8-NEXT: v_and_b32_e32 v0, s0, v0
-; GFX8-NEXT: s_and_b32 s0, s1, 0x80000000
-; GFX8-NEXT: s_lshr_b32 s0, s0, 16
-; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT: v_lshrrev_b32_e64 v0, 16, s1
+; GFX8-NEXT: s_movk_i32 s1, 0x7fff
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: v_bfi_b32 v0, s1, v1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_bf16_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v0, s0, v0
-; GFX9-NEXT: s_and_b32 s0, s1, 0x80000000
-; GFX9-NEXT: s_lshr_b32 s0, s0, 16
-; GFX9-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX9-NEXT: v_lshrrev_b32_e64 v0, 16, s1
+; GFX9-NEXT: s_movk_i32 s1, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_bfi_b32 v0, s1, v1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_copysign_bf16_f32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_and_b32_e64 v0, 0x7fff, s0
-; GFX10-NEXT: s_and_b32 s0, s1, 0x80000000
-; GFX10-NEXT: s_lshr_b32 s0, s0, 16
-; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT: v_lshrrev_b32_e64 v0, 16, s1
+; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_copysign_bf16_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_and_b32_e64 v0, 0x7fff, s0
-; GFX11-NEXT: s_and_b32 s0, s1, 0x80000000
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshr_b32 s0, s0, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX11-NEXT: v_lshrrev_b32_e64 v0, 16, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%sign = fptrunc float %sign.f32 to bfloat
@@ -27782,46 +27723,39 @@ define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign.
;
; GFX8-LABEL: s_copysign_bf16_f64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v0, 0x7fff
-; GFX8-NEXT: v_and_b32_e32 v0, s0, v0
-; GFX8-NEXT: s_and_b32 s0, s2, 0x80000000
-; GFX8-NEXT: s_lshr_b32 s0, s0, 16
-; GFX8-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX8-NEXT: v_lshrrev_b32_e64 v0, 16, s2
+; GFX8-NEXT: s_movk_i32 s1, 0x7fff
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: v_bfi_b32 v0, s1, v1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_bf16_f64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v0, s0, v0
-; GFX9-NEXT: s_and_b32 s0, s2, 0x80000000
-; GFX9-NEXT: s_lshr_b32 s0, s0, 16
-; GFX9-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX9-NEXT: v_lshrrev_b32_e64 v0, 16, s2
+; GFX9-NEXT: s_movk_i32 s1, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: v_bfi_b32 v0, s1, v1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_copysign_bf16_f64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_and_b32_e64 v0, 0x7fff, s0
-; GFX10-NEXT: s_and_b32 s0, s2, 0x80000000
-; GFX10-NEXT: s_lshr_b32 s0, s0, 16
-; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX10-NEXT: v_lshrrev_b32_e64 v0, 16, s2
+; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_copysign_bf16_f64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_and_b32_e64 v0, 0x7fff, s0
-; GFX11-NEXT: s_and_b32 s0, s2, 0x80000000
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshr_b32 s0, s0, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX11-NEXT: v_lshrrev_b32_e64 v0, 16, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%sign = fptrunc double %sign.f64 to bfloat
@@ -27855,41 +27789,37 @@ define amdgpu_ps i32 @s_copysign_bf16_f16(bfloat inreg %mag, half inreg %sign.f1
;
; GFX8-LABEL: s_copysign_bf16_f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff8000
-; GFX8-NEXT: v_mov_b32_e32 v1, 0x7fff
-; GFX8-NEXT: v_and_b32_e32 v0, s1, v0
-; GFX8-NEXT: v_and_b32_e32 v1, s0, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: s_movk_i32 s2, 0x7fff
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_bfi_b32 v0, s2, v0, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_bf16_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff8000
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v0, s1, v0
-; GFX9-NEXT: v_and_b32_e32 v1, s0, v1
-; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX9-NEXT: s_movk_i32 s2, 0x7fff
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_copysign_bf16_f16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_and_b32_e64 v0, 0xffff8000, s1
-; GFX10-NEXT: v_and_b32_e64 v1, 0x7fff, s0
-; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s1
+; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_copysign_bf16_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s1
-; GFX11-NEXT: v_and_b32_e64 v1, 0x7fff, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
@@ -33706,7 +33636,7 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX8-NEXT: v_xor_b32_e32 v1, 0x8000, v1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -33715,7 +33645,7 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX9-NEXT: v_xor_b32_e32 v1, 0x8000, v1
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -33724,7 +33654,7 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX10-NEXT: v_xor_b32_e32 v1, 0x8000, v1
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -33733,7 +33663,7 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1
+; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
@@ -33770,7 +33700,7 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2
+; GFX8-NEXT: v_xor_b32_e32 v2, 0x8000, v2
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -33779,7 +33709,7 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2
+; GFX9-NEXT: v_xor_b32_e32 v2, 0x8000, v2
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -33788,7 +33718,7 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2
+; GFX10-NEXT: v_xor_b32_e32 v2, 0x8000, v2
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -33797,7 +33727,7 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2
+; GFX11-NEXT: v_xor_b32_e32 v2, 0x8000, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
>From 687f5681bac1989880bccbdd675ad3c8c44ec3eb Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Fri, 10 May 2024 00:34:44 -0700
Subject: [PATCH 2/3] Collapse patterns with a td loop
---
llvm/lib/Target/AMDGPU/SIInstructions.td | 23 +++++------------------
1 file changed, 5 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 484ed62bfd7ca..1b0fab2f851dc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2044,35 +2044,22 @@ def : GCNPat <
(V_XOR_B32_e64 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src)
>;
+foreach fp16vt = [f16, bf16] in {
def : GCNPat <
- (fabs (f16 VGPR_32:$src)),
- (V_AND_B32_e64 (S_MOV_B32 (i32 0x00007fff)), VGPR_32:$src)
->;
-
-def : GCNPat <
- (fneg (f16 VGPR_32:$src)),
- (V_XOR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src)
->;
-
-def : GCNPat <
- (fneg (fabs (f16 VGPR_32:$src))),
- (V_OR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit
->;
-
-def : GCNPat <
- (fabs (bf16 VGPR_32:$src)),
+ (fabs (fp16vt VGPR_32:$src)),
(V_AND_B32_e64 (S_MOV_B32 (i32 0x00007fff)), VGPR_32:$src)
>;
def : GCNPat <
- (fneg (bf16 VGPR_32:$src)),
+ (fneg (fp16vt VGPR_32:$src)),
(V_XOR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src)
>;
def : GCNPat <
- (fneg (fabs (bf16 VGPR_32:$src))),
+ (fneg (fabs (fp16vt VGPR_32:$src))),
(V_OR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit
>;
+} // End foreach fp16vt = ...
def : GCNPat <
(fneg (v2f16 VGPR_32:$src)),
>From 00362472fe8e9ebe8b9c7ff3f73a69236c629ec6 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Fri, 10 May 2024 10:30:05 -0700
Subject: [PATCH 3/3] Collapse 3 more patterns with a td loop
---
llvm/lib/Target/AMDGPU/SIInstructions.td | 22 +++++-----------------
1 file changed, 5 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 1b0fab2f851dc..7592f8da1b4f2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1908,20 +1908,22 @@ def : GCNPat <
(S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000)))
>;
+foreach fp16vt = [f16, bf16] in {
def : GCNPat <
- (UniformUnaryFrag<fneg> (f16 SReg_32:$src)),
+ (UniformUnaryFrag<fneg> (fp16vt SReg_32:$src)),
(S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000)))
>;
def : GCNPat <
- (UniformUnaryFrag<fabs> (f16 SReg_32:$src)),
+ (UniformUnaryFrag<fabs> (fp16vt SReg_32:$src)),
(S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00007fff)))
>;
def : GCNPat <
- (UniformUnaryFrag<fneg> (fabs (f16 SReg_32:$src))),
+ (UniformUnaryFrag<fneg> (fabs (fp16vt SReg_32:$src))),
(S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
>;
+} // End foreach fp16vt = ...
def : GCNPat <
(UniformUnaryFrag<fneg> (v2f16 SReg_32:$src)),
@@ -1947,20 +1949,6 @@ def : GCNPat <
(S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
>;
-def : GCNPat <
- (UniformUnaryFrag<fneg> (bf16 SReg_32:$src)),
- (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000)))
->;
-
-def : GCNPat <
- (UniformUnaryFrag<fabs> (bf16 SReg_32:$src)),
- (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00007fff)))
->;
-
-def : GCNPat <
- (UniformUnaryFrag<fneg> (fabs (bf16 SReg_32:$src))),
- (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
->;
// COPY_TO_REGCLASS is needed to avoid using SCC from S_XOR_B32 instead
// of the real value.
More information about the llvm-commits mailing list