[llvm-branch-commits] [llvm] AMDGPU: Add pattern for copysign of 0 (PR #172699)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Dec 17 09:32:42 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
<details>
<summary>Changes</summary>
Avoiding v_bfi_b32 is desirable, since on gfx9 it
requires materializing the constant.
Something similar could be done for infinity, with an `or` of 0x7fffffff.
---
Patch is 157.86 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/172699.diff
7 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+8)
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+88)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+43-76)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+49-87)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll (+8-13)
- (modified) llvm/test/CodeGen/AMDGPU/frem.ll (+418-495)
- (modified) llvm/test/CodeGen/AMDGPU/wave32.ll (+4-4)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 628b972f97086..2ec3ec8674488 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -956,6 +956,9 @@ def InlineImmFP64 : FPImmLeaf<f64, [{
return isInlineImmediate(Imm);
}]>;
+def fpimm_pos_zero : FPImmLeaf<fAny, [{
+ return Imm.isZero() && !Imm.isNegative();
+}]>;
class VGPRImm <dag frag> : PatLeaf<frag, [{
return isVGPRImm(N);
@@ -984,6 +987,11 @@ def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{
return fp16SrcZerosHighBits(N->getOpcode());
}]>;
+
+class build_vector_fpimm_pos_zero_v2<VTVec vec> : PatLeaf<
+ (vec (build_vector (vec.ElementType fpimm_pos_zero),
+ (vec.ElementType fpimm_pos_zero)))>;
+
def MFMALdScaleXForm : SDNodeXForm<timm, [{
unsigned Val = N->getZExtValue();
unsigned New = 0;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index ac6f35a997264..22564b8347110 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2275,12 +2275,34 @@ def : GCNPat <
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
>;
+def : GCNPat <
+ (UniformBinFrag<fcopysign> (fp16vt fpimm_pos_zero), fp16vt:$src1),
+ (S_AND_B32 (S_MOV_B32 (i32 0x00008000)), SReg_32:$src1)
+>;
+
+def : GCNPat <
+ (fcopysign (fp16vt fpimm_pos_zero), fp16vt:$src1),
+ (V_AND_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src1)
+>;
+
def : GCNPat <
(fcopysign f32:$src0, fp16vt:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
(V_LSHLREV_B32_e64 (i32 16), $src1))
>;
+def : GCNPat <
+ (UniformBinFrag<fcopysign> (f32 fpimm_pos_zero), fp16vt:$src1),
+ (S_AND_B32 (S_MOV_B32 (i32 0x80000000)),
+ (S_LSHL_B32 SReg_32:$src1, (i32 16)))
+>;
+
+def : GCNPat <
+ (fcopysign (f32 fpimm_pos_zero), fp16vt:$src1),
+ (V_AND_B32_e32 (S_MOV_B32 (i32 0x80000000)),
+ (V_LSHLREV_B32_e64 (i32 16), VGPR_32:$src1))
+>;
+
def : GCNPat <
(fcopysign f64:$src0, fp16vt:$src1),
(REG_SEQUENCE SReg_64,
@@ -2295,6 +2317,18 @@ def : GCNPat <
(V_LSHRREV_B32_e64 (i32 16), $src1))
>;
+def : GCNPat <
+ (UniformBinFrag<fcopysign> (fp16vt fpimm_pos_zero), f32:$src1),
+ (S_AND_B32 (S_MOV_B32 (i32 0x00008000)),
+ (S_LSHR_B32 SReg_32:$src1, (i32 16)))
+>;
+
+def : GCNPat <
+ (fcopysign (fp16vt fpimm_pos_zero), f32:$src1),
+ (V_AND_B32_e32 (S_MOV_B32 (i32 0x00008000)),
+ (V_LSHRREV_B32_e64 (i32 16), VGPR_32:$src1))
+>;
+
def : GCNPat <
(fcopysign fp16vt:$src0, f64:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
@@ -2309,12 +2343,27 @@ def : GCNPat <
(REG_SEQUENCE VGPR_32, $src1, lo16, (i16 (IMPLICIT_DEF)), hi16)), lo16)
>;
+// TODO: Scalar case for 0 magnitude special case
+def : GCNPat <
+ (fcopysign (fp16vt fpimm_pos_zero), fp16vt:$src1),
+ (V_AND_B16_t16_e64 0, (S_MOV_B32 (i32 0x00008000)), 0, VGPR_16:$src1)
+>;
+
+// TODO: Handle 0 magnitude special case
def : GCNPat <
(fcopysign f32:$src0, fp16vt:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
(REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src1, hi16))
>;
+// TODO: Scalar case for 0 magnitude special case
+def : GCNPat <
+ (fcopysign (f32 fpimm_pos_zero), fp16vt:$src1),
+ (REG_SEQUENCE VGPR_32,
+ (V_MOV_B16_t16_e64 0, (i16 0), 0), lo16,
+ (V_AND_B16_t16_e64 0, (S_MOV_B32 (i32 0x00008000)), 0, VGPR_16:$src1), hi16)
+>;
+
def : GCNPat <
(fcopysign f64:$src0, fp16vt:$src1),
(REG_SEQUENCE VReg_64,
@@ -2330,6 +2379,13 @@ def : GCNPat <
(V_LSHRREV_B32_e64 (i32 16), $src1)), lo16)
>;
+// TODO: Scalar case for 0 magnitude special case
+def : GCNPat <
+ (fcopysign (fp16vt fpimm_pos_zero), f32:$src1),
+ (V_AND_B16_t16_e64 0, (S_MOV_B32 (i32 0x00008000)),
+ 0, (EXTRACT_SUBREG VGPR_32:$src1, hi16))
+>;
+
def : GCNPat <
(fcopysign fp16vt:$src0, f64:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)),
@@ -2347,6 +2403,16 @@ def : GCNPat <
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), $src0, $src1)
>;
+def : GCNPat <
+ (UniformBinFrag<fcopysign> build_vector_fpimm_pos_zero_v2<fp16vt>, fp16vt:$src1),
+ (S_AND_B32 (S_MOV_B32 (i32 0x80008000)), SReg_32:$src1)
+>;
+
+def : GCNPat <
+ (fcopysign build_vector_fpimm_pos_zero_v2<fp16vt>, fp16vt:$src1),
+ (V_AND_B32_e32 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src1)
+>;
+
}
/********** ================== **********/
@@ -2675,12 +2741,34 @@ def : AMDGPUPat <
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0, $src1)
>;
+def : AMDGPUPat <
+ (UniformBinFrag<fcopysign> (f32 fpimm_pos_zero), (f32 SReg_32:$src1)),
+ (S_AND_B32 (S_MOV_B32 (i32 0x80000000)), $src1)
+>;
+
+def : AMDGPUPat <
+ (fcopysign (f32 fpimm_pos_zero), (f32 VGPR_32:$src1)),
+ (V_AND_B32_e32 (S_MOV_B32 (i32 0x80000000)), $src1)
+>;
+
def : AMDGPUPat <
(fcopysign f32:$src0, f64:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
(i32 (EXTRACT_SUBREG SReg_64:$src1, sub1)))
>;
+def : AMDGPUPat <
+ (UniformBinFrag<fcopysign> (f32 fpimm_pos_zero), SReg_64:$src1),
+ (S_AND_B32 (i32 (S_MOV_B32 (i32 0x80000000))),
+ (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1)))
+>;
+
+def : AMDGPUPat <
+ (fcopysign (f32 fpimm_pos_zero), VReg_64:$src1),
+ (V_AND_B32_e32 (i32 (S_MOV_B32 (i32 0x80000000))),
+ (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1)))
+>;
+
def : AMDGPUPat <
(fcopysign f64:$src0, f64:$src1),
(REG_SEQUENCE SReg_64,
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 1fb87bf2c80a0..cdec7545ac411 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -8109,35 +8109,31 @@ define bfloat @v_copysign_bf16_0_bf16(bfloat %sign) {
; GFX8-LABEL: v_copysign_bf16_0_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_movk_i32 s4, 0x7fff
-; GFX8-NEXT: v_bfi_b32 v0, s4, 0, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 0x8000, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_bf16_0_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0x8000, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_copysign_bf16_0_bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, 0, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0x8000, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_copysign_bf16_0_bf16:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX11TRUE16-NEXT: v_and_b16 v0.l, 0x8000, v0.l
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_copysign_bf16_0_bf16:
; GFX11FAKE16: ; %bb.0:
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, v0
+; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0x8000, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call bfloat @llvm.copysign.bf16(bfloat 0.0, bfloat %sign)
ret bfloat %op
@@ -8232,33 +8228,28 @@ define bfloat @v_copysign_bf16_0_f32(float %sign) {
; GFX8-LABEL: v_copysign_bf16_0_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: s_movk_i32 s4, 0x7fff
-; GFX8-NEXT: v_bfi_b32 v0, s4, 0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, 0x8000
+; GFX8-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_bf16_0_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x8000
+; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_copysign_bf16_0_f32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, 0, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x8000
+; GFX10-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_copysign_bf16_0_f32:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX11TRUE16-NEXT: v_and_b16 v0.l, 0x8000, v0.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_copysign_bf16_0_f32:
@@ -8266,7 +8257,7 @@ define bfloat @v_copysign_bf16_0_f32(float %sign) {
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, v0
+; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0x8000, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%sign.trunc = fptrunc float %sign to bfloat
%op = call bfloat @llvm.copysign.bf16(bfloat 0.0, bfloat %sign.trunc)
@@ -8417,33 +8408,28 @@ define bfloat @v_copysign_bf16_0_f64(double %sign) {
; GFX8-LABEL: v_copysign_bf16_0_f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX8-NEXT: s_movk_i32 s4, 0x7fff
-; GFX8-NEXT: v_bfi_b32 v0, s4, 0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, 0x8000
+; GFX8-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_bf16_0_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x8000
+; GFX9-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_copysign_bf16_0_f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, 0, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0x8000
+; GFX10-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_copysign_bf16_0_f64:
; GFX11TRUE16: ; %bb.0:
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
-; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
+; GFX11TRUE16-NEXT: v_and_b16 v0.l, 0x8000, v1.h
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_copysign_bf16_0_f64:
@@ -8451,7 +8437,7 @@ define bfloat @v_copysign_bf16_0_f64(double %sign) {
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, v0
+; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0x8000, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%sign.trunc = fptrunc double %sign to bfloat
%op = call bfloat @llvm.copysign.bf16(bfloat 0.0, bfloat %sign.trunc)
@@ -8489,23 +8475,17 @@ define amdgpu_ps i32 @s_copysign_v2bf16_0_v2bf16(<2 x bfloat> inreg %sign) {
;
; GFX9-LABEL: s_copysign_v2bf16_0_v2bf16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s1, 0x7fff7fff
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_bfi_b32 v0, s1, 0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_and_b32 s0, 0x80008000, s0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_copysign_v2bf16_0_v2bf16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, s0
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: s_and_b32 s0, 0x80008000, s0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_copysign_v2bf16_0_v2bf16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_and_b32 s0, 0x80008000, s0
; GFX11-NEXT: ; return to shader part epilog
%op = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> %sign)
%cast = bitcast <2 x bfloat> %op to i32
@@ -8541,20 +8521,19 @@ define <2 x bfloat> @v_copysign_v2bf16_0_v2bf16(<2 x bfloat> %sign) {
; GFX9-LABEL: v_copysign_v2bf16_0_v2bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
-; GFX9-NEXT: v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0x80008000, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_copysign_v2bf16_0_v2bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0x80008000, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_copysign_v2bf16_0_v2bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0x80008000, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> %sign)
ret <2 x bfloat> %op
@@ -8622,10 +8601,7 @@ define amdgpu_ps i32 @s_copysign_v2bf16_0_v2f32(<2 x float> inreg %sign) {
; GFX9-NEXT: s_cselect_b32 s0, s3, s4
; GFX9-NEXT: s_lshr_b32 s0, s0, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX9-NEXT: s_mov_b32 s1, 0x7fff7fff
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_bfi_b32 v0, s1, 0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_and_b32 s0, 0x80008000, s0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_copysign_v2bf16_0_v2f32:
@@ -8647,8 +8623,7 @@ define amdgpu_ps i32 @s_copysign_v2bf16_0_v2f32(<2 x float> inreg %sign) {
; GFX10-NEXT: s_cselect_b32 s0, s0, s2
; GFX10-NEXT: s_lshr_b32 s0, s0, 16
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, s0
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: s_and_b32 s0, 0x80008000, s0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_copysign_v2bf16_0_v2f32:
@@ -8671,9 +8646,8 @@ define amdgpu_ps i32 @s_copysign_v2bf16_0_v2f32(<2 x float> inreg %sign) {
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_lshr_b32 s0, s0, 16
; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, s0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, 0x80008000, s0
; GFX11-NEXT: ; return to shader part epilog
%sign.trunc = fptrunc <2 x float> %sign to <2 x bfloat>
%op = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> %sign.trunc)
@@ -8735,8 +8709,7 @@ define <2 x bfloat> @v_copysign_v2bf16_0_v2bf32(<2 x float> %sign) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
-; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
-; GFX9-NEXT: v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0x80008000, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_copysign_v2bf16_0_v2bf32:
@@ -8753,7 +8726,7 @@ define <2 x bfloat> @v_copysign_v2bf16_0_v2bf32(<2 x float> %sign) {
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0x80008000, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_copysign_v2bf16_0_v2bf32:
@@ -8772,7 +8745,7 @@ define <2 x bfloat> @v_copysign_v2bf16_0_v2bf32(<2 x float> %sign) {
; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
-; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, v1
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0x80008000, v1
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_copysign_v2bf16_0_v2bf32:
@@ -8791,7 +8764,7 @@ define <2 x bfloat> @v_copysign_v2bf16_0_v2bf32(<2 x float> %sign) {
; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, v0
+; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0x80008000, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
%sign.trunc = fptrunc <2 x float> %sign to <2 x bfloat>
%op = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> %sign.trunc)
@@ -8829,25 +8802,20 @@ define amdgpu_ps i32 @s_copysign_v2bf16_0_v2f64(<2 x double> inreg %sign) {
; GFX9-LABEL: s_copysign_v2bf16_0_v2f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s1, s3
-; GFX9-NEXT: s_mov_b32 s1, 0x7fff7fff
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_bfi_b32 v0, s1, 0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_and_b32 s0, 0x80008000, s0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_copysign_v2bf16_0_v2f64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s1, s3
-; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, s0
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: s_and_b32 s0, 0x80008000, s0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_copysign_v2bf16_0_v2f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_pack_ll_b32_b16 s0, s1, s3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, s0
-; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, 0x80008000, s0
; GFX11-NEXT: ; return to shader part epilog
%sign.trunc = fptrunc <2 x double> %sign to <2 x bfloat>
%op = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> %sign.trunc)
@@ -8893,15 +8861,14 @@ define <2 x bfloat> @v_copysign_v2bf16_0_v2bf64(<2 x double> %sign) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v3, v1, s4
-; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
-; GFX9-NEXT: v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0x80008000, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_copysign_v2bf16_0_v2bf64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_perm_b32 v0, v3, v1, 0x5040100
-; GFX10-NEXT: v_bfi_b32 v0, 0x7fff7fff, 0, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0x80008000, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_copysign_v2bf16_0_v2bf64:
@@ -8909,7 +8876,7 @@ de...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/172699
More information about the llvm-branch-commits
mailing list