[llvm-branch-commits] [llvm] AMDGPU: Improve v32f16/v32bf16 copysign handling (PR #142177)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri May 30 08:55:16 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
<details>
<summary>Changes</summary>
---
Patch is 46.39 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142177.diff
3 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+688)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+307)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3535eb41682d9..1957e442dbabb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -759,7 +759,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// Can do this in one BFI plus a constant materialize.
setOperationAction(ISD::FCOPYSIGN,
{MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
- MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16},
+ MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
+ MVT::v32f16, MVT::v32bf16},
Custom);
setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
@@ -5943,7 +5944,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
- VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
+ VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
+ VT == MVT::v32bf16);
auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 4bbd170529ad0..7c89a41d62fbf 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -2562,6 +2562,694 @@ define <16 x bfloat> @v_copysign_v16bf16(<16 x bfloat> %mag, <16 x bfloat> %sign
ret <16 x bfloat> %result
}
+define <32 x bfloat> @v_copysign_v32bf16(<32 x bfloat> %mag, <32 x bfloat> %sign) {
+; GCN-LABEL: v_copysign_v32bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GCN-NEXT: v_bfe_u32 v32, v32, 16, 15
+; GCN-NEXT: v_and_b32_e32 v31, 0x8000, v31
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
+; GCN-NEXT: v_or_b32_e32 v31, v32, v31
+; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GCN-NEXT: v_bfe_u32 v30, v30, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
+; GCN-NEXT: v_or_b32_e32 v30, v30, v32
+; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GCN-NEXT: v_bfe_u32 v29, v29, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
+; GCN-NEXT: v_or_b32_e32 v29, v29, v32
+; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GCN-NEXT: v_bfe_u32 v28, v28, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
+; GCN-NEXT: v_or_b32_e32 v28, v28, v32
+; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GCN-NEXT: v_bfe_u32 v27, v27, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
+; GCN-NEXT: v_or_b32_e32 v27, v27, v32
+; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GCN-NEXT: v_bfe_u32 v26, v26, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
+; GCN-NEXT: v_or_b32_e32 v26, v26, v32
+; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GCN-NEXT: v_bfe_u32 v25, v25, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
+; GCN-NEXT: v_or_b32_e32 v25, v25, v32
+; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GCN-NEXT: v_bfe_u32 v24, v24, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
+; GCN-NEXT: v_or_b32_e32 v24, v24, v32
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_bfe_u32 v23, v23, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
+; GCN-NEXT: v_or_b32_e32 v23, v23, v32
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: v_bfe_u32 v22, v22, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
+; GCN-NEXT: v_or_b32_e32 v22, v22, v32
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_bfe_u32 v21, v21, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
+; GCN-NEXT: v_or_b32_e32 v21, v21, v32
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_bfe_u32 v20, v20, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
+; GCN-NEXT: v_or_b32_e32 v20, v20, v32
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_bfe_u32 v19, v19, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
+; GCN-NEXT: v_or_b32_e32 v19, v19, v32
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_bfe_u32 v18, v18, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
+; GCN-NEXT: v_or_b32_e32 v18, v18, v32
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_bfe_u32 v17, v17, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
+; GCN-NEXT: v_or_b32_e32 v17, v17, v32
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_bfe_u32 v16, v16, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
+; GCN-NEXT: v_or_b32_e32 v16, v16, v32
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_bfe_u32 v15, v15, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
+; GCN-NEXT: v_or_b32_e32 v15, v15, v32
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_bfe_u32 v14, v14, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
+; GCN-NEXT: v_or_b32_e32 v14, v14, v32
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_bfe_u32 v13, v13, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
+; GCN-NEXT: v_or_b32_e32 v13, v13, v32
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_bfe_u32 v12, v12, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
+; GCN-NEXT: v_or_b32_e32 v12, v12, v32
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_bfe_u32 v11, v11, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
+; GCN-NEXT: v_or_b32_e32 v11, v11, v32
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_bfe_u32 v10, v10, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
+; GCN-NEXT: v_or_b32_e32 v10, v10, v32
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_bfe_u32 v9, v9, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
+; GCN-NEXT: v_or_b32_e32 v9, v9, v32
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_bfe_u32 v8, v8, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
+; GCN-NEXT: v_or_b32_e32 v8, v8, v32
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_bfe_u32 v7, v7, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
+; GCN-NEXT: v_or_b32_e32 v7, v7, v32
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_bfe_u32 v6, v6, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
+; GCN-NEXT: v_or_b32_e32 v6, v6, v32
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_bfe_u32 v5, v5, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
+; GCN-NEXT: v_or_b32_e32 v5, v5, v32
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_bfe_u32 v4, v4, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
+; GCN-NEXT: v_or_b32_e32 v4, v4, v32
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
+; GCN-NEXT: v_or_b32_e32 v3, v3, v32
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_bfe_u32 v2, v2, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GCN-NEXT: v_or_b32_e32 v2, v2, v32
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
+; GCN-NEXT: v_or_b32_e32 v1, v1, v32
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
+; GCN-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GCN-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GCN-NEXT: v_or_b32_e32 v0, v0, v32
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GCN-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v16
+; GCN-NEXT: v_lshlrev_b32_e32 v17, 16, v17
+; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; GCN-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; GCN-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v22
+; GCN-NEXT: v_lshlrev_b32_e32 v23, 16, v23
+; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v24
+; GCN-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GCN-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GCN-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GCN-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_copysign_v32bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
+; GFX7-NEXT: v_bfe_u32 v30, v30, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
+; GFX7-NEXT: v_bfe_u32 v29, v29, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
+; GFX7-NEXT: v_bfe_u32 v28, v28, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
+; GFX7-NEXT: v_bfe_u32 v27, v27, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
+; GFX7-NEXT: v_bfe_u32 v26, v26, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
+; GFX7-NEXT: v_bfe_u32 v25, v25, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
+; GFX7-NEXT: v_bfe_u32 v24, v24, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT: v_bfe_u32 v23, v23, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_bfe_u32 v22, v22, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GFX7-NEXT: v_bfe_u32 v21, v21, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GFX7-NEXT: v_bfe_u32 v20, v20, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GFX7-NEXT: v_bfe_u32 v19, v19, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GFX7-NEXT: v_bfe_u32 v18, v18, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GFX7-NEXT: v_bfe_u32 v17, v17, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GFX7-NEXT: v_bfe_u32 v16, v16, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_bfe_u32 v15, v15, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_bfe_u32 v14, v14, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_bfe_u32 v13, v13, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_bfe_u32 v12, v12, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_bfe_u32 v11, v11, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_bfe_u32 v10, v10, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_bfe_u32 v9, v9, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_bfe_u32 v8, v8, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_bfe_u32 v7, v7, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: v_bfe_u32 v32, v32, 16, 15
+; GFX7-NEXT: v_and_b32_e32 v31, 0x8000, v31
+; GFX7-NEXT: v_or_b32_e32 v31, v32, v31
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX7-NEXT: v_lshlrev_b32_e32 v31, 16, v31
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GFX7-NEXT: v_or_b32_e32 v30, v30, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
+; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GFX7-NEXT: v_or_b32_e32 v29, v29, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GFX7-NEXT: v_lshlrev_b32_e32 v29, 16, v29
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GFX7-NEXT: v_or_b32_e32 v28, v28, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
+; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GFX7-NEXT: v_or_b32_e32 v27, v27, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GFX7-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0x8000, v32
+; GFX7-NEXT: v_or_b32_e32 v26, v26, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
+; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; GFX7-NEXT: s_w...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/142177
More information about the llvm-branch-commits mailing list