[llvm] [AMDGPU] Regenerate checks for test/CodeGen/AMDGPU/bf16.ll (PR #161069)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Sep 28 03:44:06 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: None (macurtis-amd)
<details>
<summary>Changes</summary>
Looks like there were some checks leftover from before the GFX1250TRUE16 run line was disabled. These were causing problems downstream. Not sure why update_llc_test_checks did not clean these up.
I removed all existing checks and re-ran update_llc_test_checks.
---
Patch is 238.40 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/161069.diff
1 Files Affected:
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+4659-83)
``````````diff
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 0490e5a19b4b7..94ba5cdd09df4 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -10908,12 +10908,13 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1250-LABEL: v_fadd_v2bf16:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v1
-; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = fadd <2 x bfloat> %a, %b
ret <2 x bfloat> %op
}
@@ -11446,13 +11447,14 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
; GFX1250-LABEL: v_fadd_v4bf16:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v2
-; GFX1250-NEXT: v_pk_add_bf16 v1, v1, v3
-; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v2
+; GFX1250-NEXT: v_pk_add_bf16 v1, v1, v3
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = fadd <4 x bfloat> %a, %b
ret <4 x bfloat> %op
}
@@ -49991,6 +49993,622 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
ret <4 x bfloat> %op
}
+define <8 x bfloat> @v_fma_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) {
+; GCN-LABEL: v_fma_v8bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
+; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
+; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
+; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
+; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
+; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
+; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
+; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_fma_f32 v7, v7, v15, v23
+; GCN-NEXT: v_fma_f32 v6, v6, v14, v22
+; GCN-NEXT: v_fma_f32 v5, v5, v13, v21
+; GCN-NEXT: v_fma_f32 v4, v4, v12, v20
+; GCN-NEXT: v_fma_f32 v3, v3, v11, v19
+; GCN-NEXT: v_fma_f32 v2, v2, v10, v18
+; GCN-NEXT: v_fma_f32 v1, v1, v9, v17
+; GCN-NEXT: v_fma_f32 v0, v0, v8, v16
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: v_fma_v8bf16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
+; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: v_fma_f32 v7, v7, v15, v23
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_fma_f32 v6, v6, v14, v15
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
+; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v21
+; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_fma_f32 v5, v5, v13, v14
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
+; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v20
+; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_fma_f32 v4, v4, v12, v13
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
+; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v19
+; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_fma_f32 v3, v3, v11, v12
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v18
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_fma_f32 v2, v2, v10, v11
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v17
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
+; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v16
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_fma_f32 v1, v1, v9, v11
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v10
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_fma_f32 v0, v0, v8, v9
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fma_v8bf16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v3
+; GFX8-NEXT: v_fma_f32 v12, v14, v13, v12
+; GFX8-NEXT: v_bfe_u32 v13, v12, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v12
+; GFX8-NEXT: s_movk_i32 s4, 0x7fff
+; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13
+; GFX8-NEXT: v_fma_f32 v3, v3, v7, v11
+; GFX8-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
+; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v11, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v2
+; GFX8-NEXT: v_fma_f32 v7, v13, v11, v7
+; GFX8-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v7
+; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11
+; GFX8-NEXT: v_fma_f32 v2, v2, v6, v10
+; GFX8-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v13, vcc
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v10, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX8-NEXT: v_fma_f32 v6, v11, v10, v6
+; GFX8-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v6
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10
+; GFX8-NEXT: v_fma_f32 v1, v1, v5, v9
+; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v0
+; GFX8-NEXT: v_fma_f32 v5, v10, v9, v5
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
+; GFX8-NEXT: v_fma_f32 v0, v0, v4, v8
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16
+; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16
+; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16
+; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_fma_v8bf16:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v7
+; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v3
+; GFX900-NEXT: v_fma_f32 v12, v14, v13, v12
+; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX900-NEXT: v_bfe_u32 v13, v12, 16, 1
+; GFX900-NEXT: s_movk_i32 s4, 0x7fff
+; GFX900-NEXT: v_fma_f32 v3, v3, v7, v11
+; GFX900-NEXT: v_add3_u32 v13, v13, v12, s4
+; GFX900-NEXT: v_or_b32_e32 v14, 0x400000, v12
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
+; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc
+; GFX900-NEXT: v_add3_u32 v7, v7, v3, s4
+; GFX900-NEXT: v_or_b32_e32 v11, 0x400000, v3
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX900-NEXT: v_cndmask_b32_e32 v3, v7, v11, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v10
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v6
+; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v2
+; GFX900-NEXT: v_fma_f32 v7, v13, v11, v7
+; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT: v_bfe_u32 v11, v7, 16, 1
+; GFX900-NEXT: v_fma_f32 v2, v2, v6, v10
+; GFX900-NEXT: v_add3_u32 v11, v11, v7, s4
+; GFX900-NEXT: v_or_b32_e32 v13, 0x400000, v7
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX900-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v7, v11, v13, vcc
+; GFX900-NEXT: v_add3_u32 v6, v6, v2, s4
+; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v2
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX900-NEXT: v_cndmask_b32_e32 v2, v6, v10, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v9
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v1
+; GFX900-NEXT: v_fma_f32 v6, v11, v10, v6
+; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX900-NEXT: v_bfe_u32 v10, v6, 16, 1
+; GFX900-NEXT: v_fma_f32 v1, v1, v5, v9
+; GFX900-NEXT: v_add3_u32 v10, v10, v6, s4
+; GFX900-NEXT: v_or_b32_e32 v11, 0x400000, v6
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc
+; GFX900-NEXT: v_add3_u32 v5, v5, v1, s4
+; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
+; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v8
+; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v4
+; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v0
+; GFX900-NEXT: v_fma_f32 v5, v10, v9, v5
+; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX900-NEXT: v_fma_f32 v0, v0, v4, v8
+; GFX900-NEXT: v_add3_u32 v9, v9, v5, s4
+; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4
+; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x7060302
+; GFX900-NEXT: v_perm_b32 v0, v0, v5, s4
+; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4
+; GFX900-NEXT: v_perm_b32 v2, v2, v7, s4
+; GFX900-NEXT: v_perm_b32 v3, v3, v12, s4
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fma_v8bf16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v11
+; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v7
+; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v3
+; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX950-NEXT: v_fmac_f32_e32 v12, v14, v13
+; GFX950-NEXT: v_fmac_f32_e32 v11, v3, v7
+; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v10
+; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v6
+; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v2
+; GFX950-NEXT: v_fmac_f32_e32 v3, v13, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v10
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX950-NEXT: v_fmac_f32_e32 v7, v2, v6
+; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v9
+; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v1
+; GFX950-NEXT: v_fmac_f32_e32 v2, v10, v6
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v9
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: v_fmac_f32_e32 v6, v1, v5
+; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v8
+; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v0
+; GFX950-NEXT: v_fmac_f32_e32 v1, v9, v5
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v8
+; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX950-NEXT: v_fmac_f32_e32 v5, v0, v4
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v5, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v6, v2
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v7, v3
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v11, v12
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fma_v8bf16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11
+; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v0
+; GFX10-NEXT: v_fmac_f32_e32 v12, v14, v13
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_fmac_f32_e32 v11, v3, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v10
+; GFX10-NEXT: v_bfe_u32 v13, v12, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v12
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX10-NEXT: v_add3_u32 v13, v13, v12, 0x7fff
+; GFX10-NEXT: v_fmac_f32_e32 v3, v14, v7
+; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
+; GFX10-NEXT: v_bfe_u32 v16, v11, 16, 1
+; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v10, v13, v15, vcc_lo
+; GFX10-NEXT: v_bfe_u32 v13, v3, 16, 1
+; GFX10-NEXT: v_fmac_f32_e32 v7, v2, v6
+; GFX10-NEXT: v_add3_u32 v12, v16, v11, 0x7fff
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v9
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX10-NEXT: v_add3_u32 v13, v13, v3, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v3
+; GFX10-NEXT: v_bfe_u32 v16, v7, 16, 1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_fmac_f32_e32 v2, v14, v6
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX10-NEXT: v_add3_u32 v6, v16, v7, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v15, vcc_lo
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v8
+; GFX10-NEXT: v_...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/161069
More information about the llvm-commits
mailing list