[llvm] [AMDGPU] fcanonicalize.bf16.ll - regenerate test checks (PR #161026)

via llvm-commits llvm-commits at lists.llvm.org
Sat Sep 27 14:39:40 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Simon Pilgrim (RKSimon)

<details>
<summary>Changes</summary>



---

Patch is 111.04 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/161026.diff


1 File Affected:

- (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll (+950-895) 


``````````diff
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll
index a4cdb0387df9a..d747fb7cce7dc 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll
@@ -15,67 +15,67 @@ declare <32 x bfloat> @llvm.canonicalize.v32bf16(<32 x bfloat>) #0
 declare <64 x bfloat> @llvm.canonicalize.v64bf16(<64 x bfloat>) #0
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
-; GFX1250-LABEL:     test_fold_canonicalize_undef_value_bf16:
-; GFX1250:           %bb.0:
-; GFX1250-NEXT:        s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-NEXT:        v_mov_b32_e32 v0, 0
-; GFX1250-NEXT:        s_wait_kmcnt 0x0
-; GFX1250-NEXT:        global_store_b16 v0, v0, s[0:1]
-; GFX1250-NEXT:        s_endpgm
 define amdgpu_kernel void @test_fold_canonicalize_undef_value_bf16(ptr addrspace(1) %out) #1 {
+; GFX1250-LABEL: test_fold_canonicalize_undef_value_bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_store_b16 v0, v0, s[0:1]
+; GFX1250-NEXT:    s_endpgm
   %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat undef)
   store bfloat %canonicalized, ptr addrspace(1) %out
   ret void
 }
 
-; GFX1250-LABEL:    v_test_canonicalize_var_bf16:
-; GFX1250:          ; %bb.0:
-; GFX1250-NEXT:       s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-NEXT:       v_mov_b32_e32 v0, 0
-; GFX1250-NEXT:       s_wait_kmcnt 0x0
-; GFX1250-NEXT:       global_load_u16 v0, v0, s[0:1]
-; GFX1250-NEXT:       s_wait_loadcnt 0x0
-; GFX1250-NEXT:       v_lshlrev_b32_e32 v0, 16, v0
-; GFX1250-NEXT:       s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:       v_max_num_f32_e32 v0, v0, v0
-; GFX1250-NEXT:       v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX1250-NEXT:       global_store_b16 v[0:1], v0, off
-; GFX1250-NEXT:       s_endpgm
 define amdgpu_kernel void @v_test_canonicalize_var_bf16(ptr addrspace(1) %out) #1 {
+; GFX1250-LABEL: v_test_canonicalize_var_bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_u16 v0, v0, s[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_max_num_f32_e32 v0, v0, v0
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX1250-NEXT:    s_endpgm
   %val = load bfloat, ptr addrspace(1) %out
   %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val)
   store bfloat %canonicalized, ptr addrspace(1) poison
   ret void
 }
 
-; GFX1250-LABEL:     s_test_canonicalize_var_bf16:
-; GFX1250:           ; %bb.0:
-; GFX1250-NEXT:        s_load_b96 s[0:2], s[4:5], 0x24
-; GFX1250-NEXT:        v_mov_b32_e32 v1, 0
-; GFX1250-NEXT:        s_wait_kmcnt 0x0
-; GFX1250-NEXT:        s_lshl_b32 s2, s2, 16
-; GFX1250-NEXT:        s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:        v_max_num_f32_e64 v0, s2, s2
-; GFX1250-NEXT:        v_cvt_pk_bf16_f32 v0, v0, s0
-; GFX1250-NEXT:        global_store_b16 v1, v0, s[0:1]
-; GFX1250-NEXT:        s_endpgm
 define amdgpu_kernel void @s_test_canonicalize_var_bf16(ptr addrspace(1) %out, i16 zeroext %val.arg) #1 {
+; GFX1250-LABEL: s_test_canonicalize_var_bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_max_num_f32_e64 v0, s2, s2
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX1250-NEXT:    s_endpgm
   %val = bitcast i16 %val.arg to bfloat
   %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val)
   store bfloat %canonicalized, ptr addrspace(1) %out
   ret void
 }
 
-; GFX1250-LABEL:    v_test_canonicalize_build_vector_v2bf16:
-; GFX1250:          ; %bb.0:
-; GFX1250-NEXT:       s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT:       s_wait_kmcnt 0x0
-; GFX1250-NEXT:       v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0
-; GFX1250-NEXT:       s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:       v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0
-; GFX1250-NEXT:       v_cvt_pk_bf16_f32 v0, v0, v1
-; GFX1250-NEXT:       s_set_pc_i64 s[30:31]
 define <2 x bfloat> @v_test_canonicalize_build_vector_v2bf16(bfloat %lo, bfloat %hi) #1 {
+; GFX1250-LABEL: v_test_canonicalize_build_vector_v2bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %ins0 = insertelement <2 x bfloat> poison, bfloat %lo, i32 0
   %ins1 = insertelement <2 x bfloat> %ins0, bfloat %hi, i32 1
   %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %ins1)
@@ -83,22 +83,22 @@ define <2 x bfloat> @v_test_canonicalize_build_vector_v2bf16(bfloat %lo, bfloat
 }
 
 
-; GFX1250-LABEL:     v_test_canonicalize_fabs_var_bf16:
-; GFX1250:           ; %bb.0:
-; GFX1250-NEXT:         s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-NEXT:         v_mov_b32_e32 v0, 0
-; GFX1250-NEXT:         s_wait_kmcnt 0x0
-; GFX1250-NEXT:         global_load_u16 v1, v0, s[0:1]
-; GFX1250-NEXT:         s_wait_loadcnt 0x0
-; GFX1250-NEXT:         v_and_b32_e32 v1, 0x7fff, v1
-; GFX1250-NEXT:         s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:         v_lshlrev_b32_e32 v1, 16, v1
-; GFX1250-NEXT:         v_max_num_f32_e32 v1, v1, v1
-; GFX1250-NEXT:         s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT:         v_cvt_pk_bf16_f32 v1, v1, s0
-; GFX1250-NEXT:         global_store_b16 v0, v1, s[0:1]
-; GFX1250-NEXT:         s_endpgm
 define amdgpu_kernel void @v_test_canonicalize_fabs_var_bf16(ptr addrspace(1) %out) #1 {
+; GFX1250-LABEL: v_test_canonicalize_fabs_var_bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_u16 v1, v0, s[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1250-NEXT:    v_max_num_f32_e32 v1, v1, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX1250-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
   %val = load bfloat, ptr addrspace(1) %out
   %val.fabs = call bfloat @llvm.fabs.bf16(bfloat %val)
   %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fabs)
@@ -107,22 +107,22 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_bf16(ptr addrspace(1) %o
 }
 
 
-; GFX1250-LABEL:     v_test_canonicalize_fneg_fabs_var_bf16:
-; GFX1250:           ; %bb.0:
-; GFX1250-NEXT:        s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-NEXT:        v_mov_b32_e32 v0, 0
-; GFX1250-NEXT:        s_wait_kmcnt 0x0
-; GFX1250-NEXT:        global_load_u16 v1, v0, s[0:1]
-; GFX1250-NEXT:        s_wait_loadcnt 0x0
-; GFX1250-NEXT:        v_or_b32_e32 v1, 0x8000, v1
-; GFX1250-NEXT:        s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:        v_lshlrev_b32_e32 v1, 16, v1
-; GFX1250-NEXT:        v_max_num_f32_e32 v1, v1, v1
-; GFX1250-NEXT:        s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT:        v_cvt_pk_bf16_f32 v1, v1, s0
-; GFX1250-NEXT:        global_store_b16 v0, v1, s[0:1]
-; GFX1250-NEXT:        s_endpgm
 define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_bf16(ptr addrspace(1) %out) #1 {
+; GFX1250-LABEL: v_test_canonicalize_fneg_fabs_var_bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_u16 v1, v0, s[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_or_b32_e32 v1, 0x8000, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1250-NEXT:    v_max_num_f32_e32 v1, v1, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX1250-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
   %val = load bfloat, ptr addrspace(1) %out
   %val.fabs = call bfloat @llvm.fabs.bf16(bfloat %val)
   %val.fabs.fneg = fneg bfloat %val.fabs
@@ -131,22 +131,22 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_bf16(ptr addrspace(
   ret void
 }
 
-; GFX1250-LABEL:    v_test_canonicalize_fneg_var_bf16:
-; GFX1250:          ; %bb.0:
-; GFX1250-NEXT:        s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-NEXT:        v_mov_b32_e32 v0, 0
-; GFX1250-NEXT:        s_wait_kmcnt 0x0
-; GFX1250-NEXT:        global_load_u16 v1, v0, s[0:1]
-; GFX1250-NEXT:        s_wait_loadcnt 0x0
-; GFX1250-NEXT:        v_xor_b32_e32 v1, 0x8000, v1
-; GFX1250-NEXT:        s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:        v_lshlrev_b32_e32 v1, 16, v1
-; GFX1250-NEXT:        v_max_num_f32_e32 v1, v1, v1
-; GFX1250-NEXT:        s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT:        v_cvt_pk_bf16_f32 v1, v1, s0
-; GFX1250-NEXT:        global_store_b16 v0, v1, s[0:1]
-; GFX1250-NEXT:        s_endpgm
 define amdgpu_kernel void @v_test_canonicalize_fneg_var_bf16(ptr addrspace(1) %out) #1 {
+; GFX1250-LABEL: v_test_canonicalize_fneg_var_bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_u16 v1, v0, s[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1250-NEXT:    v_max_num_f32_e32 v1, v1, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX1250-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
   %val = load bfloat, ptr addrspace(1) %out
   %val.fneg = fneg bfloat %val
   %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fneg)
@@ -154,22 +154,22 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_bf16(ptr addrspace(1) %o
   ret void
 }
 
-; GFX1250-LABEL:      v_test_no_denormals_canonicalize_fneg_var_bf16:
-; GFX1250:            ; %bb.0:
-; GFX1250-NEXT:         s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-NEXT:         v_mov_b32_e32 v0, 0
-; GFX1250-NEXT:         s_wait_kmcnt 0x0
-; GFX1250-NEXT:         global_load_u16 v1, v0, s[0:1]
-; GFX1250-NEXT:         s_wait_loadcnt 0x0
-; GFX1250-NEXT:         v_xor_b32_e32 v1, 0x8000, v1
-; GFX1250-NEXT:         s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:         v_lshlrev_b32_e32 v1, 16, v1
-; GFX1250-NEXT:         v_max_num_f32_e32 v1, v1, v1
-; GFX1250-NEXT:         s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT:         v_cvt_pk_bf16_f32 v1, v1, s0
-; GFX1250-NEXT:         global_store_b16 v0, v1, s[0:1]
-; GFX1250-NEXT:         s_endpgm
 define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_bf16(ptr addrspace(1) %out) #2 {
+; GFX1250-LABEL: v_test_no_denormals_canonicalize_fneg_var_bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_u16 v1, v0, s[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1250-NEXT:    v_max_num_f32_e32 v1, v1, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX1250-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
   %val = load bfloat, ptr addrspace(1) %out
   %val.fneg = fneg bfloat %val
   %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fneg)
@@ -177,22 +177,22 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_bf16(ptr ad
   ret void
 }
 
-; GFX1250-LABEL:      v_test_no_denormals_canonicalize_fneg_fabs_var_bf16:
-; GFX1250: ;          %bb.0:
-; GFX1250-NEXT:         s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-NEXT:         v_mov_b32_e32 v0, 0
-; GFX1250-NEXT:         s_wait_kmcnt 0x0
-; GFX1250-NEXT:         global_load_u16 v1, v0, s[0:1]
-; GFX1250-NEXT:         s_wait_loadcnt 0x0
-; GFX1250-NEXT:         v_or_b32_e32 v1, 0x8000, v1
-; GFX1250-NEXT:         s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT:         v_lshlrev_b32_e32 v1, 16, v1
-; GFX1250-NEXT:         v_max_num_f32_e32 v1, v1, v1
-; GFX1250-NEXT:         s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT:         v_cvt_pk_bf16_f32 v1, v1, s0
-; GFX1250-NEXT:         global_store_b16 v0, v1, s[0:1]
-; GFX1250-NEXT:         s_endpgm
 define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_bf16(ptr addrspace(1) %out) #2 {
+; GFX1250-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_u16 v1, v0, s[0:1]
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_or_b32_e32 v1, 0x8000, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1250-NEXT:    v_max_num_f32_e32 v1, v1, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v1, v1, s0
+; GFX1250-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
   %val = load bfloat, ptr addrspace(1) %out
   %val.fabs = call bfloat @llvm.fabs.bf16(bfloat %val)
   %val.fabs.fneg = fneg bfloat %val.fabs
@@ -201,217 +201,231 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_bf16(p
   ret void
 }
 
+define amdgpu_kernel void @test_fold_canonicalize_p0_bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_fold_canonicalize_p0_bf16:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-NEXT: 	v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: 	s_wait_kmcnt 0x0
-; GFX1250-NEXT: 	global_store_b16 v0, v0, s[0:1]
-; GFX1250-NEXT: 	s_endpgm
- define amdgpu_kernel void @test_fold_canonicalize_p0_bf16(ptr addrspace(1) %out) #1 {
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_store_b16 v0, v0, s[0:1]
+; GFX1250-NEXT:    s_endpgm
    %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0.0)
    store bfloat %canonicalized, ptr addrspace(1) %out
    ret void
 }
-; GFX1250-LABEL: test_fold_canonicalize_n0_bf16:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000
-; GFX1250-NEXT: 	s_wait_kmcnt 0x0
-; GFX1250-NEXT: 	global_store_b16 v0, v1, s[0:1]
-; GFX1250-NEXT: 	s_endpgm
-; GFX1250-NEXT: .Lfunc_end10:
+
 define amdgpu_kernel void @test_fold_canonicalize_n0_bf16(ptr addrspace(1) %out) #1 {
+; GFX1250-LABEL: test_fold_canonicalize_n0_bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
   %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat -0.0)
   store bfloat %canonicalized, ptr addrspace(1) %out
   ret void
 }
-; GFX1250-LABEL: test_fold_canonicalize_p1_bf16:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3f80
-; GFX1250-NEXT: 	s_wait_kmcnt 0x0
-; GFX1250-NEXT: 	global_store_b16 v0, v1, s[0:1]
-; GFX1250-NEXT: 	s_endpgm
+
 define amdgpu_kernel void @test_fold_canonicalize_p1_bf16(ptr addrspace(1) %out) #1 {
+; GFX1250-LABEL: test_fold_canonicalize_p1_bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3f80
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
   %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 1.0)
   store bfloat %canonicalized, ptr addrspace(1) %out
   ret void
 }
-; GFX1250-LABEL: test_fold_canonicalize_n1_bf16:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbf80
-; GFX1250-NEXT: 	s_wait_kmcnt 0x0
-; GFX1250-NEXT: 	global_store_b16 v0, v1, s[0:1]
-; GFX1250-NEXT: 	s_endpgm
-; GFX1250-NEXT: .Lfunc_end12:
+
 define amdgpu_kernel void @test_fold_canonicalize_n1_bf16(ptr addrspace(1) %out) #1 {
+; GFX1250-LABEL: test_fold_canonicalize_n1_bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbf80
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
   %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat -1.0)
   store bfloat %canonicalized, ptr addrspace(1) %out
   ret void
 }
-; GFX1250-LABEL: test_fold_canonicalize_literal_bf16:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4180
-; GFX1250-NEXT: 	s_wait_kmcnt 0x0
-; GFX1250-NEXT: 	global_store_b16 v0, v1, s[0:1]
-; GFX1250-NEXT: 	s_endpgm
+
 define amdgpu_kernel void @test_fold_canonicalize_literal_bf16(ptr addrspace(1) %out) #1 {
+; GFX1250-LABEL: test_fold_canonicalize_literal_bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4180
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
   %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 16.0)
   store bfloat %canonicalized, ptr addrspace(1) %out
   ret void
 }
-; GFX1250-LABEL: test_default_denormals_fold_canonicalize_denormal0_bf16:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24
-; GFX1250-NEXT: 	v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff
-; GFX1250-NEXT: 	s_wait_kmcnt 0x0
-; GFX1250-NEXT: 	global_store_b16 v0, v1, s[0:1]
-; GFX1250-NEXT: 	s_endpgm
+
 define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_bf16(ptr addrspace(1) %out) #1 {
+; GFX1250-LABEL: test_default_denormals_fold_canonicalize_denormal0_bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
   %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR03FF)
   store bfloat %canonicalized, ptr addrspace(1) %out
   ret void
 }
-; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal0_bf16:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: 	s_load_b64 s[0:1], s[4:5], 0x24...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/161026


More information about the llvm-commits mailing list