[llvm] [AMDGPU][True16][CodeGen] enable true16 for more codegen test patch 2 (PR #131210)

via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 13 13:48:22 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Brox Chen (broxigarchen)

<details>
<summary>Changes</summary>

This is an NFC (no functional change) patch.

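Enable true16 mode for more AMDGPU CodeGen tests by running them with both `-mattr=+real-true16` and `-mattr=-real-true16` and regenerating the check lines under separate TRUE16/FAKE16 prefixes.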

---

Patch is 972.16 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/131210.diff


23 Files Affected:

- (modified) llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll (+129-61) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll (+315-155) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll (+875-414) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll (+1405-356) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll (+753-383) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll (+753-383) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll (+141-66) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll (+82-39) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll (+129-61) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll (+84-40) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll (+82-39) 
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i16.ll (+261-123) 
- (modified) llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll (+1567-396) 
- (modified) llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll (+2437-651) 
- (modified) llvm/test/CodeGen/AMDGPU/select.f16.ll (+1315-668) 
- (modified) llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll (+116-40) 
- (modified) llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll (+94-43) 
- (modified) llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll (+156-42) 
- (modified) llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll (+334-29) 
- (modified) llvm/test/CodeGen/AMDGPU/strict_ldexp.f16.ll (+233-123) 
- (modified) llvm/test/CodeGen/AMDGPU/sub.v2i16.ll (+49-23) 
- (modified) llvm/test/CodeGen/AMDGPU/v_madak_f16.ll (+126-61) 
- (modified) llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll (+524-261) 


``````````diff
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index 867025adca944..644c88457714b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -3,8 +3,10 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX8 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX12 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX12,GFX12-FAKE16 %s
 
 define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX6-LABEL: cos_f16:
@@ -69,31 +71,57 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: cos_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cos_f16_e32 v1, v1
-; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: cos_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0.15915494, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cos_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: cos_f16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_cos_f16_e32 v1, v1
-; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX11-FAKE16-LABEL: cos_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cos_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: cos_f16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0.15915494, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cos_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: cos_f16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cos_f16_e32 v1, v1
+; GFX12-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %a.val = load half, ptr addrspace(1) %a
   %r.val = call half @llvm.cos.f16(half %a.val)
   store half %r.val, ptr addrspace(1) %r
@@ -184,42 +212,79 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: cos_v2f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
-; GFX11-NEXT:    v_cos_f16_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cos_f16_e32 v2, v2
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_pack_b32_f16 v1, v1, v2
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: cos_v2f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    global_load_b32 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0.15915494, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0.15915494, v2.l
+; GFX11-TRUE16-NEXT:    v_cos_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_cos_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: cos_v2f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
+; GFX11-FAKE16-NEXT:    v_cos_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_cos_f16_e32 v2, v2
+; GFX11-FAKE16-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX11-FAKE16-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: cos_v2f16:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_b32 v2, v1, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0.15915494, v2.l
+; GFX12-TRUE16-NEXT:    v_mul_f16_e32 v0.h, 0.15915494, v2.l
+; GFX12-TRUE16-NEXT:    ; kill: def $vgpr2 killed $vgpr2_lo16 killed $exec
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cos_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT:    v_cos_f16_e32 v0.h, v0.h
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1)
+; GFX12-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX12-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: cos_v2f16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX12-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
-; GFX12-NEXT:    v_cos_f16_e32 v1, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX12-NEXT:    v_cos_f16_e32 v2, v2
-; GFX12-NEXT:    v_pack_b32_f16 v1, v1, v2
-; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-FAKE16-LABEL: cos_v2f16:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX12-FAKE16-NEXT:    v_mul_f16_e32 v1, 0.15915494, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_mul_f16_e32 v2, 0.15915494, v2
+; GFX12-FAKE16-NEXT:    v_cos_f16_e32 v1, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cos_f16_e32 v2, v2
+; GFX12-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX12-FAKE16-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %a.val = load <2 x half>, ptr addrspace(1) %a
   %r.val = call <2 x half> @llvm.cos.v2f16(<2 x half> %a.val)
   store <2 x half> %r.val, ptr addrspace(1) %r
@@ -228,3 +293,6 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 
 declare half @llvm.cos.f16(half %a)
 declare <2 x half> @llvm.cos.v2f16(<2 x half> %a)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX11: {{.*}}
+; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
index 814f44477f528..61991c8b409dd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -5,8 +5,10 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=VI-DENORM %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-FLUSH %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-DENORM %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-FLUSH %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-DENORM %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-FLUSH,GFX11-FLUSH-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-FLUSH,GFX11-FLUSH-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-DENORM,GFX11-DENORM-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-DENORM,GFX11-DENORM-FAKE16 %s
 
 declare half @llvm.fmuladd.f16(half %a, half %b, half %c)
 declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
@@ -160,64 +162,122 @@ define amdgpu_kernel void @fmuladd_f16(
 ; GFX10-DENORM-NEXT:    buffer_store_short v2, off, s[0:3], 0
 ; GFX10-DENORM-NEXT:    s_endpgm
 ;
-; GFX11-FLUSH-LABEL: fmuladd_f16:
-; GFX11-FLUSH:       ; %bb.0:
-; GFX11-FLUSH-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-FLUSH-NEXT:    s_mov_b32 s10, -1
-; GFX11-FLUSH-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX11-FLUSH-NEXT:    s_mov_b32 s14, s10
-; GFX11-FLUSH-NEXT:    s_mov_b32 s15, s11
-; GFX11-FLUSH-NEXT:    s_mov_b32 s18, s10
-; GFX11-FLUSH-NEXT:    s_mov_b32 s19, s11
-; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT:    s_mov_b32 s12, s2
-; GFX11-FLUSH-NEXT:    s_mov_b32 s13, s3
-; GFX11-FLUSH-NEXT:    s_mov_b32 s16, s4
-; GFX11-FLUSH-NEXT:    s_mov_b32 s17, s5
-; GFX11-FLUSH-NEXT:    buffer_load_u16 v0, off, s[12:15], 0
-; GFX11-FLUSH-NEXT:    buffer_load_u16 v1, off, s[16:19], 0
-; GFX11-FLUSH-NEXT:    s_mov_b32 s4, s6
-; GFX11-FLUSH-NEXT:    s_mov_b32 s5, s7
-; GFX11-FLUSH-NEXT:    s_mov_b32 s6, s10
-; GFX11-FLUSH-NEXT:    s_mov_b32 s7, s11
-; GFX11-FLUSH-NEXT:    s_mov_b32 s8, s0
-; GFX11-FLUSH-NEXT:    buffer_load_u16 v2, off, s[4:7], 0
-; GFX11-FLUSH-NEXT:    s_mov_b32 s9, s1
-; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT:    v_add_f16_e32 v0, v0, v2
-; GFX11-FLUSH-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-FLUSH-NEXT:    s_endpgm
+; GFX11-FLUSH-TRUE16-LABEL: fmuladd_f16:
+; GFX11-FLUSH-TRUE16:       ; %bb.0:
+; GFX11-FLUSH-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s18, s10
+; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s19, s11
+; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s12, s2
+; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s13, s3
+; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s16, s4
+; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s17, s5
+; GFX11-FLUSH-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0
+; GFX11-FLUSH-TRUE16-NEXT:    buffer_load_u16 v1, off, s[16:19], 0
+; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s12, s6
+; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s13, s7
+; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s8, s0
+; GFX11-FLUSH-TRUE16-NEXT:    buffer_load_u16 v2, off, s[12:15], 0
+; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s9, s1
+; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FLUSH-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
+; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-FLUSH-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-FLUSH-TRUE16-NEXT:    s_endpgm
 ;
-; GFX11-DENORM-LABEL: fmuladd_f16:
-; GFX11-DENORM:       ; %bb.0:
-; GFX11-DENORM-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-DENORM-NEXT:    s_mov_b32 s10, -1
-; GFX11-DENORM-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX11-DENORM-NEXT:    s_mov_b32 s14, s10
-; GFX11-DENORM-NEXT:    s_mov_b32 s15, s11
-; GFX11-DENORM-NEXT:    s_mov_b32 s18, s10
-; GFX11-DENORM-NEXT:    s_mov_b32 s19, s11
-; GFX11-DENORM-NEXT:    s_mov_b32 s22, s10
-; GFX11-DENORM-NEXT:    s_mov_b32 s23, s11
-; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT:    s_mov_b32 s12, s2
-; GFX11-DENORM-NEXT:    s_mov_b32 s13, s3
-; GFX11-DENORM-NEXT:    s_mov_b32 s16, s4
-; GFX11-DENORM-NEXT:    s_mov_b32 s17, s5
-; GFX11-DENORM-NEXT:    s_mov_b32 s20, s6
-; GFX11-DENORM-NEXT:    s_mov_b32 s21, s7
-; GFX11-DENORM-NEXT:    buffer_load_u16 v0, off, s[12:15], 0
-; GFX11-DENORM-NEXT:    buffer_load_u16 v1, off, s[16:19], 0
-; GFX11-DENORM-NEXT:    buffer_load_u16 v2, off, s[20:23], 0
-; GFX11-DENORM-NEXT:    s_mov_b32 s8, s0
-; GFX11-DENORM-NEXT:    s_mov_b32 s9, s1
-; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v2, v0, v1
-; GFX11-DENORM-NEXT:    buffer_store_b16 v2, off, s[8:11], 0
-; GFX11-DENORM-NEXT:    s_endpgm
+; GFX11-FLUSH-FAKE16-LABEL: fmuladd_f16:
+; GFX11-FLUSH-FAKE16:       ; %bb.0:
+; GFX11-FLUSH-FAKE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-FLUSH-FAKE16-NEXT:    s_mov_b32 s10, -1
+; GFX11-FLUSH-FAKE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX11-FLUSH-FAKE16-NEXT:    s_mov_b32 s14, s10
+; GFX11-FLUSH-FAKE16-NEXT:    s_mov_b32 s15, s11
+; GFX11-FLUSH-FAKE16-NEXT:    s_mov_b32 s18, s10
+; GFX11-FLUSH-FAKE16-NEXT:    s_mov_b32 s19, s11
+; GFX11-FLUSH-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT:    s_mov_b32 s12, s2
+; GFX11-FLUSH-FAKE16-NEXT:    s_mov_b32 s13, s3
+; GFX11-FLUSH-FAKE16-NEXT:    s_mov_b32 s16, s4
+; GFX11-FLUSH-FAKE16-NEXT:    s_mov_b32 s17, s5
+; GFX11-FLUSH-FAKE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0
+; GFX11-FLUSH-FAKE16-NEXT:    buffer_load_u16 v1, off, s[16:19], 0
+; GFX11-FLUSH-FAKE16-NEXT:    s_mov_b32 s4, s6
+; GFX11-FLUSH-FAKE16-NEXT:    s_mov_b32 s5, s7
+; GFX11-FLUSH-FAKE16-NEXT:    s_mov_b32 s6, s10
+; GFX11-FLUSH-FAKE16-NEXT:    s_mov_b32 s7, s11
+; GFX11-FLUSH-FAKE16-NEXT:    s_mov_b32 s8, s0
+; GFX11-FLUSH-FAKE16-NEXT:    buffer_load_u16 v2, off, s[4:7], 0
+; GFX11-FLUSH-FAKE16-NEXT:    s_mov_b32 s9, s1
+; GFX11-FLUSH-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FLUSH-FAKE16-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-FLUSH-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX11-FLUSH-FAKE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-FLUSH-FAKE16-NEXT:    s_endpgm
+;
+; GFX11-DENORM-TRUE16-LABEL: fmuladd_f16:
+; GFX11-DENORM-TRUE16:       ; %bb.0:
+; GFX11-DENORM-TRUE16-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-DENORM-TRUE16-NEXT:    s_mov_b32 s10, -1
+; GFX11-DENORM-TRUE16-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX11-DENORM-TRUE16-NEXT:    s_mov_b32 s14, s10
+; GFX11-DENORM-TRUE16-NEXT:    s_mov_b32 s15, s11
+; GFX11-DENORM-TRUE16-NEXT:    s_mov_b32 s18, s10
+; GFX11-DENORM-TRUE16-NEXT:    s...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/131210


More information about the llvm-commits mailing list