[llvm] f9ab235 - [AMDGPU] Autogenerate the fmuladd.f16.ll and llvm.fmuladd.f16.ll codegen tests.

Ivan Kosarev via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 16 04:49:50 PDT 2023


Author: Ivan Kosarev
Date: 2023-08-16T12:49:45+01:00
New Revision: f9ab23531831ba578cd26b789acef98a2151f892

URL: https://github.com/llvm/llvm-project/commit/f9ab23531831ba578cd26b789acef98a2151f892
DIFF: https://github.com/llvm/llvm-project/commit/f9ab23531831ba578cd26b789acef98a2151f892.diff

LOG: [AMDGPU] Autogenerate the fmuladd.f16.ll and llvm.fmuladd.f16.ll codegen tests.

Reviewed By: Joe_Nash

Differential Revision: https://reviews.llvm.org/D157966

Added: 
    

Modified: 
    llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
index 198c0cac5cb47b..4a2f4c4437203b 100644
--- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
@@ -1,33 +1,129 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI-FLUSH,VI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI-FLUSH,VI %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-FLUSH %s
+; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-FLUSH %s
 
-; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-STRICT,VI-DENORM,VI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-DENORM %s
+; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=VI-DENORM,VI-DENORM-CONTRACT %s
 
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10-FLUSH,GFX10 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10-FLUSH,GFX10 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-STRICT,GFX10-DENORM-STRICT,GFX10-DENORM,GFX10 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-CONTRACT,GFX10-DENORM-CONTRACT,GFX10-DENORM,GFX10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-FLUSH %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-FLUSH %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DENORM,GFX10-DENORM-STRICT %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DENORM,GFX10-DENORM-CONTRACT %s
 
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10-FLUSH,GFX10 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10-FLUSH,GFX10 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-STRICT,GFX10-DENORM-STRICT,GFX10-DENORM,GFX10 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-CONTRACT,GFX10-DENORM-CONTRACT,GFX10-DENORM,GFX10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FLUSH %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FLUSH %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DENORM,GFX11-DENORM-STRICT %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DENORM,GFX11-DENORM-CONTRACT %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare half @llvm.fmuladd.f16(half, half, half) #1
 declare half @llvm.fabs.f16(half) #1
 
-; GCN-LABEL: {{^}}fmuladd_f16:
-; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-
-; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-
-; GFX10-FLUSH:  v_mul_f16_e32
-; GFX10-FLUSH:  v_add_f16_e32
-; GFX10-DENORM: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-
 define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
+; VI-FLUSH-LABEL: fmuladd_f16:
+; VI-FLUSH:       ; %bb.0:
+; VI-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s2
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v2, s4
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v3, s5
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v4, s6
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v5, s7
+; VI-FLUSH-NEXT:    flat_load_ushort v6, v[0:1]
+; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3]
+; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5]
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s0
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    v_mac_f16_e32 v3, v6, v2
+; VI-FLUSH-NEXT:    flat_store_short v[0:1], v3
+; VI-FLUSH-NEXT:    s_endpgm
+;
+; VI-DENORM-LABEL: fmuladd_f16:
+; VI-DENORM:       ; %bb.0:
+; VI-DENORM-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s2
+; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s3
+; VI-DENORM-NEXT:    v_mov_b32_e32 v2, s4
+; VI-DENORM-NEXT:    v_mov_b32_e32 v3, s5
+; VI-DENORM-NEXT:    v_mov_b32_e32 v4, s6
+; VI-DENORM-NEXT:    v_mov_b32_e32 v5, s7
+; VI-DENORM-NEXT:    flat_load_ushort v6, v[0:1]
+; VI-DENORM-NEXT:    flat_load_ushort v2, v[2:3]
+; VI-DENORM-NEXT:    flat_load_ushort v3, v[4:5]
+; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s0
+; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s1
+; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-NEXT:    v_fma_f16 v2, v6, v2, v3
+; VI-DENORM-NEXT:    flat_store_short v[0:1], v2
+; VI-DENORM-NEXT:    s_endpgm
+;
+; GFX10-FLUSH-LABEL: fmuladd_f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    s_clause 0x2
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[4:5]
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7]
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v3
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    s_endpgm
+;
+; GFX10-DENORM-LABEL: fmuladd_f16:
+; GFX10-DENORM:       ; %bb.0:
+; GFX10-DENORM-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DENORM-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-NEXT:    s_clause 0x2
+; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[4:5]
+; GFX10-DENORM-NEXT:    global_load_ushort v3, v0, s[6:7]
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v3, v1, v2
+; GFX10-DENORM-NEXT:    global_store_short v0, v3, s[0:1]
+; GFX10-DENORM-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-LABEL: fmuladd_f16:
+; GFX11-FLUSH:       ; %bb.0:
+; GFX11-FLUSH-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-NEXT:    s_clause 0x2
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[4:5]
+; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[6:7]
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v3
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT:    s_nop 0
+; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FLUSH-NEXT:    s_endpgm
+;
+; GFX11-DENORM-LABEL: fmuladd_f16:
+; GFX11-DENORM:       ; %bb.0:
+; GFX11-DENORM-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DENORM-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-NEXT:    s_clause 0x2
+; GFX11-DENORM-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX11-DENORM-NEXT:    global_load_u16 v2, v0, s[4:5]
+; GFX11-DENORM-NEXT:    global_load_u16 v3, v0, s[6:7]
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v3, v1, v2
+; GFX11-DENORM-NEXT:    global_store_b16 v0, v3, s[0:1]
+; GFX11-DENORM-NEXT:    s_nop 0
+; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-NEXT:    s_endpgm
                          ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
   %r0 = load half, ptr addrspace(1) %in1
   %r1 = load half, ptr addrspace(1) %in2
@@ -37,16 +133,146 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %
   ret void
 }
 
-; GCN-LABEL: {{^}}fmul_fadd_f16:
-; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-
-; VI-DENORM-CONTRACT: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-
-; GFX10-FLUSH:  v_mul_f16_e32
-; GFX10-FLUSH:  v_add_f16_e32
-; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-
 define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
+; VI-FLUSH-LABEL: fmul_fadd_f16:
+; VI-FLUSH:       ; %bb.0:
+; VI-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s2
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v2, s4
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v3, s5
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v4, s6
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v5, s7
+; VI-FLUSH-NEXT:    flat_load_ushort v6, v[0:1]
+; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3]
+; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5]
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s0
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    v_mac_f16_e32 v3, v6, v2
+; VI-FLUSH-NEXT:    flat_store_short v[0:1], v3
+; VI-FLUSH-NEXT:    s_endpgm
+;
+; VI-DENORM-CONTRACT-LABEL: fmul_fadd_f16:
+; VI-DENORM-CONTRACT:       ; %bb.0:
+; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v0, s2
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v2, s4
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v3, s5
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v4, s6
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v5, s7
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v6, v[0:1]
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3]
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5]
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v0, s0
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v6, v2, v3
+; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
+; VI-DENORM-CONTRACT-NEXT:    s_endpgm
+;
+; GFX10-FLUSH-LABEL: fmul_fadd_f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    s_clause 0x2
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[4:5]
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7]
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v3
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    s_endpgm
+;
+; GFX10-DENORM-STRICT-LABEL: fmul_fadd_f16:
+; GFX10-DENORM-STRICT:       ; %bb.0:
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    s_clause 0x2
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[4:5]
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[6:7]
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v3
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    s_endpgm
+;
+; GFX10-DENORM-CONTRACT-LABEL: fmul_fadd_f16:
+; GFX10-DENORM-CONTRACT:       ; %bb.0:
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    s_clause 0x2
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[4:5]
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[6:7]
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v3, v1, v2
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v3, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-LABEL: fmul_fadd_f16:
+; GFX11-FLUSH:       ; %bb.0:
+; GFX11-FLUSH-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-NEXT:    s_clause 0x2
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[4:5]
+; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[6:7]
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v3
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT:    s_nop 0
+; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FLUSH-NEXT:    s_endpgm
+;
+; GFX11-DENORM-STRICT-LABEL: fmul_fadd_f16:
+; GFX11-DENORM-STRICT:       ; %bb.0:
+; GFX11-DENORM-STRICT-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    s_clause 0x2
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[4:5]
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[6:7]
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v3
+; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT:    s_nop 0
+; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-STRICT-NEXT:    s_endpgm
+;
+; GFX11-DENORM-CONTRACT-LABEL: fmul_fadd_f16:
+; GFX11-DENORM-CONTRACT:       ; %bb.0:
+; GFX11-DENORM-CONTRACT-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    s_clause 0x2
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[4:5]
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[6:7]
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v3, v1, v2
+; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v3, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
+; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
                          ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
   %r0 = load half, ptr addrspace(1) %in1
   %r1 = load half, ptr addrspace(1) %in2
@@ -57,16 +283,111 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1)
   ret void
 }
 
-; GCN-LABEL: {{^}}fmul_fadd_contract_f16:
-; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-
-; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-
-; GFX10-FLUSH:  v_mul_f16_e32
-; GFX10-FLUSH:  v_add_f16_e32
-; GFX10-DENORM: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-
 define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
+; VI-FLUSH-LABEL: fmul_fadd_contract_f16:
+; VI-FLUSH:       ; %bb.0:
+; VI-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s2
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v2, s4
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v3, s5
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v4, s6
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v5, s7
+; VI-FLUSH-NEXT:    flat_load_ushort v6, v[0:1]
+; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3]
+; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5]
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s0
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    v_mac_f16_e32 v3, v6, v2
+; VI-FLUSH-NEXT:    flat_store_short v[0:1], v3
+; VI-FLUSH-NEXT:    s_endpgm
+;
+; VI-DENORM-LABEL: fmul_fadd_contract_f16:
+; VI-DENORM:       ; %bb.0:
+; VI-DENORM-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s2
+; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s3
+; VI-DENORM-NEXT:    v_mov_b32_e32 v2, s4
+; VI-DENORM-NEXT:    v_mov_b32_e32 v3, s5
+; VI-DENORM-NEXT:    v_mov_b32_e32 v4, s6
+; VI-DENORM-NEXT:    v_mov_b32_e32 v5, s7
+; VI-DENORM-NEXT:    flat_load_ushort v6, v[0:1]
+; VI-DENORM-NEXT:    flat_load_ushort v2, v[2:3]
+; VI-DENORM-NEXT:    flat_load_ushort v3, v[4:5]
+; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s0
+; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s1
+; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-NEXT:    v_fma_f16 v2, v6, v2, v3
+; VI-DENORM-NEXT:    flat_store_short v[0:1], v2
+; VI-DENORM-NEXT:    s_endpgm
+;
+; GFX10-FLUSH-LABEL: fmul_fadd_contract_f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    s_clause 0x2
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[4:5]
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[6:7]
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v3
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    s_endpgm
+;
+; GFX10-DENORM-LABEL: fmul_fadd_contract_f16:
+; GFX10-DENORM:       ; %bb.0:
+; GFX10-DENORM-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DENORM-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-NEXT:    s_clause 0x2
+; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[4:5]
+; GFX10-DENORM-NEXT:    global_load_ushort v3, v0, s[6:7]
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v3, v1, v2
+; GFX10-DENORM-NEXT:    global_store_short v0, v3, s[0:1]
+; GFX10-DENORM-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-LABEL: fmul_fadd_contract_f16:
+; GFX11-FLUSH:       ; %bb.0:
+; GFX11-FLUSH-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-NEXT:    s_clause 0x2
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[4:5]
+; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[6:7]
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v3
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT:    s_nop 0
+; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FLUSH-NEXT:    s_endpgm
+;
+; GFX11-DENORM-LABEL: fmul_fadd_contract_f16:
+; GFX11-DENORM:       ; %bb.0:
+; GFX11-DENORM-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DENORM-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-NEXT:    s_clause 0x2
+; GFX11-DENORM-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX11-DENORM-NEXT:    global_load_u16 v2, v0, s[4:5]
+; GFX11-DENORM-NEXT:    global_load_u16 v3, v0, s[6:7]
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v3, v1, v2
+; GFX11-DENORM-NEXT:    global_store_b16 v0, v3, s[0:1]
+; GFX11-DENORM-NEXT:    s_nop 0
+; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-NEXT:    s_endpgm
                          ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
   %r0 = load half, ptr addrspace(1) %in1
   %r1 = load half, ptr addrspace(1) %in2
@@ -77,23 +398,101 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add
   ret void
 }
 
-; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f16
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]],
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]],
-; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
-; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-
-; VI-DENORM:    v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
-; GFX10-DENORM: v_fmac_f16_e32 [[R2:v[0-9]+]], 2.0, [[R1]]
-
-; GFX10-FLUSH:  v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
-; GFX10-FLUSH:  v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
-
-; VI-DENORM:    flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-; GFX10-DENORM: global_store_{{short|b16}} v{{[0-9]+}}, [[R2]]
-; GFX10-FLUSH:  global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}}
-
 define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; VI-FLUSH-LABEL: fmuladd_2.0_a_b_f16:
+; VI-FLUSH:       ; %bb.0:
+; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    v_mac_f16_e32 v2, 2.0, v4
+; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
+; VI-FLUSH-NEXT:    s_endpgm
+;
+; VI-DENORM-LABEL: fmuladd_2.0_a_b_f16:
+; VI-DENORM:       ; %bb.0:
+; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s1
+; VI-DENORM-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-DENORM-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-DENORM-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-DENORM-NEXT:    flat_load_ushort v4, v[0:1] glc
+; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-NEXT:    v_fma_f16 v2, v4, 2.0, v2
+; VI-DENORM-NEXT:    flat_store_short v[0:1], v2
+; VI-DENORM-NEXT:    s_endpgm
+;
+; GFX10-FLUSH-LABEL: fmuladd_2.0_a_b_f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
+; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v2
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    s_endpgm
+;
+; GFX10-DENORM-LABEL: fmuladd_2.0_a_b_f16:
+; GFX10-DENORM:       ; %bb.0:
+; GFX10-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
+; GFX10-DENORM-NEXT:    global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-LABEL: fmuladd_2.0_a_b_f16:
+; GFX11-FLUSH:       ; %bb.0:
+; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
+; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v2
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT:    s_nop 0
+; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FLUSH-NEXT:    s_endpgm
+;
+; GFX11-DENORM-LABEL: fmuladd_2.0_a_b_f16:
+; GFX11-DENORM:       ; %bb.0:
+; GFX11-DENORM-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
+; GFX11-DENORM-NEXT:    global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-NEXT:    s_nop 0
+; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
   %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
@@ -107,23 +506,101 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp
   ret void
 }
 
-; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f16
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]],
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]],
-; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
-; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-
-; VI-DENORM:    v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
-; GFX10-DENORM: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
-
-; GFX10-FLUSH:  v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
-; GFX10-FLUSH:  v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
-
-; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-; GFX10-DENORM: global_store_{{short|b16}} v{{[0-9]+}}, [[R2]]
-; GFX10-FLUSH:  global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]]
-
 define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; VI-FLUSH-LABEL: fmuladd_a_2.0_b_f16:
+; VI-FLUSH:       ; %bb.0:
+; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    v_mac_f16_e32 v2, 2.0, v4
+; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
+; VI-FLUSH-NEXT:    s_endpgm
+;
+; VI-DENORM-LABEL: fmuladd_a_2.0_b_f16:
+; VI-DENORM:       ; %bb.0:
+; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s1
+; VI-DENORM-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-DENORM-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-DENORM-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-DENORM-NEXT:    flat_load_ushort v4, v[0:1] glc
+; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-NEXT:    v_fma_f16 v2, v4, 2.0, v2
+; VI-DENORM-NEXT:    flat_store_short v[0:1], v2
+; VI-DENORM-NEXT:    s_endpgm
+;
+; GFX10-FLUSH-LABEL: fmuladd_a_2.0_b_f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
+; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v2
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    s_endpgm
+;
+; GFX10-DENORM-LABEL: fmuladd_a_2.0_b_f16:
+; GFX10-DENORM:       ; %bb.0:
+; GFX10-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
+; GFX10-DENORM-NEXT:    global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-LABEL: fmuladd_a_2.0_b_f16:
+; GFX11-FLUSH:       ; %bb.0:
+; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
+; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v2
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT:    s_nop 0
+; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FLUSH-NEXT:    s_endpgm
+;
+; GFX11-DENORM-LABEL: fmuladd_a_2.0_b_f16:
+; GFX11-DENORM:       ; %bb.0:
+; GFX11-DENORM-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
+; GFX11-DENORM-NEXT:    global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-NEXT:    s_nop 0
+; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
   %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
@@ -137,27 +614,132 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp
   ret void
 }
 
-; GCN-LABEL: {{^}}fadd_a_a_b_f16:
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]],
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]],
-; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
-; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-
-; VI-DENORM-CONTRACT:    v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
-; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
-
-; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
-
-; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-
-; GFX10-FLUSH:           v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
-; GFX10-FLUSH:           v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
-; GFX10-FLUSH:           global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]]
-; GFX10-DENORM-STRICT:   global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]]
-; GFX10-DENORM-CONTRACT: global_store_{{short|b16}} v{{[0-9]+}}, [[R2]]
-
 define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out,
+; VI-FLUSH-LABEL: fadd_a_a_b_f16:
+; VI-FLUSH:       ; %bb.0:
+; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    v_mac_f16_e32 v2, 2.0, v4
+; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
+; VI-FLUSH-NEXT:    s_endpgm
+;
+; VI-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16:
+; VI-DENORM-CONTRACT:       ; %bb.0:
+; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v4, v[0:1] glc
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v4, 2.0, v2
+; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
+; VI-DENORM-CONTRACT-NEXT:    s_endpgm
+;
+; GFX10-FLUSH-LABEL: fadd_a_a_b_f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
+; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v2
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    s_endpgm
+;
+; GFX10-DENORM-STRICT-LABEL: fadd_a_a_b_f16:
+; GFX10-DENORM-STRICT:       ; %bb.0:
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v1
+; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v2
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    s_endpgm
+;
+; GFX10-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16:
+; GFX10-DENORM-CONTRACT:       ; %bb.0:
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-LABEL: fadd_a_a_b_f16:
+; GFX11-FLUSH:       ; %bb.0:
+; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
+; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v2
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT:    s_nop 0
+; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FLUSH-NEXT:    s_endpgm
+;
+; GFX11-DENORM-STRICT-LABEL: fadd_a_a_b_f16:
+; GFX11-DENORM-STRICT:       ; %bb.0:
+; GFX11-DENORM-STRICT-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v1
+; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v2
+; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT:    s_nop 0
+; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-STRICT-NEXT:    s_endpgm
+;
+; GFX11-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16:
+; GFX11-DENORM-CONTRACT:       ; %bb.0:
+; GFX11-DENORM-CONTRACT-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
+; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
+; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
                             ptr addrspace(1) %in1,
                             ptr addrspace(1) %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -174,27 +756,132 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out,
   ret void
 }
 
-; GCN-LABEL: {{^}}fadd_b_a_a_f16:
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]],
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]],
-; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
-; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-
-; VI-DENORM-CONTRACT:    v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
-; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
-
-; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]],  [[R2]], [[TMP]]
-
-; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-
-; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
-; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
-; GFX10-FLUSH: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]]
-; GFX10-DENORM-STRICT:   global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]]
-; GFX10-DENORM-CONTRACT: global_store_{{short|b16}} v{{[0-9]+}}, [[R2]]
-
 define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out,
+; VI-FLUSH-LABEL: fadd_b_a_a_f16:
+; VI-FLUSH:       ; %bb.0:
+; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    v_mac_f16_e32 v2, 2.0, v4
+; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
+; VI-FLUSH-NEXT:    s_endpgm
+;
+; VI-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16:
+; VI-DENORM-CONTRACT:       ; %bb.0:
+; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v4, v[0:1] glc
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v4, 2.0, v2
+; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
+; VI-DENORM-CONTRACT-NEXT:    s_endpgm
+;
+; GFX10-FLUSH-LABEL: fadd_b_a_a_f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
+; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v2, v1
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    s_endpgm
+;
+; GFX10-DENORM-STRICT-LABEL: fadd_b_a_a_f16:
+; GFX10-DENORM-STRICT:       ; %bb.0:
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v1
+; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v2, v1
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    s_endpgm
+;
+; GFX10-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16:
+; GFX10-DENORM-CONTRACT:       ; %bb.0:
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-LABEL: fadd_b_a_a_f16:
+; GFX11-FLUSH:       ; %bb.0:
+; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
+; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v2, v1
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT:    s_nop 0
+; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FLUSH-NEXT:    s_endpgm
+;
+; GFX11-DENORM-STRICT-LABEL: fadd_b_a_a_f16:
+; GFX11-DENORM-STRICT:       ; %bb.0:
+; GFX11-DENORM-STRICT-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v1
+; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v2, v1
+; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT:    s_nop 0
+; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-STRICT-NEXT:    s_endpgm
+;
+; GFX11-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16:
+; GFX11-DENORM-CONTRACT:       ; %bb.0:
+; GFX11-DENORM-CONTRACT-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
+; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
+; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
                             ptr addrspace(1) %in1,
                             ptr addrspace(1) %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -211,19 +898,101 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out,
   ret void
 }
 
-; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f16
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]],
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]],
-; VI-FLUSH:     v_mac_f16_e32 [[R2]], -2.0, [[R1]]
-; VI-DENORM:    v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
-; GFX10-DENORM: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
-; VI-FLUSH:  flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
-; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
-; GFX10-FLUSH:  global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]]
-; GFX10-DENORM: global_store_{{short|b16}} v{{[0-9]+}}, [[R2]]
 define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; VI-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16:
+; VI-FLUSH:       ; %bb.0:
+; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    v_mac_f16_e32 v2, -2.0, v4
+; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
+; VI-FLUSH-NEXT:    s_endpgm
+;
+; VI-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16:
+; VI-DENORM:       ; %bb.0:
+; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s1
+; VI-DENORM-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-DENORM-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-DENORM-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-DENORM-NEXT:    flat_load_ushort v4, v[0:1] glc
+; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-NEXT:    v_fma_f16 v2, v4, -2.0, v2
+; VI-DENORM-NEXT:    flat_store_short v[0:1], v2
+; VI-DENORM-NEXT:    s_endpgm
+;
+; GFX10-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
+; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v2, v1
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    s_endpgm
+;
+; GFX10-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16:
+; GFX10-DENORM:       ; %bb.0:
+; GFX10-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v2, -2.0, v1
+; GFX10-DENORM-NEXT:    global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16:
+; GFX11-FLUSH:       ; %bb.0:
+; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
+; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v2, v1
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT:    s_nop 0
+; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FLUSH-NEXT:    s_endpgm
+;
+; GFX11-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16:
+; GFX11-DENORM:       ; %bb.0:
+; GFX11-DENORM-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v2, -2.0, v1
+; GFX11-DENORM-NEXT:    global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-NEXT:    s_nop 0
+; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
   %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
@@ -237,22 +1006,101 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad
   ret void
 }
 
-; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f16
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]],
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]],
-; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
-; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-
-; VI-DENORM:    v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
-; VI-DENORM:  flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-
-; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
-; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
-; GFX10-FLUSH:  global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]]
-
-; GFX10-DENORM: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
-; GFX10-DENORM: global_store_{{short|b16}} v{{[0-9]+}}, [[R2]]
 define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; VI-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
+; VI-FLUSH:       ; %bb.0:
+; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    v_mac_f16_e32 v2, 2.0, v4
+; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
+; VI-FLUSH-NEXT:    s_endpgm
+;
+; VI-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
+; VI-DENORM:       ; %bb.0:
+; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s1
+; VI-DENORM-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-DENORM-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-DENORM-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-DENORM-NEXT:    flat_load_ushort v4, v[0:1] glc
+; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-NEXT:    v_fma_f16 v2, v4, 2.0, v2
+; VI-DENORM-NEXT:    flat_store_short v[0:1], v2
+; VI-DENORM-NEXT:    s_endpgm
+;
+; GFX10-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
+; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v2, v1
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    s_endpgm
+;
+; GFX10-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
+; GFX10-DENORM:       ; %bb.0:
+; GFX10-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
+; GFX10-DENORM-NEXT:    global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
+; GFX11-FLUSH:       ; %bb.0:
+; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
+; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v2, v1
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT:    s_nop 0
+; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FLUSH-NEXT:    s_endpgm
+;
+; GFX11-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
+; GFX11-DENORM:       ; %bb.0:
+; GFX11-DENORM-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v2, 2.0, v1
+; GFX11-DENORM-NEXT:    global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-NEXT:    s_nop 0
+; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
   %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
@@ -268,22 +1116,101 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt
   ret void
 }
 
-; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f16
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]],
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]],
-; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
-; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-
-; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
-; VI-DENORM:  flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-
-; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
-; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
-; GFX10-FLUSH: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]]
-
-; GFX10-DENORM: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
-; GFX10-DENORM: global_store_{{short|b16}} v{{[0-9]+}}, [[R2]]
 define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; VI-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16:
+; VI-FLUSH:       ; %bb.0:
+; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    v_mac_f16_e32 v2, -2.0, v4
+; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
+; VI-FLUSH-NEXT:    s_endpgm
+;
+; VI-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16:
+; VI-DENORM:       ; %bb.0:
+; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s1
+; VI-DENORM-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-DENORM-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-DENORM-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-DENORM-NEXT:    flat_load_ushort v4, v[0:1] glc
+; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-NEXT:    v_fma_f16 v2, v4, -2.0, v2
+; VI-DENORM-NEXT:    flat_store_short v[0:1], v2
+; VI-DENORM-NEXT:    s_endpgm
+;
+; GFX10-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
+; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v2, v1
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    s_endpgm
+;
+; GFX10-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16:
+; GFX10-DENORM:       ; %bb.0:
+; GFX10-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v2, -2.0, v1
+; GFX10-DENORM-NEXT:    global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16:
+; GFX11-FLUSH:       ; %bb.0:
+; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
+; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v2, v1
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT:    s_nop 0
+; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FLUSH-NEXT:    s_endpgm
+;
+; GFX11-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16:
+; GFX11-DENORM:       ; %bb.0:
+; GFX11-DENORM-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v2, -2.0, v1
+; GFX11-DENORM-NEXT:    global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-NEXT:    s_nop 0
+; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
   %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
@@ -299,16 +1226,101 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad
   ret void
 }
 
-; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f16
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]],
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]],
-; VI-FLUSH:   v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
-; GCN-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
-; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
-; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
-; GFX10:       global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]]
 define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; VI-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16:
+; VI-FLUSH:       ; %bb.0:
+; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    v_mad_f16 v2, v4, 2.0, -v2
+; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
+; VI-FLUSH-NEXT:    s_endpgm
+;
+; VI-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16:
+; VI-DENORM:       ; %bb.0:
+; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s1
+; VI-DENORM-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-DENORM-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-DENORM-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-DENORM-NEXT:    flat_load_ushort v4, v[0:1] glc
+; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-NEXT:    v_fma_f16 v2, v4, 2.0, -v2
+; VI-DENORM-NEXT:    flat_store_short v[0:1], v2
+; VI-DENORM-NEXT:    s_endpgm
+;
+; GFX10-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
+; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v2
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    s_endpgm
+;
+; GFX10-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16:
+; GFX10-DENORM:       ; %bb.0:
+; GFX10-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT:    v_fma_f16 v1, v1, 2.0, -v2
+; GFX10-DENORM-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16:
+; GFX11-FLUSH:       ; %bb.0:
+; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
+; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v2
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT:    s_nop 0
+; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FLUSH-NEXT:    s_endpgm
+;
+; GFX11-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16:
+; GFX11-DENORM:       ; %bb.0:
+; GFX11-DENORM-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-NEXT:    v_fma_f16 v1, v1, 2.0, -v2
+; GFX11-DENORM-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-NEXT:    s_nop 0
+; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
   %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
@@ -324,24 +1336,158 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad
   ret void
 }
 
-; GCN-LABEL: {{^}}mad_sub_f16:
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGA:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGB:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGC:v[0-9]+]]
-
-; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
-
-; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
-
-; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
-; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
-
-; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-
-; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
-; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
-; GFX10:       global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]]
 define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
+; VI-FLUSH-LABEL: mad_sub_f16:
+; VI-FLUSH:       ; %bb.0:
+; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
+; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    flat_load_ushort v7, v[0:1] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    v_mad_f16 v2, v7, v2, -v3
+; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
+; VI-FLUSH-NEXT:    s_endpgm
+;
+; VI-DENORM-CONTRACT-LABEL: mad_sub_f16:
+; VI-DENORM-CONTRACT:       ; %bb.0:
+; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v7, v[0:1] glc
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v7, v2, -v3
+; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
+; VI-DENORM-CONTRACT-NEXT:    s_endpgm
+;
+; GFX10-FLUSH-LABEL: mad_sub_f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v3
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    s_endpgm
+;
+; GFX10-DENORM-STRICT-LABEL: mad_sub_f16:
+; GFX10-DENORM-STRICT:       ; %bb.0:
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v3
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    s_endpgm
+;
+; GFX10-DENORM-CONTRACT-LABEL: mad_sub_f16:
+; GFX10-DENORM-CONTRACT:       ; %bb.0:
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, v2, -v3
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-LABEL: mad_sub_f16:
+; GFX11-FLUSH:       ; %bb.0:
+; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v3
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT:    s_nop 0
+; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FLUSH-NEXT:    s_endpgm
+;
+; GFX11-DENORM-STRICT-LABEL: mad_sub_f16:
+; GFX11-DENORM-STRICT:       ; %bb.0:
+; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v3
+; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT:    s_nop 0
+; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-STRICT-NEXT:    s_endpgm
+;
+; GFX11-DENORM-CONTRACT-LABEL: mad_sub_f16:
+; GFX11-DENORM-CONTRACT:       ; %bb.0:
+; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, v2, -v3
+; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
+; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
@@ -359,23 +1505,158 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out,
   ret void
 }
 
-; GCN-LABEL: {{^}}mad_sub_inv_f16:
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGA:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGB:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGC:v[0-9]+]]
-; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
-
-; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
-
-; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
-; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
-
-; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-
-; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
-; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
-; GFX10: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]]
 define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
+; VI-FLUSH-LABEL: mad_sub_inv_f16:
+; VI-FLUSH:       ; %bb.0:
+; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
+; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    flat_load_ushort v7, v[0:1] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    v_mad_f16 v2, -v7, v2, v3
+; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
+; VI-FLUSH-NEXT:    s_endpgm
+;
+; VI-DENORM-CONTRACT-LABEL: mad_sub_inv_f16:
+; VI-DENORM-CONTRACT:       ; %bb.0:
+; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v7, v[0:1] glc
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, -v7, v2, v3
+; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
+; VI-DENORM-CONTRACT-NEXT:    s_endpgm
+;
+; GFX10-FLUSH-LABEL: mad_sub_inv_f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v3, v1
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    s_endpgm
+;
+; GFX10-DENORM-STRICT-LABEL: mad_sub_inv_f16:
+; GFX10-DENORM-STRICT:       ; %bb.0:
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v3, v1
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    s_endpgm
+;
+; GFX10-DENORM-CONTRACT-LABEL: mad_sub_inv_f16:
+; GFX10-DENORM-CONTRACT:       ; %bb.0:
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, -v1, v2, v3
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-LABEL: mad_sub_inv_f16:
+; GFX11-FLUSH:       ; %bb.0:
+; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v3, v1
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT:    s_nop 0
+; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FLUSH-NEXT:    s_endpgm
+;
+; GFX11-DENORM-STRICT-LABEL: mad_sub_inv_f16:
+; GFX11-DENORM-STRICT:       ; %bb.0:
+; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v3, v1
+; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT:    s_nop 0
+; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-STRICT-NEXT:    s_endpgm
+;
+; GFX11-DENORM-CONTRACT-LABEL: mad_sub_inv_f16:
+; GFX11-DENORM-CONTRACT:       ; %bb.0:
+; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, -v1, v2, v3
+; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
+; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
@@ -393,23 +1674,158 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o
   ret void
 }
 
-; GCN-LABEL: {{^}}mad_sub_fabs_f16:
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGA:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGB:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGC:v[0-9]+]]
-; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
-
-; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
-
-; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
-; GCN-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
-
-; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-
-; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
-; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
-; GFX10:       global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]]
 define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
+; VI-FLUSH-LABEL: mad_sub_fabs_f16:
+; VI-FLUSH:       ; %bb.0:
+; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
+; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    flat_load_ushort v7, v[0:1] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    v_mad_f16 v2, v7, v2, -|v3|
+; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
+; VI-FLUSH-NEXT:    s_endpgm
+;
+; VI-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16:
+; VI-DENORM-CONTRACT:       ; %bb.0:
+; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v7, v[0:1] glc
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v7, v2, -|v3|
+; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
+; VI-DENORM-CONTRACT-NEXT:    s_endpgm
+;
+; GFX10-FLUSH-LABEL: mad_sub_fabs_f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX10-FLUSH-NEXT:    v_sub_f16_e64 v1, v1, |v3|
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    s_endpgm
+;
+; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_f16:
+; GFX10-DENORM-STRICT:       ; %bb.0:
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e64 v1, v1, |v3|
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    s_endpgm
+;
+; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16:
+; GFX10-DENORM-CONTRACT:       ; %bb.0:
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, v2, -|v3|
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-LABEL: mad_sub_fabs_f16:
+; GFX11-FLUSH:       ; %bb.0:
+; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-NEXT:    v_sub_f16_e64 v1, v1, |v3|
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT:    s_nop 0
+; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FLUSH-NEXT:    s_endpgm
+;
+; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_f16:
+; GFX11-DENORM-STRICT:       ; %bb.0:
+; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e64 v1, v1, |v3|
+; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT:    s_nop 0
+; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-STRICT-NEXT:    s_endpgm
+;
+; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16:
+; GFX11-DENORM-CONTRACT:       ; %bb.0:
+; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, v2, -|v3|
+; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
+; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
@@ -428,24 +1844,158 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %
   ret void
 }
 
-; GCN-LABEL: {{^}}mad_sub_fabs_inv_f16:
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGA:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGB:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGC:v[0-9]+]]
-
-; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
-
-; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
-
-; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
-; GCN-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
-
-; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-
-; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
-; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
-; GFX10:       global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]]
 define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
+; VI-FLUSH-LABEL: mad_sub_fabs_inv_f16:
+; VI-FLUSH:       ; %bb.0:
+; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
+; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    flat_load_ushort v7, v[0:1] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    v_mad_f16 v2, -v7, v2, |v3|
+; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
+; VI-FLUSH-NEXT:    s_endpgm
+;
+; VI-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16:
+; VI-DENORM-CONTRACT:       ; %bb.0:
+; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v7, v[0:1] glc
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, -v7, v2, |v3|
+; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
+; VI-DENORM-CONTRACT-NEXT:    s_endpgm
+;
+; GFX10-FLUSH-LABEL: mad_sub_fabs_inv_f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX10-FLUSH-NEXT:    v_sub_f16_e64 v1, |v3|, v1
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    s_endpgm
+;
+; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16:
+; GFX10-DENORM-STRICT:       ; %bb.0:
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e64 v1, |v3|, v1
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    s_endpgm
+;
+; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16:
+; GFX10-DENORM-CONTRACT:       ; %bb.0:
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, -v1, v2, |v3|
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-LABEL: mad_sub_fabs_inv_f16:
+; GFX11-FLUSH:       ; %bb.0:
+; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-NEXT:    v_sub_f16_e64 v1, |v3|, v1
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT:    s_nop 0
+; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FLUSH-NEXT:    s_endpgm
+;
+; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16:
+; GFX11-DENORM-STRICT:       ; %bb.0:
+; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e64 v1, |v3|, v1
+; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT:    s_nop 0
+; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-STRICT-NEXT:    s_endpgm
+;
+; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16:
+; GFX11-DENORM-CONTRACT:       ; %bb.0:
+; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, -v1, v2, |v3|
+; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
+; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
@@ -464,27 +2014,158 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu
   ret void
 }
 
-; GCN-LABEL: {{^}}neg_neg_mad_f16:
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGA:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGB:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGC:v[0-9]+]]
-
-; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGA]], [[REGB]]
-; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
-
-; VI-DENORM-CONTRACT:    v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
-; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[REGC]], [[REGA]], [[REGB]]
-
-; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
-; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
-; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-
-; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
-; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
-; GFX10-FLUSH:  global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]]
-; GFX10-DENORM-STRICT: global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]]
-; GFX10-DENORM-CONTRACT: global_store_{{short|b16}} v{{[0-9]+}}, [[REGC]]
 define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
+; VI-FLUSH-LABEL: neg_neg_mad_f16:
+; VI-FLUSH:       ; %bb.0:
+; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
+; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    flat_load_ushort v7, v[0:1] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    v_mac_f16_e32 v3, v7, v2
+; VI-FLUSH-NEXT:    flat_store_short v[0:1], v3
+; VI-FLUSH-NEXT:    s_endpgm
+;
+; VI-DENORM-CONTRACT-LABEL: neg_neg_mad_f16:
+; VI-DENORM-CONTRACT:       ; %bb.0:
+; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v7, v[0:1] glc
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v7, v2, v3
+; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
+; VI-DENORM-CONTRACT-NEXT:    s_endpgm
+;
+; GFX10-FLUSH-LABEL: neg_neg_mad_f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v3, v1
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    s_endpgm
+;
+; GFX10-DENORM-STRICT-LABEL: neg_neg_mad_f16:
+; GFX10-DENORM-STRICT:       ; %bb.0:
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v3, v1
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    s_endpgm
+;
+; GFX10-DENORM-CONTRACT-LABEL: neg_neg_mad_f16:
+; GFX10-DENORM-CONTRACT:       ; %bb.0:
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v3, v1, v2
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v3, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-LABEL: neg_neg_mad_f16:
+; GFX11-FLUSH:       ; %bb.0:
+; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v3, v1
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT:    s_nop 0
+; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FLUSH-NEXT:    s_endpgm
+;
+; GFX11-DENORM-STRICT-LABEL: neg_neg_mad_f16:
+; GFX11-DENORM-STRICT:       ; %bb.0:
+; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e32 v1, v1, v2
+; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v3, v1
+; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT:    s_nop 0
+; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-STRICT-NEXT:    s_endpgm
+;
+; GFX11-DENORM-CONTRACT-LABEL: neg_neg_mad_f16:
+; GFX11-DENORM-CONTRACT:       ; %bb.0:
+; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v3, v1, v2
+; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v3, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
+; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
@@ -504,24 +2185,158 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o
   ret void
 }
 
-; GCN-LABEL: {{^}}mad_fabs_sub_f16:
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGA:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGB:v[0-9]+]]
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[REGC:v[0-9]+]]
-
-; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
-
-; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
-
-; GCN-DENORM-STRICT: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
-; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
-
-; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-
-; GFX10-FLUSH: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
-; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
-; GFX10:       global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]]
 define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 {
+; VI-FLUSH-LABEL: mad_fabs_sub_f16:
+; VI-FLUSH:       ; %bb.0:
+; VI-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
+; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    flat_load_ushort v7, v[0:1] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    flat_load_ushort v3, v[4:5] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    v_mad_f16 v2, v7, |v2|, -v3
+; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
+; VI-FLUSH-NEXT:    s_endpgm
+;
+; VI-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16:
+; VI-DENORM-CONTRACT:       ; %bb.0:
+; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s3
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v7, v[0:1] glc
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v3, v[4:5] glc
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v7, |v2|, -v3
+; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
+; VI-DENORM-CONTRACT-NEXT:    s_endpgm
+;
+; GFX10-FLUSH-LABEL: mad_fabs_sub_f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    v_mul_f16_e64 v1, v1, |v2|
+; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v3
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    s_endpgm
+;
+; GFX10-DENORM-STRICT-LABEL: mad_fabs_sub_f16:
+; GFX10-DENORM-STRICT:       ; %bb.0:
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    v_mul_f16_e64 v1, v1, |v2|
+; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v3
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    s_endpgm
+;
+; GFX10-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16:
+; GFX10-DENORM-CONTRACT:       ; %bb.0:
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v3, v0, s[2:3] offset:4 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, |v2|, -v3
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-LABEL: mad_fabs_sub_f16:
+; GFX11-FLUSH:       ; %bb.0:
+; GFX11-FLUSH-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    v_mul_f16_e64 v1, v1, |v2|
+; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v3
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT:    s_nop 0
+; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FLUSH-NEXT:    s_endpgm
+;
+; GFX11-DENORM-STRICT-LABEL: mad_fabs_sub_f16:
+; GFX11-DENORM-STRICT:       ; %bb.0:
+; GFX11-DENORM-STRICT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    v_mul_f16_e64 v1, v1, |v2|
+; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v3
+; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT:    s_nop 0
+; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-STRICT-NEXT:    s_endpgm
+;
+; GFX11-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16:
+; GFX11-DENORM-CONTRACT:       ; %bb.0:
+; GFX11-DENORM-CONTRACT-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, |v2|, -v3
+; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
+; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
@@ -540,26 +2355,132 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %
   ret void
 }
 
-; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f16:
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]],
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]],
-; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
-; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-
-; VI-DENORM-CONTRACT:    v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
-; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
-
-; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
-
-; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-
-; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
-; GFX10-FLUSH:  global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]]
-; GFX10-DENORM-STRICT:   global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]]
-; GFX10-DENORM-CONTRACT: global_store_{{short|b16}} v{{[0-9]+}}, [[R2]]
 define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; VI-FLUSH-LABEL: fsub_c_fadd_a_a_f16:
+; VI-FLUSH:       ; %bb.0:
+; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    v_mac_f16_e32 v2, -2.0, v4
+; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
+; VI-FLUSH-NEXT:    s_endpgm
+;
+; VI-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16:
+; VI-DENORM-CONTRACT:       ; %bb.0:
+; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v4, v[0:1] glc
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v4, -2.0, v2
+; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
+; VI-DENORM-CONTRACT-NEXT:    s_endpgm
+;
+; GFX10-FLUSH-LABEL: fsub_c_fadd_a_a_f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
+; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v2, v1
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    s_endpgm
+;
+; GFX10-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16:
+; GFX10-DENORM-STRICT:       ; %bb.0:
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v1
+; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v2, v1
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    s_endpgm
+;
+; GFX10-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16:
+; GFX10-DENORM-CONTRACT:       ; %bb.0:
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v2, -2.0, v1
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-LABEL: fsub_c_fadd_a_a_f16:
+; GFX11-FLUSH:       ; %bb.0:
+; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
+; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v2, v1
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT:    s_nop 0
+; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FLUSH-NEXT:    s_endpgm
+;
+; GFX11-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16:
+; GFX11-DENORM-STRICT:       ; %bb.0:
+; GFX11-DENORM-STRICT-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v1
+; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v2, v1
+; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT:    s_nop 0
+; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-STRICT-NEXT:    s_endpgm
+;
+; GFX11-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16:
+; GFX11-DENORM-CONTRACT:       ; %bb.0:
+; GFX11-DENORM-CONTRACT-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    v_fmac_f16_e32 v2, -2.0, v1
+; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
+; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
   %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
@@ -575,23 +2496,132 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp
   ret void
 }
 
-; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f16:
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R1:v[0-9]+]],
-; GCN: {{buffer|flat|global}}_load_{{ushort|u16}} [[R2:v[0-9]+]],
-
-; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
-
-; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
-
-; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
-
-; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-
-; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
-; GFX10:       global_store_{{short|b16}} v{{[0-9]+}}, [[RESULT]]
 define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; VI-FLUSH-LABEL: fsub_fadd_a_a_c_f16:
+; VI-FLUSH:       ; %bb.0:
+; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-FLUSH-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-FLUSH-NEXT:    flat_load_ushort v4, v[0:1] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    v_mad_f16 v2, v4, 2.0, -v2
+; VI-FLUSH-NEXT:    flat_store_short v[0:1], v2
+; VI-FLUSH-NEXT:    s_endpgm
+;
+; VI-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16:
+; VI-DENORM-CONTRACT:       ; %bb.0:
+; VI-DENORM-CONTRACT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    v_mov_b32_e32 v1, s1
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    v_add_u32_e32 v2, vcc, 2, v0
+; VI-DENORM-CONTRACT-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v4, v[0:1] glc
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    flat_load_ushort v2, v[2:3] glc
+; VI-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-CONTRACT-NEXT:    v_fma_f16 v2, v4, 2.0, -v2
+; VI-DENORM-CONTRACT-NEXT:    flat_store_short v[0:1], v2
+; VI-DENORM-CONTRACT-NEXT:    s_endpgm
+;
+; GFX10-FLUSH-LABEL: fsub_fadd_a_a_c_f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
+; GFX10-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v2
+; GFX10-FLUSH-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT:    s_endpgm
+;
+; GFX10-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16:
+; GFX10-DENORM-STRICT:       ; %bb.0:
+; GFX10-DENORM-STRICT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v1
+; GFX10-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v2
+; GFX10-DENORM-STRICT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT:    s_endpgm
+;
+; GFX10-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16:
+; GFX10-DENORM-CONTRACT:       ; %bb.0:
+; GFX10-DENORM-CONTRACT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, 2.0, -v2
+; GFX10-DENORM-CONTRACT-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-LABEL: fsub_fadd_a_a_c_f16:
+; GFX11-FLUSH:       ; %bb.0:
+; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    v_add_f16_e32 v1, v1, v1
+; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-NEXT:    v_sub_f16_e32 v1, v1, v2
+; GFX11-FLUSH-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT:    s_nop 0
+; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FLUSH-NEXT:    s_endpgm
+;
+; GFX11-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16:
+; GFX11-DENORM-STRICT:       ; %bb.0:
+; GFX11-DENORM-STRICT-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-NEXT:    v_add_f16_e32 v1, v1, v1
+; GFX11-DENORM-STRICT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-NEXT:    v_sub_f16_e32 v1, v1, v2
+; GFX11-DENORM-STRICT-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT:    s_nop 0
+; GFX11-DENORM-STRICT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-STRICT-NEXT:    s_endpgm
+;
+; GFX11-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16:
+; GFX11-DENORM-CONTRACT:       ; %bb.0:
+; GFX11-DENORM-CONTRACT-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-NEXT:    v_fma_f16 v1, v1, 2.0, -v2
+; GFX11-DENORM-CONTRACT-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT:    s_nop 0
+; GFX11-DENORM-CONTRACT-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-CONTRACT-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
   %gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
index 017bc0cb2b0a8a..f90d338ffc4873 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -1,41 +1,227 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI-FLUSH %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI-DENORM %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10PLUS,GFX10PLUS-FLUSH %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10PLUS,GFX10PLUS-DENORM %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10PLUS,GFX10PLUS-FLUSH %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10PLUS,GFX10PLUS-DENORM %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=VI-FLUSH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=VI-DENORM %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-FLUSH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-DENORM %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-FLUSH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-DENORM %s
 
 declare half @llvm.fmuladd.f16(half %a, half %b, half %c)
 declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
 
-; GCN-LABEL: {{^}}fmuladd_f16
-; GCN: buffer_load_{{ushort|u16}} v[[A_F16:[0-9]+]]
-; GCN: buffer_load_{{ushort|u16}} v[[B_F16:[0-9]+]]
-; GCN: buffer_load_{{ushort|u16}} v[[C_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
-; SI:  v_mac_f32_e32 v[[C_F32]], v[[A_F32]], v[[B_F32]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]]
-; SI:  buffer_store_short v[[R_F16]]
-
-; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]]
-; VI-FLUSH: buffer_store_short v[[C_F16]]
-
-; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
-; VI-DENORM: buffer_store_short [[RESULT]]
-
-; GFX10PLUS-FLUSH: v_mul_f16_e32 [[MUL:v[0-9]+]], v[[A_F16]], v[[B_F16]]
-; GFX10PLUS-FLUSH: v_add_f16_e32 [[ADD:v[0-9]+]], [[MUL]], v[[C_F16]]
-; GFX10PLUS-FLUSH: buffer_store_{{short|b16}} [[ADD]]
-
-; GFX10PLUS-DENORM: v_fmac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]]
-; GFX10PLUS-DENORM: buffer_store_{{short|b16}} v[[C_F16]],
-
-; GCN: s_endpgm
 define amdgpu_kernel void @fmuladd_f16(
+; SI-LABEL: fmuladd_f16:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    s_mov_b32 s14, s10
+; SI-NEXT:    s_mov_b32 s15, s11
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s12, s2
+; SI-NEXT:    s_mov_b32 s13, s3
+; SI-NEXT:    s_mov_b32 s16, s4
+; SI-NEXT:    s_mov_b32 s17, s5
+; SI-NEXT:    s_mov_b32 s18, s10
+; SI-NEXT:    s_mov_b32 s19, s11
+; SI-NEXT:    s_mov_b32 s4, s6
+; SI-NEXT:    s_mov_b32 s5, s7
+; SI-NEXT:    s_mov_b32 s6, s10
+; SI-NEXT:    s_mov_b32 s7, s11
+; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
+; SI-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
+; SI-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
+; SI-NEXT:    s_mov_b32 s8, s0
+; SI-NEXT:    s_mov_b32 s9, s1
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    v_mac_f32_e32 v2, v0, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v2
+; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-FLUSH-LABEL: fmuladd_f16:
+; VI-FLUSH:       ; %bb.0:
+; VI-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-FLUSH-NEXT:    s_mov_b32 s11, 0xf000
+; VI-FLUSH-NEXT:    s_mov_b32 s10, -1
+; VI-FLUSH-NEXT:    s_mov_b32 s14, s10
+; VI-FLUSH-NEXT:    s_mov_b32 s15, s11
+; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-FLUSH-NEXT:    s_mov_b32 s12, s2
+; VI-FLUSH-NEXT:    s_mov_b32 s13, s3
+; VI-FLUSH-NEXT:    s_mov_b32 s16, s4
+; VI-FLUSH-NEXT:    s_mov_b32 s17, s5
+; VI-FLUSH-NEXT:    s_mov_b32 s18, s10
+; VI-FLUSH-NEXT:    s_mov_b32 s19, s11
+; VI-FLUSH-NEXT:    s_mov_b32 s4, s6
+; VI-FLUSH-NEXT:    s_mov_b32 s5, s7
+; VI-FLUSH-NEXT:    s_mov_b32 s6, s10
+; VI-FLUSH-NEXT:    s_mov_b32 s7, s11
+; VI-FLUSH-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
+; VI-FLUSH-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
+; VI-FLUSH-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
+; VI-FLUSH-NEXT:    s_mov_b32 s8, s0
+; VI-FLUSH-NEXT:    s_mov_b32 s9, s1
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    v_mac_f16_e32 v2, v0, v1
+; VI-FLUSH-NEXT:    buffer_store_short v2, off, s[8:11], 0
+; VI-FLUSH-NEXT:    s_endpgm
+;
+; VI-DENORM-LABEL: fmuladd_f16:
+; VI-DENORM:       ; %bb.0:
+; VI-DENORM-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-DENORM-NEXT:    s_mov_b32 s11, 0xf000
+; VI-DENORM-NEXT:    s_mov_b32 s10, -1
+; VI-DENORM-NEXT:    s_mov_b32 s14, s10
+; VI-DENORM-NEXT:    s_mov_b32 s15, s11
+; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DENORM-NEXT:    s_mov_b32 s12, s2
+; VI-DENORM-NEXT:    s_mov_b32 s13, s3
+; VI-DENORM-NEXT:    s_mov_b32 s16, s4
+; VI-DENORM-NEXT:    s_mov_b32 s17, s5
+; VI-DENORM-NEXT:    s_mov_b32 s18, s10
+; VI-DENORM-NEXT:    s_mov_b32 s19, s11
+; VI-DENORM-NEXT:    s_mov_b32 s4, s6
+; VI-DENORM-NEXT:    s_mov_b32 s5, s7
+; VI-DENORM-NEXT:    s_mov_b32 s6, s10
+; VI-DENORM-NEXT:    s_mov_b32 s7, s11
+; VI-DENORM-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
+; VI-DENORM-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
+; VI-DENORM-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
+; VI-DENORM-NEXT:    s_mov_b32 s8, s0
+; VI-DENORM-NEXT:    s_mov_b32 s9, s1
+; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-NEXT:    v_fma_f16 v0, v0, v1, v2
+; VI-DENORM-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; VI-DENORM-NEXT:    s_endpgm
+;
+; GFX10-FLUSH-LABEL: fmuladd_f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    s_mov_b32 s10, -1
+; GFX10-FLUSH-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX10-FLUSH-NEXT:    s_mov_b32 s14, s10
+; GFX10-FLUSH-NEXT:    s_mov_b32 s15, s11
+; GFX10-FLUSH-NEXT:    s_mov_b32 s18, s10
+; GFX10-FLUSH-NEXT:    s_mov_b32 s19, s11
+; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    s_mov_b32 s12, s2
+; GFX10-FLUSH-NEXT:    s_mov_b32 s13, s3
+; GFX10-FLUSH-NEXT:    s_mov_b32 s16, s4
+; GFX10-FLUSH-NEXT:    s_mov_b32 s17, s5
+; GFX10-FLUSH-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
+; GFX10-FLUSH-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
+; GFX10-FLUSH-NEXT:    s_mov_b32 s4, s6
+; GFX10-FLUSH-NEXT:    s_mov_b32 s5, s7
+; GFX10-FLUSH-NEXT:    s_mov_b32 s6, s10
+; GFX10-FLUSH-NEXT:    s_mov_b32 s7, s11
+; GFX10-FLUSH-NEXT:    s_mov_b32 s8, s0
+; GFX10-FLUSH-NEXT:    buffer_load_ushort v2, off, s[4:7], 0
+; GFX10-FLUSH-NEXT:    s_mov_b32 s9, s1
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX10-FLUSH-NEXT:    buffer_store_short v0, off, s[8:11], 0
+; GFX10-FLUSH-NEXT:    s_endpgm
+;
+; GFX10-DENORM-LABEL: fmuladd_f16:
+; GFX10-DENORM:       ; %bb.0:
+; GFX10-DENORM-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DENORM-NEXT:    s_mov_b32 s10, -1
+; GFX10-DENORM-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX10-DENORM-NEXT:    s_mov_b32 s14, s10
+; GFX10-DENORM-NEXT:    s_mov_b32 s15, s11
+; GFX10-DENORM-NEXT:    s_mov_b32 s18, s10
+; GFX10-DENORM-NEXT:    s_mov_b32 s19, s11
+; GFX10-DENORM-NEXT:    s_mov_b32 s22, s10
+; GFX10-DENORM-NEXT:    s_mov_b32 s23, s11
+; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-NEXT:    s_mov_b32 s12, s2
+; GFX10-DENORM-NEXT:    s_mov_b32 s13, s3
+; GFX10-DENORM-NEXT:    s_mov_b32 s16, s4
+; GFX10-DENORM-NEXT:    s_mov_b32 s17, s5
+; GFX10-DENORM-NEXT:    s_mov_b32 s20, s6
+; GFX10-DENORM-NEXT:    s_mov_b32 s21, s7
+; GFX10-DENORM-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
+; GFX10-DENORM-NEXT:    buffer_load_ushort v1, off, s[16:19], 0
+; GFX10-DENORM-NEXT:    buffer_load_ushort v2, off, s[20:23], 0
+; GFX10-DENORM-NEXT:    s_mov_b32 s8, s0
+; GFX10-DENORM-NEXT:    s_mov_b32 s9, s1
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v2, v0, v1
+; GFX10-DENORM-NEXT:    buffer_store_short v2, off, s[8:11], 0
+; GFX10-DENORM-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-LABEL: fmuladd_f16:
+; GFX11-FLUSH:       ; %bb.0:
+; GFX11-FLUSH-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    s_mov_b32 s10, -1
+; GFX11-FLUSH-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX11-FLUSH-NEXT:    s_mov_b32 s14, s10
+; GFX11-FLUSH-NEXT:    s_mov_b32 s15, s11
+; GFX11-FLUSH-NEXT:    s_mov_b32 s18, s10
+; GFX11-FLUSH-NEXT:    s_mov_b32 s19, s11
+; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-NEXT:    s_mov_b32 s12, s2
+; GFX11-FLUSH-NEXT:    s_mov_b32 s13, s3
+; GFX11-FLUSH-NEXT:    s_mov_b32 s16, s4
+; GFX11-FLUSH-NEXT:    s_mov_b32 s17, s5
+; GFX11-FLUSH-NEXT:    buffer_load_u16 v0, off, s[12:15], 0
+; GFX11-FLUSH-NEXT:    buffer_load_u16 v1, off, s[16:19], 0
+; GFX11-FLUSH-NEXT:    s_mov_b32 s4, s6
+; GFX11-FLUSH-NEXT:    s_mov_b32 s5, s7
+; GFX11-FLUSH-NEXT:    s_mov_b32 s6, s10
+; GFX11-FLUSH-NEXT:    s_mov_b32 s7, s11
+; GFX11-FLUSH-NEXT:    s_mov_b32 s8, s0
+; GFX11-FLUSH-NEXT:    buffer_load_u16 v2, off, s[4:7], 0
+; GFX11-FLUSH-NEXT:    s_mov_b32 s9, s1
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-NEXT:    v_add_f16_e32 v0, v0, v2
+; GFX11-FLUSH-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-FLUSH-NEXT:    s_nop 0
+; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FLUSH-NEXT:    s_endpgm
+;
+; GFX11-DENORM-LABEL: fmuladd_f16:
+; GFX11-DENORM:       ; %bb.0:
+; GFX11-DENORM-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DENORM-NEXT:    s_mov_b32 s10, -1
+; GFX11-DENORM-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX11-DENORM-NEXT:    s_mov_b32 s14, s10
+; GFX11-DENORM-NEXT:    s_mov_b32 s15, s11
+; GFX11-DENORM-NEXT:    s_mov_b32 s18, s10
+; GFX11-DENORM-NEXT:    s_mov_b32 s19, s11
+; GFX11-DENORM-NEXT:    s_mov_b32 s22, s10
+; GFX11-DENORM-NEXT:    s_mov_b32 s23, s11
+; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-NEXT:    s_mov_b32 s12, s2
+; GFX11-DENORM-NEXT:    s_mov_b32 s13, s3
+; GFX11-DENORM-NEXT:    s_mov_b32 s16, s4
+; GFX11-DENORM-NEXT:    s_mov_b32 s17, s5
+; GFX11-DENORM-NEXT:    s_mov_b32 s20, s6
+; GFX11-DENORM-NEXT:    s_mov_b32 s21, s7
+; GFX11-DENORM-NEXT:    buffer_load_u16 v0, off, s[12:15], 0
+; GFX11-DENORM-NEXT:    buffer_load_u16 v1, off, s[16:19], 0
+; GFX11-DENORM-NEXT:    buffer_load_u16 v2, off, s[20:23], 0
+; GFX11-DENORM-NEXT:    s_mov_b32 s8, s0
+; GFX11-DENORM-NEXT:    s_mov_b32 s9, s1
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v2, v0, v1
+; GFX11-DENORM-NEXT:    buffer_store_b16 v2, off, s[8:11], 0
+; GFX11-DENORM-NEXT:    s_nop 0
+; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b,
@@ -48,31 +234,182 @@ define amdgpu_kernel void @fmuladd_f16(
   ret void
 }
 
-; GCN-LABEL: {{^}}fmuladd_f16_imm_a
-; GCN: buffer_load_{{ushort|u16}} v[[B_F16:[0-9]+]]
-; GCN: buffer_load_{{ushort|u16}} v[[C_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
-; SI:  v_mac_f32_e32 v[[C_F32]], 0x40400000, v[[B_F32]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]]
-; SI:  buffer_store_short v[[R_F16]]
-
-; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], 0x4200, v[[B_F16]]
-; VI-FLUSH: buffer_store_short v[[C_F16]]
-
-; VI-DENORM: s_movk_i32 [[KA:s[0-9]+]], 0x4200
-; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[B_F16]], [[KA]], v[[C_F16]]
-; VI-DENORM: buffer_store_short [[RESULT]]
-
-; GFX10PLUS-FLUSH: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x4200, v[[B_F16]]
-; GFX10PLUS-FLUSH: v_add_f16_e32 [[ADD:v[0-9]+]], [[MUL]], v[[C_F16]]
-; GFX10PLUS-FLUSH: buffer_store_{{short|b16}} [[ADD]]
-
-; GFX10PLUS-DENORM: v_fmac_f16_e32 v[[C_F16]], 0x4200, v[[B_F16]]
-; GFX10PLUS-DENORM: buffer_store_{{short|b16}} v[[C_F16]],
-
-; GCN: s_endpgm
 define amdgpu_kernel void @fmuladd_f16_imm_a(
+; SI-LABEL: fmuladd_f16_imm_a:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s14, s2
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s12, s6
+; SI-NEXT:    s_mov_b32 s13, s7
+; SI-NEXT:    s_mov_b32 s15, s3
+; SI-NEXT:    s_mov_b32 s10, s2
+; SI-NEXT:    s_mov_b32 s11, s3
+; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_mac_f32_e32 v1, 0x40400000, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v1
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-FLUSH-LABEL: fmuladd_f16_imm_a:
+; VI-FLUSH:       ; %bb.0:
+; VI-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-FLUSH-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-FLUSH-NEXT:    s_mov_b32 s3, 0xf000
+; VI-FLUSH-NEXT:    s_mov_b32 s2, -1
+; VI-FLUSH-NEXT:    s_mov_b32 s14, s2
+; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-FLUSH-NEXT:    s_mov_b32 s12, s6
+; VI-FLUSH-NEXT:    s_mov_b32 s13, s7
+; VI-FLUSH-NEXT:    s_mov_b32 s15, s3
+; VI-FLUSH-NEXT:    s_mov_b32 s10, s2
+; VI-FLUSH-NEXT:    s_mov_b32 s11, s3
+; VI-FLUSH-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    s_mov_b32 s0, s4
+; VI-FLUSH-NEXT:    s_mov_b32 s1, s5
+; VI-FLUSH-NEXT:    v_mac_f16_e32 v1, 0x4200, v0
+; VI-FLUSH-NEXT:    buffer_store_short v1, off, s[0:3], 0
+; VI-FLUSH-NEXT:    s_endpgm
+;
+; VI-DENORM-LABEL: fmuladd_f16_imm_a:
+; VI-DENORM:       ; %bb.0:
+; VI-DENORM-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-DENORM-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-DENORM-NEXT:    s_mov_b32 s3, 0xf000
+; VI-DENORM-NEXT:    s_mov_b32 s2, -1
+; VI-DENORM-NEXT:    s_mov_b32 s14, s2
+; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DENORM-NEXT:    s_mov_b32 s12, s6
+; VI-DENORM-NEXT:    s_mov_b32 s13, s7
+; VI-DENORM-NEXT:    s_mov_b32 s15, s3
+; VI-DENORM-NEXT:    s_mov_b32 s10, s2
+; VI-DENORM-NEXT:    s_mov_b32 s11, s3
+; VI-DENORM-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
+; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
+; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-NEXT:    s_mov_b32 s0, s4
+; VI-DENORM-NEXT:    s_movk_i32 s4, 0x4200
+; VI-DENORM-NEXT:    s_mov_b32 s1, s5
+; VI-DENORM-NEXT:    v_fma_f16 v0, v0, s4, v1
+; VI-DENORM-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-DENORM-NEXT:    s_endpgm
+;
+; GFX10-FLUSH-LABEL: fmuladd_f16_imm_a:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_clause 0x1
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX10-FLUSH-NEXT:    s_mov_b32 s2, -1
+; GFX10-FLUSH-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-FLUSH-NEXT:    s_mov_b32 s14, s2
+; GFX10-FLUSH-NEXT:    s_mov_b32 s15, s3
+; GFX10-FLUSH-NEXT:    s_mov_b32 s10, s2
+; GFX10-FLUSH-NEXT:    s_mov_b32 s11, s3
+; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    s_mov_b32 s12, s6
+; GFX10-FLUSH-NEXT:    s_mov_b32 s13, s7
+; GFX10-FLUSH-NEXT:    s_mov_b32 s0, s4
+; GFX10-FLUSH-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    s_mov_b32 s1, s5
+; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v0, 0x4200, v0
+; GFX10-FLUSH-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX10-FLUSH-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX10-FLUSH-NEXT:    s_endpgm
+;
+; GFX10-DENORM-LABEL: fmuladd_f16_imm_a:
+; GFX10-DENORM:       ; %bb.0:
+; GFX10-DENORM-NEXT:    s_clause 0x1
+; GFX10-DENORM-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DENORM-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX10-DENORM-NEXT:    s_mov_b32 s2, -1
+; GFX10-DENORM-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-DENORM-NEXT:    s_mov_b32 s14, s2
+; GFX10-DENORM-NEXT:    s_mov_b32 s15, s3
+; GFX10-DENORM-NEXT:    s_mov_b32 s10, s2
+; GFX10-DENORM-NEXT:    s_mov_b32 s11, s3
+; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-NEXT:    s_mov_b32 s12, s6
+; GFX10-DENORM-NEXT:    s_mov_b32 s13, s7
+; GFX10-DENORM-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc dlc
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc dlc
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT:    s_mov_b32 s0, s4
+; GFX10-DENORM-NEXT:    s_mov_b32 s1, s5
+; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v1, 0x4200, v0
+; GFX10-DENORM-NEXT:    buffer_store_short v1, off, s[0:3], 0
+; GFX10-DENORM-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-LABEL: fmuladd_f16_imm_a:
+; GFX11-FLUSH:       ; %bb.0:
+; GFX11-FLUSH-NEXT:    s_clause 0x1
+; GFX11-FLUSH-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FLUSH-NEXT:    s_mov_b32 s10, -1
+; GFX11-FLUSH-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX11-FLUSH-NEXT:    s_mov_b32 s14, s10
+; GFX11-FLUSH-NEXT:    s_mov_b32 s15, s11
+; GFX11-FLUSH-NEXT:    s_mov_b32 s2, s10
+; GFX11-FLUSH-NEXT:    s_mov_b32 s3, s11
+; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-NEXT:    s_mov_b32 s12, s6
+; GFX11-FLUSH-NEXT:    s_mov_b32 s13, s7
+; GFX11-FLUSH-NEXT:    s_mov_b32 s8, s4
+; GFX11-FLUSH-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    s_mov_b32 s9, s5
+; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v0, 0x4200, v0
+; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-FLUSH-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-FLUSH-NEXT:    s_nop 0
+; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FLUSH-NEXT:    s_endpgm
+;
+; GFX11-DENORM-LABEL: fmuladd_f16_imm_a:
+; GFX11-DENORM:       ; %bb.0:
+; GFX11-DENORM-NEXT:    s_clause 0x1
+; GFX11-DENORM-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DENORM-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DENORM-NEXT:    s_mov_b32 s10, -1
+; GFX11-DENORM-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX11-DENORM-NEXT:    s_mov_b32 s14, s10
+; GFX11-DENORM-NEXT:    s_mov_b32 s15, s11
+; GFX11-DENORM-NEXT:    s_mov_b32 s2, s10
+; GFX11-DENORM-NEXT:    s_mov_b32 s3, s11
+; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-NEXT:    s_mov_b32 s12, s6
+; GFX11-DENORM-NEXT:    s_mov_b32 s13, s7
+; GFX11-DENORM-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-NEXT:    buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-NEXT:    s_mov_b32 s8, s4
+; GFX11-DENORM-NEXT:    s_mov_b32 s9, s5
+; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v1, 0x4200, v0
+; GFX11-DENORM-NEXT:    buffer_store_b16 v1, off, s[8:11], 0
+; GFX11-DENORM-NEXT:    s_nop 0
+; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %b,
     ptr addrspace(1) %c) {
@@ -83,31 +420,182 @@ define amdgpu_kernel void @fmuladd_f16_imm_a(
   ret void
 }
 
-; GCN-LABEL: {{^}}fmuladd_f16_imm_b
-; GCN: buffer_load_{{ushort|u16}} v[[A_F16:[0-9]+]]
-; GCN: buffer_load_{{ushort|u16}} v[[C_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
-; SI:  v_mac_f32_e32 v[[C_F32]], 0x40400000, v[[A_F32]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[C_F32]]
-; SI:  buffer_store_short v[[R_F16]]
-
-; VI-FLUSH: v_mac_f16_e32 v[[C_F16]], 0x4200, v[[A_F16]]
-; VI-FLUSH: buffer_store_short v[[C_F16]]
-
-; VI-DENORM: s_movk_i32 [[KA:s[0-9]+]], 0x4200
-; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], [[KA]], v[[C_F16]]
-; VI-DENORM: buffer_store_short [[RESULT]]
-
-; GFX10PLUS-FLUSH: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x4200, v[[A_F16]]
-; GFX10PLUS-FLUSH: v_add_f16_e32 [[ADD:v[0-9]+]], [[MUL]], v[[C_F16]]
-; GFX10PLUS-FLUSH: buffer_store_{{short|b16}} [[ADD]]
-
-; GFX10PLUS-DENORM: v_fmac_f16_e32 v[[C_F16]], 0x4200, v[[A_F16]]
-; GFX10PLUS-DENORM: buffer_store_{{short|b16}} v[[C_F16]],
-
-; GCN: s_endpgm
 define amdgpu_kernel void @fmuladd_f16_imm_b(
+; SI-LABEL: fmuladd_f16_imm_b:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_mov_b32 s14, s2
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s12, s6
+; SI-NEXT:    s_mov_b32 s13, s7
+; SI-NEXT:    s_mov_b32 s15, s3
+; SI-NEXT:    s_mov_b32 s10, s2
+; SI-NEXT:    s_mov_b32 s11, s3
+; SI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_mov_b32 s0, s4
+; SI-NEXT:    s_mov_b32 s1, s5
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_mac_f32_e32 v1, 0x40400000, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v1
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-FLUSH-LABEL: fmuladd_f16_imm_b:
+; VI-FLUSH:       ; %bb.0:
+; VI-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-FLUSH-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-FLUSH-NEXT:    s_mov_b32 s3, 0xf000
+; VI-FLUSH-NEXT:    s_mov_b32 s2, -1
+; VI-FLUSH-NEXT:    s_mov_b32 s14, s2
+; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-FLUSH-NEXT:    s_mov_b32 s12, s6
+; VI-FLUSH-NEXT:    s_mov_b32 s13, s7
+; VI-FLUSH-NEXT:    s_mov_b32 s15, s3
+; VI-FLUSH-NEXT:    s_mov_b32 s10, s2
+; VI-FLUSH-NEXT:    s_mov_b32 s11, s3
+; VI-FLUSH-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    s_mov_b32 s0, s4
+; VI-FLUSH-NEXT:    s_mov_b32 s1, s5
+; VI-FLUSH-NEXT:    v_mac_f16_e32 v1, 0x4200, v0
+; VI-FLUSH-NEXT:    buffer_store_short v1, off, s[0:3], 0
+; VI-FLUSH-NEXT:    s_endpgm
+;
+; VI-DENORM-LABEL: fmuladd_f16_imm_b:
+; VI-DENORM:       ; %bb.0:
+; VI-DENORM-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-DENORM-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-DENORM-NEXT:    s_mov_b32 s3, 0xf000
+; VI-DENORM-NEXT:    s_mov_b32 s2, -1
+; VI-DENORM-NEXT:    s_mov_b32 s14, s2
+; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DENORM-NEXT:    s_mov_b32 s12, s6
+; VI-DENORM-NEXT:    s_mov_b32 s13, s7
+; VI-DENORM-NEXT:    s_mov_b32 s15, s3
+; VI-DENORM-NEXT:    s_mov_b32 s10, s2
+; VI-DENORM-NEXT:    s_mov_b32 s11, s3
+; VI-DENORM-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc
+; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc
+; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-NEXT:    s_mov_b32 s0, s4
+; VI-DENORM-NEXT:    s_movk_i32 s4, 0x4200
+; VI-DENORM-NEXT:    s_mov_b32 s1, s5
+; VI-DENORM-NEXT:    v_fma_f16 v0, v0, s4, v1
+; VI-DENORM-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-DENORM-NEXT:    s_endpgm
+;
+; GFX10-FLUSH-LABEL: fmuladd_f16_imm_b:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_clause 0x1
+; GFX10-FLUSH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX10-FLUSH-NEXT:    s_mov_b32 s2, -1
+; GFX10-FLUSH-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-FLUSH-NEXT:    s_mov_b32 s14, s2
+; GFX10-FLUSH-NEXT:    s_mov_b32 s15, s3
+; GFX10-FLUSH-NEXT:    s_mov_b32 s10, s2
+; GFX10-FLUSH-NEXT:    s_mov_b32 s11, s3
+; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    s_mov_b32 s12, s6
+; GFX10-FLUSH-NEXT:    s_mov_b32 s13, s7
+; GFX10-FLUSH-NEXT:    s_mov_b32 s0, s4
+; GFX10-FLUSH-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc dlc
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    s_mov_b32 s1, s5
+; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v0, 0x4200, v0
+; GFX10-FLUSH-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX10-FLUSH-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX10-FLUSH-NEXT:    s_endpgm
+;
+; GFX10-DENORM-LABEL: fmuladd_f16_imm_b:
+; GFX10-DENORM:       ; %bb.0:
+; GFX10-DENORM-NEXT:    s_clause 0x1
+; GFX10-DENORM-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DENORM-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX10-DENORM-NEXT:    s_mov_b32 s2, -1
+; GFX10-DENORM-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-DENORM-NEXT:    s_mov_b32 s14, s2
+; GFX10-DENORM-NEXT:    s_mov_b32 s15, s3
+; GFX10-DENORM-NEXT:    s_mov_b32 s10, s2
+; GFX10-DENORM-NEXT:    s_mov_b32 s11, s3
+; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-NEXT:    s_mov_b32 s12, s6
+; GFX10-DENORM-NEXT:    s_mov_b32 s13, s7
+; GFX10-DENORM-NEXT:    buffer_load_ushort v0, off, s[12:15], 0 glc dlc
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 glc dlc
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT:    s_mov_b32 s0, s4
+; GFX10-DENORM-NEXT:    s_mov_b32 s1, s5
+; GFX10-DENORM-NEXT:    v_fmac_f16_e32 v1, 0x4200, v0
+; GFX10-DENORM-NEXT:    buffer_store_short v1, off, s[0:3], 0
+; GFX10-DENORM-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-LABEL: fmuladd_f16_imm_b:
+; GFX11-FLUSH:       ; %bb.0:
+; GFX11-FLUSH-NEXT:    s_clause 0x1
+; GFX11-FLUSH-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FLUSH-NEXT:    s_mov_b32 s10, -1
+; GFX11-FLUSH-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX11-FLUSH-NEXT:    s_mov_b32 s14, s10
+; GFX11-FLUSH-NEXT:    s_mov_b32 s15, s11
+; GFX11-FLUSH-NEXT:    s_mov_b32 s2, s10
+; GFX11-FLUSH-NEXT:    s_mov_b32 s3, s11
+; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-NEXT:    s_mov_b32 s12, s6
+; GFX11-FLUSH-NEXT:    s_mov_b32 s13, s7
+; GFX11-FLUSH-NEXT:    s_mov_b32 s8, s4
+; GFX11-FLUSH-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    s_mov_b32 s9, s5
+; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v0, 0x4200, v0
+; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-NEXT:    v_add_f16_e32 v0, v0, v1
+; GFX11-FLUSH-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-FLUSH-NEXT:    s_nop 0
+; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FLUSH-NEXT:    s_endpgm
+;
+; GFX11-DENORM-LABEL: fmuladd_f16_imm_b:
+; GFX11-DENORM:       ; %bb.0:
+; GFX11-DENORM-NEXT:    s_clause 0x1
+; GFX11-DENORM-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DENORM-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DENORM-NEXT:    s_mov_b32 s10, -1
+; GFX11-DENORM-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX11-DENORM-NEXT:    s_mov_b32 s14, s10
+; GFX11-DENORM-NEXT:    s_mov_b32 s15, s11
+; GFX11-DENORM-NEXT:    s_mov_b32 s2, s10
+; GFX11-DENORM-NEXT:    s_mov_b32 s3, s11
+; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-NEXT:    s_mov_b32 s12, s6
+; GFX11-DENORM-NEXT:    s_mov_b32 s13, s7
+; GFX11-DENORM-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-NEXT:    buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-NEXT:    s_mov_b32 s8, s4
+; GFX11-DENORM-NEXT:    s_mov_b32 s9, s5
+; GFX11-DENORM-NEXT:    v_fmac_f16_e32 v1, 0x4200, v0
+; GFX11-DENORM-NEXT:    buffer_store_b16 v1, off, s[8:11], 0
+; GFX11-DENORM-NEXT:    s_nop 0
+; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %c) {
@@ -118,64 +606,240 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
   ret void
 }
 
-; GCN-LABEL: {{^}}fmuladd_v2f16
-; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
-
-; VI-FLUSH: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; VI-FLUSH: buffer_load_dword v[[C_V2_F16:[0-9]+]]
-; VI-FLUSH: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-
-; VI-DENORM: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; VI-DENORM: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; VI-DENORM: buffer_load_dword v[[C_V2_F16:[0-9]+]]
-
-; GFX10PLUS: buffer_load_{{dword|b32}} v[[A_V2_F16:[0-9]+]]
-; GFX10PLUS: buffer_load_{{dword|b32}} v[[B_V2_F16:[0-9]+]]
-; GFX10PLUS: buffer_load_{{dword|b32}} v[[C_V2_F16:[0-9]+]]
-
-; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
-
-; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
-
-; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI-DAG:  v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
-; SI-DAG:  v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]]
-; SI-DAG:  v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]]
-; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
-; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
-; SI-DAG:  v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; SI:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
-
-; VI-FLUSH:     v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
-; VI-FLUSH-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-FLUSH-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[A_V2_F16]], v[[B_V2_F16]]
-; VI-FLUSH-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]]
-; VI-FLUSH-NOT: v_and_b32
-; VI-FLUSH:     v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], v[[R_F16_HI]]
-
-; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
-; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[C_V2_F16]], v[[B_V2_F16]], v[[A_V2_F16]]
-; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[C_F16_1]], v[[B_F16_1]], v[[A_F16_1]]
-; VI-DENORM-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]]
-; VI-DENORM-NOT: v_and_b32
-; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[RES0]], v[[R_F16_HI]]
-
-; GFX10PLUS-FLUSH: v_pk_mul_f16 [[MUL:v[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
-; GFX10PLUS-FLUSH: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[MUL]], v[[C_V2_F16]]
-
-; GFX10PLUS-DENORM: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
-
-; GCN: buffer_store_{{dword|b32}} v[[R_V2_F16]]
 define amdgpu_kernel void @fmuladd_v2f16(
+; SI-LABEL: fmuladd_v2f16:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s11, 0xf000
+; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    s_mov_b32 s14, s10
+; SI-NEXT:    s_mov_b32 s15, s11
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s12, s2
+; SI-NEXT:    s_mov_b32 s13, s3
+; SI-NEXT:    s_mov_b32 s16, s4
+; SI-NEXT:    s_mov_b32 s17, s5
+; SI-NEXT:    s_mov_b32 s18, s10
+; SI-NEXT:    s_mov_b32 s19, s11
+; SI-NEXT:    s_mov_b32 s4, s6
+; SI-NEXT:    s_mov_b32 s5, s7
+; SI-NEXT:    s_mov_b32 s6, s10
+; SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0
+; SI-NEXT:    s_mov_b32 s7, s11
+; SI-NEXT:    buffer_load_dword v1, off, s[16:19], 0
+; SI-NEXT:    buffer_load_dword v2, off, s[4:7], 0
+; SI-NEXT:    s_mov_b32 s8, s0
+; SI-NEXT:    s_mov_b32 s9, s1
+; SI-NEXT:    s_waitcnt vmcnt(2)
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    v_mac_f32_e32 v5, v0, v4
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v5
+; SI-NEXT:    v_mac_f32_e32 v2, v3, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-FLUSH-LABEL: fmuladd_v2f16:
+; VI-FLUSH:       ; %bb.0:
+; VI-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-FLUSH-NEXT:    s_mov_b32 s11, 0xf000
+; VI-FLUSH-NEXT:    s_mov_b32 s10, -1
+; VI-FLUSH-NEXT:    s_mov_b32 s14, s10
+; VI-FLUSH-NEXT:    s_mov_b32 s15, s11
+; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-FLUSH-NEXT:    s_mov_b32 s12, s2
+; VI-FLUSH-NEXT:    s_mov_b32 s13, s3
+; VI-FLUSH-NEXT:    s_mov_b32 s16, s4
+; VI-FLUSH-NEXT:    s_mov_b32 s17, s5
+; VI-FLUSH-NEXT:    s_mov_b32 s18, s10
+; VI-FLUSH-NEXT:    s_mov_b32 s19, s11
+; VI-FLUSH-NEXT:    s_mov_b32 s4, s6
+; VI-FLUSH-NEXT:    s_mov_b32 s5, s7
+; VI-FLUSH-NEXT:    s_mov_b32 s6, s10
+; VI-FLUSH-NEXT:    s_mov_b32 s7, s11
+; VI-FLUSH-NEXT:    buffer_load_dword v0, off, s[12:15], 0
+; VI-FLUSH-NEXT:    buffer_load_dword v1, off, s[4:7], 0
+; VI-FLUSH-NEXT:    buffer_load_dword v2, off, s[16:19], 0
+; VI-FLUSH-NEXT:    s_mov_b32 s8, s0
+; VI-FLUSH-NEXT:    s_mov_b32 s9, s1
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(1)
+; VI-FLUSH-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; VI-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; VI-FLUSH-NEXT:    v_mac_f16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-FLUSH-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; VI-FLUSH-NEXT:    v_mac_f16_e32 v1, v0, v2
+; VI-FLUSH-NEXT:    v_or_b32_e32 v0, v1, v3
+; VI-FLUSH-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; VI-FLUSH-NEXT:    s_endpgm
+;
+; VI-DENORM-LABEL: fmuladd_v2f16:
+; VI-DENORM:       ; %bb.0:
+; VI-DENORM-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-DENORM-NEXT:    s_mov_b32 s11, 0xf000
+; VI-DENORM-NEXT:    s_mov_b32 s10, -1
+; VI-DENORM-NEXT:    s_mov_b32 s14, s10
+; VI-DENORM-NEXT:    s_mov_b32 s15, s11
+; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-DENORM-NEXT:    s_mov_b32 s16, s4
+; VI-DENORM-NEXT:    s_mov_b32 s17, s5
+; VI-DENORM-NEXT:    s_mov_b32 s4, s6
+; VI-DENORM-NEXT:    s_mov_b32 s5, s7
+; VI-DENORM-NEXT:    s_mov_b32 s6, s10
+; VI-DENORM-NEXT:    s_mov_b32 s7, s11
+; VI-DENORM-NEXT:    s_mov_b32 s12, s2
+; VI-DENORM-NEXT:    s_mov_b32 s13, s3
+; VI-DENORM-NEXT:    s_mov_b32 s18, s10
+; VI-DENORM-NEXT:    s_mov_b32 s19, s11
+; VI-DENORM-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; VI-DENORM-NEXT:    buffer_load_dword v1, off, s[16:19], 0
+; VI-DENORM-NEXT:    buffer_load_dword v2, off, s[12:15], 0
+; VI-DENORM-NEXT:    s_mov_b32 s8, s0
+; VI-DENORM-NEXT:    s_mov_b32 s9, s1
+; VI-DENORM-NEXT:    s_waitcnt vmcnt(2)
+; VI-DENORM-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; VI-DENORM-NEXT:    s_waitcnt vmcnt(1)
+; VI-DENORM-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; VI-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; VI-DENORM-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; VI-DENORM-NEXT:    v_fma_f16 v3, v5, v4, v3
+; VI-DENORM-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; VI-DENORM-NEXT:    v_fma_f16 v0, v2, v1, v0
+; VI-DENORM-NEXT:    v_or_b32_e32 v0, v0, v3
+; VI-DENORM-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; VI-DENORM-NEXT:    s_endpgm
+;
+; GFX10-FLUSH-LABEL: fmuladd_v2f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-FLUSH-NEXT:    s_mov_b32 s10, -1
+; GFX10-FLUSH-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX10-FLUSH-NEXT:    s_mov_b32 s14, s10
+; GFX10-FLUSH-NEXT:    s_mov_b32 s15, s11
+; GFX10-FLUSH-NEXT:    s_mov_b32 s18, s10
+; GFX10-FLUSH-NEXT:    s_mov_b32 s19, s11
+; GFX10-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    s_mov_b32 s12, s2
+; GFX10-FLUSH-NEXT:    s_mov_b32 s13, s3
+; GFX10-FLUSH-NEXT:    s_mov_b32 s16, s4
+; GFX10-FLUSH-NEXT:    s_mov_b32 s17, s5
+; GFX10-FLUSH-NEXT:    buffer_load_dword v0, off, s[12:15], 0
+; GFX10-FLUSH-NEXT:    buffer_load_dword v1, off, s[16:19], 0
+; GFX10-FLUSH-NEXT:    s_mov_b32 s4, s6
+; GFX10-FLUSH-NEXT:    s_mov_b32 s5, s7
+; GFX10-FLUSH-NEXT:    s_mov_b32 s6, s10
+; GFX10-FLUSH-NEXT:    s_mov_b32 s7, s11
+; GFX10-FLUSH-NEXT:    s_mov_b32 s8, s0
+; GFX10-FLUSH-NEXT:    buffer_load_dword v2, off, s[4:7], 0
+; GFX10-FLUSH-NEXT:    s_mov_b32 s9, s1
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-FLUSH-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FLUSH-NEXT:    v_pk_add_f16 v0, v0, v2
+; GFX10-FLUSH-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX10-FLUSH-NEXT:    s_endpgm
+;
+; GFX10-DENORM-LABEL: fmuladd_v2f16:
+; GFX10-DENORM:       ; %bb.0:
+; GFX10-DENORM-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DENORM-NEXT:    s_mov_b32 s10, -1
+; GFX10-DENORM-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX10-DENORM-NEXT:    s_mov_b32 s14, s10
+; GFX10-DENORM-NEXT:    s_mov_b32 s15, s11
+; GFX10-DENORM-NEXT:    s_mov_b32 s18, s10
+; GFX10-DENORM-NEXT:    s_mov_b32 s19, s11
+; GFX10-DENORM-NEXT:    s_mov_b32 s22, s10
+; GFX10-DENORM-NEXT:    s_mov_b32 s23, s11
+; GFX10-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-DENORM-NEXT:    s_mov_b32 s12, s2
+; GFX10-DENORM-NEXT:    s_mov_b32 s13, s3
+; GFX10-DENORM-NEXT:    s_mov_b32 s16, s4
+; GFX10-DENORM-NEXT:    s_mov_b32 s17, s5
+; GFX10-DENORM-NEXT:    s_mov_b32 s20, s6
+; GFX10-DENORM-NEXT:    s_mov_b32 s21, s7
+; GFX10-DENORM-NEXT:    buffer_load_dword v0, off, s[12:15], 0
+; GFX10-DENORM-NEXT:    buffer_load_dword v1, off, s[16:19], 0
+; GFX10-DENORM-NEXT:    buffer_load_dword v2, off, s[20:23], 0
+; GFX10-DENORM-NEXT:    s_mov_b32 s8, s0
+; GFX10-DENORM-NEXT:    s_mov_b32 s9, s1
+; GFX10-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-DENORM-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
+; GFX10-DENORM-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX10-DENORM-NEXT:    s_endpgm
+;
+; GFX11-FLUSH-LABEL: fmuladd_v2f16:
+; GFX11-FLUSH:       ; %bb.0:
+; GFX11-FLUSH-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-FLUSH-NEXT:    s_mov_b32 s10, -1
+; GFX11-FLUSH-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX11-FLUSH-NEXT:    s_mov_b32 s14, s10
+; GFX11-FLUSH-NEXT:    s_mov_b32 s15, s11
+; GFX11-FLUSH-NEXT:    s_mov_b32 s18, s10
+; GFX11-FLUSH-NEXT:    s_mov_b32 s19, s11
+; GFX11-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-NEXT:    s_mov_b32 s12, s2
+; GFX11-FLUSH-NEXT:    s_mov_b32 s13, s3
+; GFX11-FLUSH-NEXT:    s_mov_b32 s16, s4
+; GFX11-FLUSH-NEXT:    s_mov_b32 s17, s5
+; GFX11-FLUSH-NEXT:    buffer_load_b32 v0, off, s[12:15], 0
+; GFX11-FLUSH-NEXT:    buffer_load_b32 v1, off, s[16:19], 0
+; GFX11-FLUSH-NEXT:    s_mov_b32 s4, s6
+; GFX11-FLUSH-NEXT:    s_mov_b32 s5, s7
+; GFX11-FLUSH-NEXT:    s_mov_b32 s6, s10
+; GFX11-FLUSH-NEXT:    s_mov_b32 s7, s11
+; GFX11-FLUSH-NEXT:    s_mov_b32 s8, s0
+; GFX11-FLUSH-NEXT:    buffer_load_b32 v2, off, s[4:7], 0
+; GFX11-FLUSH-NEXT:    s_mov_b32 s9, s1
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-FLUSH-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FLUSH-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-NEXT:    v_pk_add_f16 v0, v0, v2
+; GFX11-FLUSH-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-FLUSH-NEXT:    s_nop 0
+; GFX11-FLUSH-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FLUSH-NEXT:    s_endpgm
+;
+; GFX11-DENORM-LABEL: fmuladd_v2f16:
+; GFX11-DENORM:       ; %bb.0:
+; GFX11-DENORM-NEXT:    s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DENORM-NEXT:    s_mov_b32 s10, -1
+; GFX11-DENORM-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX11-DENORM-NEXT:    s_mov_b32 s14, s10
+; GFX11-DENORM-NEXT:    s_mov_b32 s15, s11
+; GFX11-DENORM-NEXT:    s_mov_b32 s18, s10
+; GFX11-DENORM-NEXT:    s_mov_b32 s19, s11
+; GFX11-DENORM-NEXT:    s_mov_b32 s22, s10
+; GFX11-DENORM-NEXT:    s_mov_b32 s23, s11
+; GFX11-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-NEXT:    s_mov_b32 s12, s2
+; GFX11-DENORM-NEXT:    s_mov_b32 s13, s3
+; GFX11-DENORM-NEXT:    s_mov_b32 s16, s4
+; GFX11-DENORM-NEXT:    s_mov_b32 s17, s5
+; GFX11-DENORM-NEXT:    s_mov_b32 s20, s6
+; GFX11-DENORM-NEXT:    s_mov_b32 s21, s7
+; GFX11-DENORM-NEXT:    buffer_load_b32 v0, off, s[12:15], 0
+; GFX11-DENORM-NEXT:    buffer_load_b32 v1, off, s[16:19], 0
+; GFX11-DENORM-NEXT:    buffer_load_b32 v2, off, s[20:23], 0
+; GFX11-DENORM-NEXT:    s_mov_b32 s8, s0
+; GFX11-DENORM-NEXT:    s_mov_b32 s9, s1
+; GFX11-DENORM-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-DENORM-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
+; GFX11-DENORM-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-DENORM-NEXT:    s_nop 0
+; GFX11-DENORM-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-DENORM-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b,


        


More information about the llvm-commits mailing list