[llvm] [AMDGPU] Allow folding to FMAAK with SGPR and immediate operand on GFX10+ (PR #72266)

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 14 07:13:22 PST 2023


https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/72266

Allow foldImmediate to create instructions like:

  v_fmaak_f32 v0, s0, v0, 0x42000000

This instruction has two "scalar values": s0 and 0x42000000. On GFX10+
this is allowed. This fold was originally implemented before the
compiler supported GFX10, when all ASICs were limited to one scalar
value.


From 7d1344ee23e69a4cf626ad5b459adc5544f2e105 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Tue, 14 Nov 2023 14:51:42 +0000
Subject: [PATCH 1/2] [AMDGPU] Generate checks in madak.ll

---
 llvm/test/CodeGen/AMDGPU/madak.ll | 1475 ++++++++++++++++++++++++++---
 1 file changed, 1350 insertions(+), 125 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll
index 50a3bb187c4ac54..a43e23e4aeeada1 100644
--- a/llvm/test/CodeGen/AMDGPU/madak.ll
+++ b/llvm/test/CodeGen/AMDGPU/madak.ll
@@ -1,28 +1,150 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6_8_9,MAD %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX6_8_9,GFX8_9,GFX8PLUS,MAD %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX6_8_9,GFX8_9,GFX8PLUS,MAD %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX8PLUS,GFX10-MAD %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX8PLUS,FMA,GFX10PLUS-FMA %s
-; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX9,GFX8PLUS,FMA,GFX940-FMA %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX8PLUS,GFX11-MAD %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX8PLUS,FMA,GFX10PLUS-FMA %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-MAD %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-MAD %s
+
+; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GFX940-FMA %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GFX10-FMA %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GFX11-FMA %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 declare float @llvm.fabs.f32(float) nounwind readnone
 
-; GCN-LABEL: {{^}}madak_f32:
-; GFX6:   buffer_load_dword [[VA:v[0-9]+]]
-; GFX6:   buffer_load_dword [[VB:v[0-9]+]]
-; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
-; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]]
-; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
-; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
-; GFX10PLUS: {{flat|global}}_load_{{dword|b32}} [[VA:v[0-9]+]]
-; GFX10PLUS: {{flat|global}}_load_{{dword|b32}} [[VB:v[0-9]+]]
-; MAD:   v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
-; GFX10-MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
-; FMA:   v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
 define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 {
+; GFX6-LABEL: madak_f32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, 0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b64 s[0:1], s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_madak_f32 v2, v2, v3, 0x41200000
+; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    s_endpgm
+;
+; GFX8-LABEL: madak_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v4
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT:    flat_load_dword v5, v[0:1]
+; GFX8-NEXT:    flat_load_dword v2, v[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_madak_f32 v2, v5, v2, 0x41200000
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: madak_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_madak_f32 v1, v1, v2, 0x41200000
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-MAD-LABEL: madak_f32:
+; GFX10-MAD:       ; %bb.0:
+; GFX10-MAD-NEXT:    s_clause 0x1
+; GFX10-MAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-MAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-MAD-NEXT:    s_clause 0x1
+; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-MAD-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-MAD-NEXT:    v_madak_f32 v1, v1, v2, 0x41200000
+; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-MAD-NEXT:    s_endpgm
+;
+; GFX11-MAD-LABEL: madak_f32:
+; GFX11-MAD:       ; %bb.0:
+; GFX11-MAD-NEXT:    s_clause 0x1
+; GFX11-MAD-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-MAD-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-MAD-NEXT:    s_clause 0x1
+; GFX11-MAD-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-MAD-NEXT:    global_load_b32 v2, v0, s[0:1]
+; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-MAD-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-MAD-NEXT:    v_add_f32_e32 v1, 0x41200000, v1
+; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-MAD-NEXT:    s_nop 0
+; GFX11-MAD-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-MAD-NEXT:    s_endpgm
+;
+; GFX940-FMA-LABEL: madak_f32:
+; GFX940-FMA:       ; %bb.0:
+; GFX940-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX940-FMA-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX940-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX940-FMA-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT:    v_fmaak_f32 v1, v1, v2, 0x41200000
+; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-FMA-NEXT:    s_endpgm
+;
+; GFX10-FMA-LABEL: madak_f32:
+; GFX10-FMA:       ; %bb.0:
+; GFX10-FMA-NEXT:    s_clause 0x1
+; GFX10-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-FMA-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FMA-NEXT:    s_clause 0x1
+; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-FMA-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FMA-NEXT:    v_fmaak_f32 v1, v1, v2, 0x41200000
+; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-FMA-NEXT:    s_endpgm
+;
+; GFX11-FMA-LABEL: madak_f32:
+; GFX11-FMA:       ; %bb.0:
+; GFX11-FMA-NEXT:    s_clause 0x1
+; GFX11-FMA-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT:    s_clause 0x1
+; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT:    v_fmaak_f32 v1, v1, v2, 0x41200000
+; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-FMA-NEXT:    s_nop 0
+; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
   %in.b.gep = getelementptr float, ptr addrspace(1) %in.b, i32 %tid
@@ -40,25 +162,183 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac
 ; Make sure this is only folded with one use. This is a code size
 ; optimization and if we fold the immediate multiple times, we'll undo
 ; it.
-
-; GCN-LABEL: {{^}}madak_2_use_f32:
-; GFX9:         v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GFX6-DAG:     buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
-; GFX6-DAG:     buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GFX6-DAG:     buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
-; GFX8PLUS:     {{flat|global}}_load_{{dword|b32}} [[VA:v[0-9]+]],
-; GFX8PLUS:     {{flat|global}}_load_{{dword|b32}} [[VB:v[0-9]+]],
-; GFX8PLUS:     {{flat|global}}_load_{{dword|b32}} [[VC:v[0-9]+]],
-; GFX6-DAG:     v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GFX8-DAG:     v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GFX6_8_9-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
-; GFX10-MAD-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
-; FMA-DAG:      v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
-; MAD-DAG:      v_mac_f32_e32 [[VK]], [[VA]], [[VC]]
-; GFX10PLUS-FMA-DAG: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VC]], 0x41200000
-; GFX940-FMA-DAG:v_fmac_f32_e32 [[VK]], [[VA]], [[VC]]
-; GCN:          s_endpgm
 define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
+; GFX6-LABEL: madak_2_use_f32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v5, 0x41200000
+; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GFX6-NEXT:    v_madak_f32 v3, v2, v3, 0x41200000
+; GFX6-NEXT:    v_mac_f32_e32 v5, v2, v4
+; GFX6-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v5, v[0:1], s[4:7], 0 addr64 offset:4
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_endpgm
+;
+; GFX8-LABEL: madak_2_use_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 8, v0
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v7, v[0:1] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_load_dword v8, v[2:3] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_load_dword v4, v[4:5] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x41200000
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_madak_f32 v6, v7, v8, 0x41200000
+; GFX8-NEXT:    v_mac_f32_e32 v5, v7, v4
+; GFX8-NEXT:    flat_store_dword v[0:1], v6
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dword v[2:3], v5
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: madak_2_use_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x41200000
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_madak_f32 v2, v1, v2, 0x41200000
+; GFX9-NEXT:    v_mac_f32_e32 v4, v1, v3
+; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_dword v0, v4, s[2:3] offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-MAD-LABEL: madak_2_use_f32:
+; GFX10-MAD:       ; %bb.0:
+; GFX10-MAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-MAD-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-MAD-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 glc dlc
+; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-MAD-NEXT:    v_madak_f32 v2, v1, v2, 0x41200000
+; GFX10-MAD-NEXT:    v_madak_f32 v1, v1, v3, 0x41200000
+; GFX10-MAD-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX10-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[2:3] offset:4
+; GFX10-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-MAD-NEXT:    s_endpgm
+;
+; GFX11-MAD-LABEL: madak_2_use_f32:
+; GFX11-MAD:       ; %bb.0:
+; GFX11-MAD-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-MAD-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-MAD-NEXT:    global_load_b32 v2, v0, s[2:3] offset:4 glc dlc
+; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-MAD-NEXT:    global_load_b32 v3, v0, s[2:3] offset:8 glc dlc
+; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-MAD-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GFX11-MAD-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-MAD-NEXT:    v_dual_add_f32 v1, 0x41200000, v1 :: v_dual_add_f32 v2, 0x41200000, v2
+; GFX11-MAD-NEXT:    global_store_b32 v0, v2, s[0:1] dlc
+; GFX11-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[2:3] offset:4 dlc
+; GFX11-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-MAD-NEXT:    s_nop 0
+; GFX11-MAD-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-MAD-NEXT:    s_endpgm
+;
+; GFX940-FMA-LABEL: madak_2_use_f32:
+; GFX940-FMA:       ; %bb.0:
+; GFX940-FMA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-FMA-NEXT:    v_mov_b32_e32 v4, 0x41200000
+; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 sc0 sc1
+; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 sc0 sc1
+; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT:    v_fmaak_f32 v2, v1, v2, 0x41200000
+; GFX940-FMA-NEXT:    v_fmac_f32_e32 v4, v1, v3
+; GFX940-FMA-NEXT:    global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT:    global_store_dword v0, v4, s[2:3] offset:4 sc0 sc1
+; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT:    s_endpgm
+;
+; GFX10-FMA-LABEL: madak_2_use_f32:
+; GFX10-FMA:       ; %bb.0:
+; GFX10-FMA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FMA-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FMA-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 glc dlc
+; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FMA-NEXT:    v_fmaak_f32 v2, v1, v2, 0x41200000
+; GFX10-FMA-NEXT:    v_fmaak_f32 v1, v1, v3, 0x41200000
+; GFX10-FMA-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX10-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[2:3] offset:4
+; GFX10-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-FMA-NEXT:    s_endpgm
+;
+; GFX11-FMA-LABEL: madak_2_use_f32:
+; GFX11-FMA:       ; %bb.0:
+; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[2:3] offset:4 glc dlc
+; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT:    global_load_b32 v3, v0, s[2:3] offset:8 glc dlc
+; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT:    v_fmaak_f32 v2, v1, v2, 0x41200000
+; GFX11-FMA-NEXT:    v_fmaak_f32 v1, v1, v3, 0x41200000
+; GFX11-FMA-NEXT:    global_store_b32 v0, v2, s[0:1] dlc
+; GFX11-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[2:3] offset:4 dlc
+; GFX11-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FMA-NEXT:    s_nop 0
+; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 
   %in.gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
@@ -82,12 +362,111 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad
   ret void
 }
 
-; GCN-LABEL: {{^}}madak_m_inline_imm_f32:
-; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[VA:v[0-9]+]]
-; MAD: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
-; GFX10-MAD: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
-; FMA: v_fmaak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
 define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a) #0 {
+; GFX6-LABEL: madak_m_inline_imm_f32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_madak_f32 v2, 4.0, v2, 0x41200000
+; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GFX6-NEXT:    s_endpgm
+;
+; GFX8-LABEL: madak_m_inline_imm_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_madak_f32 v2, 4.0, v3, 0x41200000
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: madak_m_inline_imm_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_madak_f32 v1, 4.0, v1, 0x41200000
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-MAD-LABEL: madak_m_inline_imm_f32:
+; GFX10-MAD:       ; %bb.0:
+; GFX10-MAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-MAD-NEXT:    v_madak_f32 v1, 4.0, v1, 0x41200000
+; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-MAD-NEXT:    s_endpgm
+;
+; GFX11-MAD-LABEL: madak_m_inline_imm_f32:
+; GFX11-MAD:       ; %bb.0:
+; GFX11-MAD-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-MAD-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-MAD-NEXT:    v_mul_f32_e32 v1, 4.0, v1
+; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-MAD-NEXT:    v_add_f32_e32 v1, 0x41200000, v1
+; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-MAD-NEXT:    s_nop 0
+; GFX11-MAD-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-MAD-NEXT:    s_endpgm
+;
+; GFX940-FMA-LABEL: madak_m_inline_imm_f32:
+; GFX940-FMA:       ; %bb.0:
+; GFX940-FMA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT:    v_fmaak_f32 v1, 4.0, v1, 0x41200000
+; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
+; GFX940-FMA-NEXT:    s_endpgm
+;
+; GFX10-FMA-LABEL: madak_m_inline_imm_f32:
+; GFX10-FMA:       ; %bb.0:
+; GFX10-FMA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FMA-NEXT:    v_fmaak_f32 v1, 4.0, v1, 0x41200000
+; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-FMA-NEXT:    s_endpgm
+;
+; GFX11-FMA-LABEL: madak_m_inline_imm_f32:
+; GFX11-FMA:       ; %bb.0:
+; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT:    v_fmaak_f32 v1, 4.0, v1, 0x41200000
+; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-FMA-NEXT:    s_nop 0
+; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
   %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
@@ -102,20 +481,139 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out,
 
 ; Make sure nothing weird happens with a value that is also allowed as
 ; an inline immediate.
-
-; GCN-LABEL: {{^}}madak_inline_imm_f32:
-; GFX6:   buffer_load_dword [[VA:v[0-9]+]]
-; GFX6:   buffer_load_dword [[VB:v[0-9]+]]
-; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
-; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]]
-; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
-; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
-; GFX10PLUS: {{flat|global}}_load_{{dword|b32}} [[VA:v[0-9]+]]
-; GFX10PLUS: {{flat|global}}_load_{{dword|b32}} [[VB:v[0-9]+]]
-; MAD:   v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
-; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
-; FMA:   v_fma_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
 define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 {
+; GFX6-LABEL: madak_inline_imm_f32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, 0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b64 s[0:1], s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_mad_f32 v2, v2, v3, 4.0
+; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    s_endpgm
+;
+; GFX8-LABEL: madak_inline_imm_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v4
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT:    flat_load_dword v5, v[0:1]
+; GFX8-NEXT:    flat_load_dword v2, v[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mad_f32 v2, v5, v2, 4.0
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: madak_inline_imm_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mad_f32 v1, v1, v2, 4.0
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-MAD-LABEL: madak_inline_imm_f32:
+; GFX10-MAD:       ; %bb.0:
+; GFX10-MAD-NEXT:    s_clause 0x1
+; GFX10-MAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-MAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-MAD-NEXT:    s_clause 0x1
+; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-MAD-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-MAD-NEXT:    v_mad_f32 v1, v1, v2, 4.0
+; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-MAD-NEXT:    s_endpgm
+;
+; GFX11-MAD-LABEL: madak_inline_imm_f32:
+; GFX11-MAD:       ; %bb.0:
+; GFX11-MAD-NEXT:    s_clause 0x1
+; GFX11-MAD-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-MAD-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-MAD-NEXT:    s_clause 0x1
+; GFX11-MAD-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-MAD-NEXT:    global_load_b32 v2, v0, s[0:1]
+; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-MAD-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-MAD-NEXT:    v_add_f32_e32 v1, 4.0, v1
+; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-MAD-NEXT:    s_nop 0
+; GFX11-MAD-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-MAD-NEXT:    s_endpgm
+;
+; GFX940-FMA-LABEL: madak_inline_imm_f32:
+; GFX940-FMA:       ; %bb.0:
+; GFX940-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX940-FMA-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX940-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX940-FMA-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT:    v_fma_f32 v1, v1, v2, 4.0
+; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-FMA-NEXT:    s_endpgm
+;
+; GFX10-FMA-LABEL: madak_inline_imm_f32:
+; GFX10-FMA:       ; %bb.0:
+; GFX10-FMA-NEXT:    s_clause 0x1
+; GFX10-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-FMA-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FMA-NEXT:    s_clause 0x1
+; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-FMA-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FMA-NEXT:    v_fma_f32 v1, v1, v2, 4.0
+; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-FMA-NEXT:    s_endpgm
+;
+; GFX11-FMA-LABEL: madak_inline_imm_f32:
+; GFX11-FMA:       ; %bb.0:
+; GFX11-FMA-NEXT:    s_clause 0x1
+; GFX11-FMA-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT:    s_clause 0x1
+; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT:    v_fma_f32 v1, v1, v2, 4.0
+; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-FMA-NEXT:    s_nop 0
+; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
   %in.b.gep = getelementptr float, ptr addrspace(1) %in.b, i32 %tid
@@ -131,18 +629,125 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p
 }
 
 ; We can't use an SGPR when forming madak
-; GCN-LABEL: {{^}}s_v_madak_f32:
-; GCN-DAG:      s_load_{{dword|b32}} [[SB:s[0-9]+]]
-; GFX6_8_9-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GCN-DAG:      {{buffer|flat|global}}_load_{{dword|b32}}{{(_addtid)?}} [[VA:v[0-9]+]]
-; GCN-NOT:      v_madak_f32
-; GFX6_8_9:     v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
-; GFX10-MAD: v_madak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
-; GFX10PLUS-FMA: v_fmaak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
-; GFX940-FMA:   v_fmac_f32_e32 v{{[0-9]+}}, [[SB]], [[VA]]
-; GFX11-MAD:    v_mul_f32_e32 [[VMUL:v[0-9]+]], [[SB]], [[VA]]
-; GFX11-MAD:    v_add_f32_e32 {{v[0-9]+}}, 0x41200000, [[VMUL]]
 define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, float %b) #0 {
+; GFX6-LABEL: s_v_madak_f32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX6-NEXT:    s_load_dword s8, s[0:1], 0xd
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, 0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b64 s[0:1], s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; GFX6-NEXT:    v_mov_b32_e32 v3, 0x41200000
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_mac_f32_e32 v3, s8, v2
+; GFX6-NEXT:    buffer_store_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    s_endpgm
+;
+; GFX8-LABEL: s_v_madak_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x41200000
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mac_f32_e32 v2, s0, v3
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: s_v_madak_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x34
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x41200000
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mac_f32_e32 v2, s2, v1
+; GFX9-NEXT:    global_store_dword v0, v2, s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-MAD-LABEL: s_v_madak_f32:
+; GFX10-MAD:       ; %bb.0:
+; GFX10-MAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-MAD-NEXT:    s_load_dword s0, s[0:1], 0x34
+; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-MAD-NEXT:    v_madak_f32 v1, s0, v1, 0x41200000
+; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-MAD-NEXT:    s_endpgm
+;
+; GFX11-MAD-LABEL: s_v_madak_f32:
+; GFX11-MAD:       ; %bb.0:
+; GFX11-MAD-NEXT:    s_clause 0x1
+; GFX11-MAD-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-MAD-NEXT:    s_load_b32 s0, s[0:1], 0x34
+; GFX11-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-MAD-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-MAD-NEXT:    v_mul_f32_e32 v1, s0, v1
+; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-MAD-NEXT:    v_add_f32_e32 v1, 0x41200000, v1
+; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-MAD-NEXT:    s_nop 0
+; GFX11-MAD-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-MAD-NEXT:    s_endpgm
+;
+; GFX940-FMA-LABEL: s_v_madak_f32:
+; GFX940-FMA:       ; %bb.0:
+; GFX940-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX940-FMA-NEXT:    s_load_dword s2, s[0:1], 0x34
+; GFX940-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-FMA-NEXT:    v_mov_b32_e32 v2, 0x41200000
+; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT:    v_fmac_f32_e32 v2, s2, v1
+; GFX940-FMA-NEXT:    global_store_dword v0, v2, s[4:5] sc0 sc1
+; GFX940-FMA-NEXT:    s_endpgm
+;
+; GFX10-FMA-LABEL: s_v_madak_f32:
+; GFX10-FMA:       ; %bb.0:
+; GFX10-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FMA-NEXT:    s_load_dword s0, s[0:1], 0x34
+; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FMA-NEXT:    v_fmaak_f32 v1, s0, v1, 0x41200000
+; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-FMA-NEXT:    s_endpgm
+;
+; GFX11-FMA-LABEL: s_v_madak_f32:
+; GFX11-FMA:       ; %bb.0:
+; GFX11-FMA-NEXT:    s_clause 0x1
+; GFX11-FMA-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT:    s_load_b32 s0, s[0:1], 0x34
+; GFX11-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT:    v_fmaak_f32 v1, s0, v1, 0x41200000
+; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-FMA-NEXT:    s_nop 0
+; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
   %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
@@ -155,18 +760,136 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr
   ret void
 }
 
-; GCN-LABEL: @v_s_madak_f32
-; GCN-DAG:       s_load_{{dword|b32}} [[SB:s[0-9]+]]
-; GFX6_8_9-DAG:  v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GCN-DAG:       {{buffer|flat|global}}_load_{{dword|b32}}{{(_addtid)?}} [[VA:v[0-9]+]]
-; GFX6_8_9-NOT:  v_madak_f32
-; GFX6_8_9:      v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
-; GFX10-MAD: v_madak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
-; GFX10PLUS-FMA: v_fmaak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
-; GFX940-FMA:    v_fmac_f32_e32 v{{[0-9]+}}, [[SB]], [[VA]]
-; GFX11-MAD:     v_mul_f32_e32 [[VMUL:v[0-9]+]], [[SB]], [[VA]]
-; GFX11-MAD:     v_add_f32_e32 {{v[0-9]+}}, 0x41200000, [[VMUL]]
 define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a, ptr addrspace(1) noalias %in.b) #0 {
+; GFX6-LABEL: v_s_madak_f32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    s_load_dword s2, s[0:1], 0xb
+; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GFX6-NEXT:    v_mov_b32_e32 v3, 0x41200000
+; GFX6-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_mac_f32_e32 v3, s2, v2
+; GFX6-NEXT:    buffer_store_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    s_endpgm
+;
+; GFX8-LABEL: v_s_madak_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x2c
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x41200000
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mac_f32_e32 v2, s0, v3
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: v_s_madak_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x41200000
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mac_f32_e32 v2, s4, v1
+; GFX9-NEXT:    global_store_dword v0, v2, s[2:3]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-MAD-LABEL: v_s_madak_f32:
+; GFX10-MAD:       ; %bb.0:
+; GFX10-MAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-MAD-NEXT:    s_clause 0x1
+; GFX10-MAD-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; GFX10-MAD-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-MAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-MAD-NEXT:    v_madak_f32 v1, s4, v1, 0x41200000
+; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-MAD-NEXT:    s_endpgm
+;
+; GFX11-MAD-LABEL: v_s_madak_f32:
+; GFX11-MAD:       ; %bb.0:
+; GFX11-MAD-NEXT:    s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-MAD-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-MAD-NEXT:    s_clause 0x1
+; GFX11-MAD-NEXT:    s_load_b32 s2, s[0:1], 0x2c
+; GFX11-MAD-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-MAD-NEXT:    v_mul_f32_e32 v1, s2, v1
+; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-MAD-NEXT:    v_add_f32_e32 v1, 0x41200000, v1
+; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-MAD-NEXT:    s_nop 0
+; GFX11-MAD-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-MAD-NEXT:    s_endpgm
+;
+; GFX940-FMA-LABEL: v_s_madak_f32:
+; GFX940-FMA:       ; %bb.0:
+; GFX940-FMA-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX940-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-FMA-NEXT:    v_mov_b32_e32 v2, 0x41200000
+; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX940-FMA-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; GFX940-FMA-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-FMA-NEXT:    v_fmac_f32_e32 v2, s4, v1
+; GFX940-FMA-NEXT:    global_store_dword v0, v2, s[2:3] sc0 sc1
+; GFX940-FMA-NEXT:    s_endpgm
+;
+; GFX10-FMA-LABEL: v_s_madak_f32:
+; GFX10-FMA:       ; %bb.0:
+; GFX10-FMA-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-FMA-NEXT:    s_clause 0x1
+; GFX10-FMA-NEXT:    s_load_dword s4, s[0:1], 0x2c
+; GFX10-FMA-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-FMA-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-FMA-NEXT:    v_fmaak_f32 v1, s4, v1, 0x41200000
+; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX10-FMA-NEXT:    s_endpgm
+;
+; GFX11-FMA-LABEL: v_s_madak_f32:
+; GFX11-FMA:       ; %bb.0:
+; GFX11-FMA-NEXT:    s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-FMA-NEXT:    s_clause 0x1
+; GFX11-FMA-NEXT:    s_load_b32 s2, s[0:1], 0x2c
+; GFX11-FMA-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FMA-NEXT:    v_fmaak_f32 v1, s2, v1, 0x41200000
+; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-FMA-NEXT:    s_nop 0
+; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.b.gep = getelementptr float, ptr addrspace(1) %in.b, i32 %tid
   %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
@@ -179,34 +902,241 @@ define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a
   ret void
 }
 
-; GCN-LABEL: {{^}}s_s_madak_f32:
-; GCN-NOT: v_madak_f32
-; GFX8_9:  v_mac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GFX10-MAD: v_mac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-; GFX10PLUS-FMA: v_fmac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-; GFX940-FMA: v_fmac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GFX11-MAD: v_mul_f32_e64 [[VMUL:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}}
-; GFX11-MAD: v_dual_mov_b32 {{v[0-9]+}}, 0 :: v_dual_add_f32 {{v[0-9]+}}, 0x41200000, [[VMUL]]
 define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float %b) #0 {
+; GFX6-LABEL: s_s_madak_f32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX6-NEXT:    v_mov_b32_e32 v0, 0x41200000
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    s_mov_b32 s4, s0
+; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    v_mac_f32_e32 v0, s2, v1
+; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT:    s_endpgm
+;
+; GFX8-LABEL: s_s_madak_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0x41200000
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v0, s3
+; GFX8-NEXT:    v_mac_f32_e32 v2, s2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: s_s_madak_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NEXT:    v_mac_f32_e32 v1, s2, v2
+; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-MAD-LABEL: s_s_madak_f32:
+; GFX10-MAD:       ; %bb.0:
+; GFX10-MAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-MAD-NEXT:    v_mov_b32_e32 v0, 0x41200000
+; GFX10-MAD-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-MAD-NEXT:    v_mac_f32_e64 v0, s2, s3
+; GFX10-MAD-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-MAD-NEXT:    s_endpgm
+;
+; GFX11-MAD-LABEL: s_s_madak_f32:
+; GFX11-MAD:       ; %bb.0:
+; GFX11-MAD-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-MAD-NEXT:    v_mul_f32_e64 v0, s2, s3
+; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-MAD-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, 0x41200000, v0
+; GFX11-MAD-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-MAD-NEXT:    s_nop 0
+; GFX11-MAD-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-MAD-NEXT:    s_endpgm
+;
+; GFX940-FMA-LABEL: s_s_madak_f32:
+; GFX940-FMA:       ; %bb.0:
+; GFX940-FMA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-FMA-NEXT:    v_mov_b32_e32 v1, 0x41200000
+; GFX940-FMA-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-FMA-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-FMA-NEXT:    v_fmac_f32_e32 v1, s2, v2
+; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[0:1] sc0 sc1
+; GFX940-FMA-NEXT:    s_endpgm
+;
+; GFX10-FMA-LABEL: s_s_madak_f32:
+; GFX10-FMA:       ; %bb.0:
+; GFX10-FMA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FMA-NEXT:    v_mov_b32_e32 v0, 0x41200000
+; GFX10-FMA-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FMA-NEXT:    v_fmac_f32_e64 v0, s2, s3
+; GFX10-FMA-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10-FMA-NEXT:    s_endpgm
+;
+; GFX11-FMA-LABEL: s_s_madak_f32:
+; GFX11-FMA:       ; %bb.0:
+; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FMA-NEXT:    v_dual_mov_b32 v0, 0x41200000 :: v_dual_mov_b32 v1, 0
+; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FMA-NEXT:    v_fmac_f32_e64 v0, s2, s3
+; GFX11-FMA-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-FMA-NEXT:    s_nop 0
+; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT:    s_endpgm
   %mul = fmul float %a, %b
   %madak = fadd float %mul, 10.0
   store float %madak, ptr addrspace(1) %out, align 4
   ret void
 }
 
-; GCN-LABEL: {{^}}no_madak_src0_modifier_f32:
-; GFX6:      buffer_load_dword [[VA:v[0-9]+]]
-; GFX6:      buffer_load_dword [[VB:v[0-9]+]]
-; GFX8PLUS:  {{flat|global}}_load_{{dword|b32}} [[VB:v[0-9]+]]
-; GFX8PLUS:  {{flat|global}}_load_{{dword|b32}} [[VA:v[0-9]+]]
-; GFX6_8_9:  v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
-; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000
-; GFX10PLUS-FMA: v_fma_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000
-; GFX940-FMA: v_fma_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{s[0-9]+}}
-; GFX11-MAD: v_mul_f32_e64 [[VMUL:v[0-9]+]], |{{v[0-9]+}}|, {{v[0-9]+}}
-; GFX11-MAD: v_add_f32_e32 {{v[0-9]+}}, 0x41200000, [[VMUL]]
-; GCN:       s_endpgm
 define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 {
+; GFX6-LABEL: no_madak_src0_modifier_f32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, 0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b64 s[0:1], s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
+; GFX6-NEXT:    s_mov_b32 s0, 0x41200000
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_mad_f32 v2, |v2|, v3, s0
+; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    s_endpgm
+;
+; GFX8-LABEL: no_madak_src0_modifier_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v4
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT:    flat_load_dword v5, v[0:1]
+; GFX8-NEXT:    flat_load_dword v2, v[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
+; GFX8-NEXT:    s_mov_b32 s0, 0x41200000
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mad_f32 v2, |v5|, v2, s0
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: no_madak_src0_modifier_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_mov_b32 s0, 0x41200000
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mad_f32 v1, |v1|, v2, s0
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-MAD-LABEL: no_madak_src0_modifier_f32:
+; GFX10-MAD:       ; %bb.0:
+; GFX10-MAD-NEXT:    s_clause 0x1
+; GFX10-MAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-MAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-MAD-NEXT:    s_clause 0x1
+; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-MAD-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-MAD-NEXT:    v_mad_f32 v1, |v1|, v2, 0x41200000
+; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-MAD-NEXT:    s_endpgm
+;
+; GFX11-MAD-LABEL: no_madak_src0_modifier_f32:
+; GFX11-MAD:       ; %bb.0:
+; GFX11-MAD-NEXT:    s_clause 0x1
+; GFX11-MAD-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-MAD-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-MAD-NEXT:    s_clause 0x1
+; GFX11-MAD-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-MAD-NEXT:    global_load_b32 v2, v0, s[0:1]
+; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-MAD-NEXT:    v_mul_f32_e64 v1, |v1|, v2
+; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-MAD-NEXT:    v_add_f32_e32 v1, 0x41200000, v1
+; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-MAD-NEXT:    s_nop 0
+; GFX11-MAD-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-MAD-NEXT:    s_endpgm
+;
+; GFX940-FMA-LABEL: no_madak_src0_modifier_f32:
+; GFX940-FMA:       ; %bb.0:
+; GFX940-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX940-FMA-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX940-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-FMA-NEXT:    s_mov_b32 s0, 0x41200000
+; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX940-FMA-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT:    v_fma_f32 v1, |v1|, v2, s0
+; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-FMA-NEXT:    s_endpgm
+;
+; GFX10-FMA-LABEL: no_madak_src0_modifier_f32:
+; GFX10-FMA:       ; %bb.0:
+; GFX10-FMA-NEXT:    s_clause 0x1
+; GFX10-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-FMA-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FMA-NEXT:    s_clause 0x1
+; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-FMA-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FMA-NEXT:    v_fma_f32 v1, |v1|, v2, 0x41200000
+; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-FMA-NEXT:    s_endpgm
+;
+; GFX11-FMA-LABEL: no_madak_src0_modifier_f32:
+; GFX11-FMA:       ; %bb.0:
+; GFX11-FMA-NEXT:    s_clause 0x1
+; GFX11-FMA-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT:    s_clause 0x1
+; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT:    v_fma_f32 v1, |v1|, v2, 0x41200000
+; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-FMA-NEXT:    s_nop 0
+; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
   %in.b.gep = getelementptr float, ptr addrspace(1) %in.b, i32 %tid
@@ -223,19 +1153,143 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias %
   ret void
 }
 
-; GCN-LABEL: {{^}}no_madak_src1_modifier_f32:
-; GFX6:      buffer_load_dword [[VA:v[0-9]+]]
-; GFX6:      buffer_load_dword [[VB:v[0-9]+]]
-; GFX8PLUS:  {{flat|global}}_load_{{dword|b32}} [[VB:v[0-9]+]]
-; GFX8PLUS:  {{flat|global}}_load_{{dword|b32}} [[VA:v[0-9]+]]
-; GFX6_8_9:  v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}}
-; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000
-; GFX10PLUS-FMA: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000
-; GFX940-FMA: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{s[0-9]+}}
-; GFX11-MAD: v_mul_f32_e64 [[VMUL:v[0-9]+]], {{v[0-9]+}}, |{{v[0-9]+}}|
-; GFX11-MAD: v_add_f32_e32 {{v[0-9]+}}, 0x41200000, [[VMUL]]
-; GCN:       s_endpgm
 define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 {
+; GFX6-LABEL: no_madak_src1_modifier_f32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, 0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b64 s[0:1], s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
+; GFX6-NEXT:    s_mov_b32 s0, 0x41200000
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_mad_f32 v2, v2, |v3|, s0
+; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    s_endpgm
+;
+; GFX8-LABEL: no_madak_src1_modifier_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v4
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT:    flat_load_dword v5, v[0:1]
+; GFX8-NEXT:    flat_load_dword v2, v[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
+; GFX8-NEXT:    s_mov_b32 s0, 0x41200000
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mad_f32 v2, v5, |v2|, s0
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: no_madak_src1_modifier_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_mov_b32 s0, 0x41200000
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mad_f32 v1, v1, |v2|, s0
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-MAD-LABEL: no_madak_src1_modifier_f32:
+; GFX10-MAD:       ; %bb.0:
+; GFX10-MAD-NEXT:    s_clause 0x1
+; GFX10-MAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-MAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-MAD-NEXT:    s_clause 0x1
+; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-MAD-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-MAD-NEXT:    v_mad_f32 v1, v1, |v2|, 0x41200000
+; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-MAD-NEXT:    s_endpgm
+;
+; GFX11-MAD-LABEL: no_madak_src1_modifier_f32:
+; GFX11-MAD:       ; %bb.0:
+; GFX11-MAD-NEXT:    s_clause 0x1
+; GFX11-MAD-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-MAD-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-MAD-NEXT:    s_clause 0x1
+; GFX11-MAD-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-MAD-NEXT:    global_load_b32 v2, v0, s[0:1]
+; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-MAD-NEXT:    v_mul_f32_e64 v1, v1, |v2|
+; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-MAD-NEXT:    v_add_f32_e32 v1, 0x41200000, v1
+; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-MAD-NEXT:    s_nop 0
+; GFX11-MAD-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-MAD-NEXT:    s_endpgm
+;
+; GFX940-FMA-LABEL: no_madak_src1_modifier_f32:
+; GFX940-FMA:       ; %bb.0:
+; GFX940-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX940-FMA-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX940-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-FMA-NEXT:    s_mov_b32 s0, 0x41200000
+; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX940-FMA-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT:    v_fma_f32 v1, v1, |v2|, s0
+; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-FMA-NEXT:    s_endpgm
+;
+; GFX10-FMA-LABEL: no_madak_src1_modifier_f32:
+; GFX10-FMA:       ; %bb.0:
+; GFX10-FMA-NEXT:    s_clause 0x1
+; GFX10-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-FMA-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FMA-NEXT:    s_clause 0x1
+; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-FMA-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FMA-NEXT:    v_fma_f32 v1, v1, |v2|, 0x41200000
+; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-FMA-NEXT:    s_endpgm
+;
+; GFX11-FMA-LABEL: no_madak_src1_modifier_f32:
+; GFX11-FMA:       ; %bb.0:
+; GFX11-FMA-NEXT:    s_clause 0x1
+; GFX11-FMA-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT:    s_clause 0x1
+; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT:    v_fma_f32 v1, v1, |v2|, 0x41200000
+; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-FMA-NEXT:    s_nop 0
+; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
   %in.b.gep = getelementptr float, ptr addrspace(1) %in.b, i32 %tid
@@ -255,22 +1309,193 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias %
 ; SIFoldOperands should not fold the SGPR copy into the instruction before GFX10
 ; because the implicit immediate already uses the constant bus.
 ; On GFX10+ we can use two scalar operands.
-; GCN-LABEL: {{^}}madak_constant_bus_violation:
-; GCN:       {{buffer|flat|global}}_load_{{dword|b32}} [[VGPR:v[0-9]+]]
-; GCN:       s_load_{{dword|b32}} [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}}
-; MAD:       v_mov_b32_e32 [[MADAK:v[0-9]+]], 0x42280000
-; MAD:       v_mac_f32_e64 [[MADAK]], [[SGPR0]], 0.5
-; GFX10-MAD: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
-; GFX10PLUS-FMA: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
-; GFX10-MAD: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
-; GFX10PLUS-FMA: v_fmaak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
-; GFX940-FMA: v_fmac_f32_e64 [[MADAK:v[0-9]+]], [[SGPR0]], 0.5
-; GFX11-MAD: v_mul_f32_e64 [[VMUL:v[0-9]+]], [[SGPR0]], 0.5
-; GFX11-MAD: v_add_f32_e32 [[MADAK:v[0-9]+]], 0x42280000, [[VMUL]]
-; GCN:       v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]]
-; GFX6:      buffer_store_dword [[MUL]]
-; GFX8PLUS:  {{flat|global}}_store_{{dword|b32}} v[{{[0-9:]+}}], [[MUL]]
 define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], float %sgpr0, float %sgpr1) #0 {
+; GFX6-LABEL: madak_constant_bus_violation:
+; GFX6:       ; %bb.0: ; %bb
+; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x9
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX6-NEXT:    s_cbranch_scc1 .LBB9_2
+; GFX6-NEXT:  ; %bb.1: ; %bb3
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    v_mov_b32_e32 v0, 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:  .LBB9_2: ; %bb4
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, -1
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x12
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0x42280000
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    v_mac_f32_e64 v1, s0, 0.5
+; GFX6-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_endpgm
+;
+; GFX8-LABEL: madak_constant_bus_violation:
+; GFX8:       ; %bb.0: ; %bb
+; GFX8-NEXT:    s_load_dword s2, s[0:1], 0x24
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX8-NEXT:    s_cbranch_scc1 .LBB9_2
+; GFX8-NEXT:  ; %bb.1: ; %bb3
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    flat_store_dword v[0:1], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:  .LBB9_2: ; %bb4
+; GFX8-NEXT:    flat_load_dword v0, v[0:1] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x48
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x42280000
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mac_f32_e64 v1, s0, 0.5
+; GFX8-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX8-NEXT:    flat_store_dword v[0:1], v0
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: madak_constant_bus_violation:
+; GFX9:       ; %bb.0: ; %bb
+; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x24
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX9-NEXT:    s_cbranch_scc1 .LBB9_2
+; GFX9-NEXT:  ; %bb.1: ; %bb3
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:  .LBB9_2: ; %bb4
+; GFX9-NEXT:    global_load_dword v0, v[0:1], off glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x48
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x42280000
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mac_f32_e64 v1, s0, 0.5
+; GFX9-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX9-NEXT:    global_store_dword v[0:1], v0, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-MAD-LABEL: madak_constant_bus_violation:
+; GFX10-MAD:       ; %bb.0: ; %bb
+; GFX10-MAD-NEXT:    s_load_dword s2, s[0:1], 0x24
+; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-MAD-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX10-MAD-NEXT:    s_cbranch_scc1 .LBB9_2
+; GFX10-MAD-NEXT:  ; %bb.1: ; %bb3
+; GFX10-MAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-MAD-NEXT:    global_store_dword v[0:1], v0, off
+; GFX10-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-MAD-NEXT:  .LBB9_2: ; %bb4
+; GFX10-MAD-NEXT:    global_load_dword v0, v[0:1], off glc dlc
+; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-MAD-NEXT:    s_load_dword s0, s[0:1], 0x48
+; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-MAD-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-MAD-NEXT:    v_madak_f32 v1, 0.5, v1, 0x42280000
+; GFX10-MAD-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX10-MAD-NEXT:    global_store_dword v[0:1], v0, off
+; GFX10-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-MAD-NEXT:    s_endpgm
+;
+; GFX11-MAD-LABEL: madak_constant_bus_violation:
+; GFX11-MAD:       ; %bb.0: ; %bb
+; GFX11-MAD-NEXT:    s_load_b32 s2, s[0:1], 0x24
+; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-MAD-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-MAD-NEXT:    s_cbranch_scc1 .LBB9_2
+; GFX11-MAD-NEXT:  ; %bb.1: ; %bb3
+; GFX11-MAD-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-MAD-NEXT:    global_store_b32 v[0:1], v0, off dlc
+; GFX11-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-MAD-NEXT:  .LBB9_2: ; %bb4
+; GFX11-MAD-NEXT:    global_load_b32 v0, v[0:1], off glc dlc
+; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-MAD-NEXT:    s_load_b32 s0, s[0:1], 0x48
+; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-MAD-NEXT:    v_mul_f32_e64 v1, s0, 0.5
+; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-MAD-NEXT:    v_add_f32_e32 v1, 0x42280000, v1
+; GFX11-MAD-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX11-MAD-NEXT:    global_store_b32 v[0:1], v0, off dlc
+; GFX11-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-MAD-NEXT:    s_nop 0
+; GFX11-MAD-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-MAD-NEXT:    s_endpgm
+;
+; GFX940-FMA-LABEL: madak_constant_bus_violation:
+; GFX940-FMA:       ; %bb.0: ; %bb
+; GFX940-FMA-NEXT:    s_load_dword s2, s[0:1], 0x24
+; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-FMA-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX940-FMA-NEXT:    s_cbranch_scc1 .LBB9_2
+; GFX940-FMA-NEXT:  ; %bb.1: ; %bb3
+; GFX940-FMA-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-FMA-NEXT:    global_store_dword v[0:1], v0, off sc0 sc1
+; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT:  .LBB9_2: ; %bb4
+; GFX940-FMA-NEXT:    global_load_dword v0, v[0:1], off sc0 sc1
+; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT:    s_load_dword s0, s[0:1], 0x48
+; GFX940-FMA-NEXT:    v_mov_b32_e32 v1, 0x42280000
+; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-FMA-NEXT:    v_fmac_f32_e64 v1, s0, 0.5
+; GFX940-FMA-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX940-FMA-NEXT:    global_store_dword v[0:1], v0, off sc0 sc1
+; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT:    s_endpgm
+;
+; GFX10-FMA-LABEL: madak_constant_bus_violation:
+; GFX10-FMA:       ; %bb.0: ; %bb
+; GFX10-FMA-NEXT:    s_load_dword s2, s[0:1], 0x24
+; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FMA-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX10-FMA-NEXT:    s_cbranch_scc1 .LBB9_2
+; GFX10-FMA-NEXT:  ; %bb.1: ; %bb3
+; GFX10-FMA-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-FMA-NEXT:    global_store_dword v[0:1], v0, off
+; GFX10-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-FMA-NEXT:  .LBB9_2: ; %bb4
+; GFX10-FMA-NEXT:    global_load_dword v0, v[0:1], off glc dlc
+; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FMA-NEXT:    s_load_dword s0, s[0:1], 0x48
+; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FMA-NEXT:    v_mov_b32_e32 v1, s0
+; GFX10-FMA-NEXT:    v_fmaak_f32 v1, 0.5, v1, 0x42280000
+; GFX10-FMA-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX10-FMA-NEXT:    global_store_dword v[0:1], v0, off
+; GFX10-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-FMA-NEXT:    s_endpgm
+;
+; GFX11-FMA-LABEL: madak_constant_bus_violation:
+; GFX11-FMA:       ; %bb.0: ; %bb
+; GFX11-FMA-NEXT:    s_load_b32 s2, s[0:1], 0x24
+; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11-FMA-NEXT:    s_cbranch_scc1 .LBB9_2
+; GFX11-FMA-NEXT:  ; %bb.1: ; %bb3
+; GFX11-FMA-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-FMA-NEXT:    global_store_b32 v[0:1], v0, off dlc
+; GFX11-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FMA-NEXT:  .LBB9_2: ; %bb4
+; GFX11-FMA-NEXT:    global_load_b32 v0, v[0:1], off glc dlc
+; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT:    s_load_b32 s0, s[0:1], 0x48
+; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT:    v_mov_b32_e32 v1, s0
+; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FMA-NEXT:    v_fmaak_f32 v1, 0.5, v1, 0x42280000
+; GFX11-FMA-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX11-FMA-NEXT:    global_store_b32 v[0:1], v0, off dlc
+; GFX11-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FMA-NEXT:    s_nop 0
+; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT:    s_endpgm
 bb:
   %tmp = icmp eq i32 %arg1, 0
   br i1 %tmp, label %bb3, label %bb4

>From 00d577c0dd3a1bf65ba4815f920da158bfb38674 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Tue, 14 Nov 2023 14:52:41 +0000
Subject: [PATCH 2/2] [AMDGPU] Allow folding to FMAAK with SGPR and immediate
 operand on GFX10+

Allow foldImmediate to create instructions like:

  v_fmaak_f32 v0, s0, v0, 0x42000000

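This instruction has two "scalar values" that each occupy the constant
bus: the SGPR s0 and the literal 0x42000000. On GFX10+ this is allowed,
because an instruction may use two scalar operands there. This fold was
originally implemented before the compiler supported GFX10, when all
ASICs were limited to one scalar value per instruction.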
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        | 68 ++++++++++---------
 .../test/CodeGen/AMDGPU/code-size-estimate.ll | 12 ++--
 llvm/test/CodeGen/AMDGPU/fma.f16.ll           |  8 +--
 llvm/test/CodeGen/AMDGPU/madak.ll             | 24 +++----
 4 files changed, 57 insertions(+), 55 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 027b695c3bb1a74..043dfa8ab50116f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3497,43 +3497,45 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
 
     // Added part is the constant: Use v_madak_{f16, f32}.
     if (Src2->isReg() && Src2->getReg() == Reg) {
-      // Not allowed to use constant bus for another operand.
-      // We can however allow an inline immediate as src0.
-      bool Src0Inlined = false;
-      if (Src0->isReg()) {
-        // Try to inline constant if possible.
-        // If the Def moves immediate and the use is single
-        // We are saving VGPR here.
-        MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
-        if (Def && Def->isMoveImmediate() &&
-          isInlineConstant(Def->getOperand(1)) &&
-          MRI->hasOneUse(Src0->getReg())) {
-          Src0->ChangeToImmediate(Def->getOperand(1).getImm());
-          Src0Inlined = true;
-        } else if ((Src0->getReg().isPhysical() &&
-                    (ST.getConstantBusLimit(Opc) <= 1 &&
-                     RI.isSGPRClass(RI.getPhysRegBaseClass(Src0->getReg())))) ||
-                   (Src0->getReg().isVirtual() &&
-                    (ST.getConstantBusLimit(Opc) <= 1 &&
-                     RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))))
-          return false;
+      if (ST.getConstantBusLimit(Opc) < 2) {
+        // Not allowed to use constant bus for another operand.
+        // We can however allow an inline immediate as src0.
+        bool Src0Inlined = false;
+        if (Src0->isReg()) {
+          // Try to inline constant if possible.
+          // If the Def moves immediate and the use is single
+          // We are saving VGPR here.
+          MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
+          if (Def && Def->isMoveImmediate() &&
+              isInlineConstant(Def->getOperand(1)) &&
+              MRI->hasOneUse(Src0->getReg())) {
+            Src0->ChangeToImmediate(Def->getOperand(1).getImm());
+            Src0Inlined = true;
+          } else if ((Src0->getReg().isPhysical() &&
+                      (ST.getConstantBusLimit(Opc) <= 1 &&
+                       RI.isSGPRClass(
+                           RI.getPhysRegBaseClass(Src0->getReg())))) ||
+                     (Src0->getReg().isVirtual() &&
+                      (ST.getConstantBusLimit(Opc) <= 1 &&
+                       RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))))
+            return false;
           // VGPR is okay as Src0 - fallthrough
-      }
+        }
 
-      if (Src1->isReg() && !Src0Inlined ) {
-        // We have one slot for inlinable constant so far - try to fill it
-        MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
-        if (Def && Def->isMoveImmediate() &&
-            isInlineConstant(Def->getOperand(1)) &&
-            MRI->hasOneUse(Src1->getReg()) &&
-            commuteInstruction(UseMI)) {
+        if (Src1->isReg() && !Src0Inlined) {
+          // We have one slot for inlinable constant so far - try to fill it
+          MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
+          if (Def && Def->isMoveImmediate() &&
+              isInlineConstant(Def->getOperand(1)) &&
+              MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI)) {
             Src0->ChangeToImmediate(Def->getOperand(1).getImm());
-        } else if ((Src1->getReg().isPhysical() &&
-                    RI.isSGPRClass(RI.getPhysRegBaseClass(Src1->getReg()))) ||
-                   (Src1->getReg().isVirtual() &&
-                    RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
-          return false;
+          } else if ((Src1->getReg().isPhysical() &&
+                      RI.isSGPRClass(RI.getPhysRegBaseClass(Src1->getReg()))) ||
+                     (Src1->getReg().isVirtual() &&
+                      RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
+            return false;
           // VGPR is okay as Src1 - fallthrough
+        }
       }
 
       unsigned NewOpc =
diff --git a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
index 22823c8b6b0a8d5..b1222516786f1c3 100644
--- a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
@@ -323,15 +323,15 @@ define amdgpu_ps float @s_fmaak_f32(float inreg %x, float inreg %y) {
 ;
 ; GFX10-LABEL: s_fmaak_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0x43800000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x00,0x80,0x43]
-; GFX10-NEXT:    v_fmac_f32_e64 v0, s0, s1 ; encoding: [0x00,0x00,0x2b,0xd5,0x00,0x02,0x00,0x00]
+; GFX10-NEXT:    v_mov_b32_e32 v0, s1 ; encoding: [0x01,0x02,0x00,0x7e]
+; GFX10-NEXT:    v_fmaak_f32 v0, s0, v0, 0x43800000 ; encoding: [0x00,0x00,0x00,0x5a,0x00,0x00,0x80,0x43]
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX1100-LABEL: s_fmaak_f32:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    v_mov_b32_e32 v0, 0x43800000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x00,0x80,0x43]
+; GFX1100-NEXT:    v_mov_b32_e32 v0, s1 ; encoding: [0x01,0x02,0x00,0x7e]
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
-; GFX1100-NEXT:    v_fmac_f32_e64 v0, s0, s1 ; encoding: [0x00,0x00,0x2b,0xd5,0x00,0x02,0x00,0x00]
+; GFX1100-NEXT:    v_fmaak_f32 v0, s0, v0, 0x43800000 ; encoding: [0x00,0x00,0x00,0x5a,0x00,0x00,0x80,0x43]
 ; GFX1100-NEXT:    ; return to shader part epilog
 ;
 ; GFX1150-LABEL: s_fmaak_f32:
@@ -345,6 +345,6 @@ define amdgpu_ps float @s_fmaak_f32(float inreg %x, float inreg %y) {
 }
 
 ; GFX9: codeLenInByte = 20
-; GFX10: codeLenInByte = 16
-; GFX1100: codeLenInByte = 20
+; GFX10: codeLenInByte = 12
+; GFX1100: codeLenInByte = 16
 ; GFX1150: codeLenInByte = 16
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
index 4ed3abff0ad8515..7894f6bc6797d66 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
@@ -146,10 +146,10 @@ define i32 @test_D139469_f16(half %arg) {
 ; GFX10-SDAG-LABEL: test_D139469_f16:
 ; GFX10-SDAG:       ; %bb.0: ; %bb
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0x211e
-; GFX10-SDAG-NEXT:    v_mul_f16_e32 v2, 0x291e, v0
-; GFX10-SDAG-NEXT:    v_fmac_f16_e32 v1, 0x291e, v0
-; GFX10-SDAG-NEXT:    v_min_f16_e32 v0, v2, v1
+; GFX10-SDAG-NEXT:    s_movk_i32 s4, 0x291e
+; GFX10-SDAG-NEXT:    v_mul_f16_e32 v1, 0x291e, v0
+; GFX10-SDAG-NEXT:    v_fmaak_f16 v0, s4, v0, 0x211e
+; GFX10-SDAG-NEXT:    v_min_f16_e32 v0, v1, v0
 ; GFX10-SDAG-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll
index a43e23e4aeeada1..310328ddb941856 100644
--- a/llvm/test/CodeGen/AMDGPU/madak.ll
+++ b/llvm/test/CodeGen/AMDGPU/madak.ll
@@ -943,10 +943,10 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float
 ; GFX10-MAD-LABEL: s_s_madak_f32:
 ; GFX10-MAD:       ; %bb.0:
 ; GFX10-MAD-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-MAD-NEXT:    v_mov_b32_e32 v0, 0x41200000
 ; GFX10-MAD-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-MAD-NEXT:    v_mac_f32_e64 v0, s2, s3
+; GFX10-MAD-NEXT:    v_mov_b32_e32 v0, s3
+; GFX10-MAD-NEXT:    v_madak_f32 v0, s2, v0, 0x41200000
 ; GFX10-MAD-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-MAD-NEXT:    s_endpgm
 ;
@@ -976,20 +976,20 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float
 ; GFX10-FMA-LABEL: s_s_madak_f32:
 ; GFX10-FMA:       ; %bb.0:
 ; GFX10-FMA-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-FMA-NEXT:    v_mov_b32_e32 v0, 0x41200000
 ; GFX10-FMA-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FMA-NEXT:    v_fmac_f32_e64 v0, s2, s3
+; GFX10-FMA-NEXT:    v_mov_b32_e32 v0, s3
+; GFX10-FMA-NEXT:    v_fmaak_f32 v0, s2, v0, 0x41200000
 ; GFX10-FMA-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-FMA-NEXT:    s_endpgm
 ;
 ; GFX11-FMA-LABEL: s_s_madak_f32:
 ; GFX11-FMA:       ; %bb.0:
 ; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-FMA-NEXT:    v_dual_mov_b32 v0, 0x41200000 :: v_dual_mov_b32 v1, 0
 ; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
 ; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FMA-NEXT:    v_fmac_f32_e64 v0, s2, s3
+; GFX11-FMA-NEXT:    v_fmaak_f32 v0, s2, v0, 0x41200000
 ; GFX11-FMA-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-FMA-NEXT:    s_nop 0
 ; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1395,9 +1395,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl
 ; GFX10-MAD-NEXT:    global_load_dword v0, v[0:1], off glc dlc
 ; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-MAD-NEXT:    s_load_dword s0, s[0:1], 0x48
+; GFX10-MAD-NEXT:    v_mov_b32_e32 v1, 0.5
 ; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-MAD-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-MAD-NEXT:    v_madak_f32 v1, 0.5, v1, 0x42280000
+; GFX10-MAD-NEXT:    v_madak_f32 v1, s0, v1, 0x42280000
 ; GFX10-MAD-NEXT:    v_mul_f32_e32 v0, v1, v0
 ; GFX10-MAD-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX10-MAD-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1464,9 +1464,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl
 ; GFX10-FMA-NEXT:    global_load_dword v0, v[0:1], off glc dlc
 ; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-FMA-NEXT:    s_load_dword s0, s[0:1], 0x48
+; GFX10-FMA-NEXT:    v_mov_b32_e32 v1, 0.5
 ; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-FMA-NEXT:    v_mov_b32_e32 v1, s0
-; GFX10-FMA-NEXT:    v_fmaak_f32 v1, 0.5, v1, 0x42280000
+; GFX10-FMA-NEXT:    v_fmaak_f32 v1, s0, v1, 0x42280000
 ; GFX10-FMA-NEXT:    v_mul_f32_e32 v0, v1, v0
 ; GFX10-FMA-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX10-FMA-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1486,10 +1486,10 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl
 ; GFX11-FMA-NEXT:    global_load_b32 v0, v[0:1], off glc dlc
 ; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FMA-NEXT:    s_load_b32 s0, s[0:1], 0x48
+; GFX11-FMA-NEXT:    v_mov_b32_e32 v1, 0.5
 ; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-FMA-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX11-FMA-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FMA-NEXT:    v_fmaak_f32 v1, 0.5, v1, 0x42280000
+; GFX11-FMA-NEXT:    v_fmaak_f32 v1, s0, v1, 0x42280000
 ; GFX11-FMA-NEXT:    v_mul_f32_e32 v0, v1, v0
 ; GFX11-FMA-NEXT:    global_store_b32 v[0:1], v0, off dlc
 ; GFX11-FMA-NEXT:    s_waitcnt_vscnt null, 0x0



More information about the llvm-commits mailing list