[llvm] [AMDGPU] Allow folding to FMAAK with SGPR and immediate operand on GFX10+ (PR #72266)

Tue Nov 14 07:13:51 PST 2023

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Jay Foad (jayfoad)

<details>
<summary>Changes</summary>

Allow FoldImmediate to create instructions like:

  v_fmaak_f32 v0, s0, v0, 0x42000000

This instruction has two "scalar values" that occupy the constant bus:
the SGPR s0 and the literal 0x42000000. On GFX10+ this is allowed. This
fold was originally implemented before the compiler supported GFX10,
when all ASICs were limited to one scalar value.


---

Patch is 76.23 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/72266.diff


4 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+35-33) 
- (modified) llvm/test/CodeGen/AMDGPU/code-size-estimate.ll (+6-6) 
- (modified) llvm/test/CodeGen/AMDGPU/fma.f16.ll (+4-4) 
- (modified) llvm/test/CodeGen/AMDGPU/madak.ll (+1350-125) 


``````````diff

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 027b695c3bb1a74..043dfa8ab50116f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3497,43 +3497,45 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
 
     // Added part is the constant: Use v_madak_{f16, f32}.
     if (Src2->isReg() && Src2->getReg() == Reg) {
-      // Not allowed to use constant bus for another operand.
-      // We can however allow an inline immediate as src0.
-      bool Src0Inlined = false;
-      if (Src0->isReg()) {
-        // Try to inline constant if possible.
-        // If the Def moves immediate and the use is single
-        // We are saving VGPR here.
-        MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
-        if (Def && Def->isMoveImmediate() &&
-          isInlineConstant(Def->getOperand(1)) &&
-          MRI->hasOneUse(Src0->getReg())) {
-          Src0->ChangeToImmediate(Def->getOperand(1).getImm());
-          Src0Inlined = true;
-        } else if ((Src0->getReg().isPhysical() &&
-                    (ST.getConstantBusLimit(Opc) <= 1 &&
-                     RI.isSGPRClass(RI.getPhysRegBaseClass(Src0->getReg())))) ||
-                   (Src0->getReg().isVirtual() &&
-                    (ST.getConstantBusLimit(Opc) <= 1 &&
-                     RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))))
-          return false;
+      if (ST.getConstantBusLimit(Opc) < 2) {
+        // Not allowed to use constant bus for another operand.
+        // We can however allow an inline immediate as src0.
+        bool Src0Inlined = false;
+        if (Src0->isReg()) {
+          // Try to inline constant if possible.
+          // If the Def moves immediate and the use is single
+          // We are saving VGPR here.
+          MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
+          if (Def && Def->isMoveImmediate() &&
+              isInlineConstant(Def->getOperand(1)) &&
+              MRI->hasOneUse(Src0->getReg())) {
+            Src0->ChangeToImmediate(Def->getOperand(1).getImm());
+            Src0Inlined = true;
+          } else if ((Src0->getReg().isPhysical() &&
+                      (ST.getConstantBusLimit(Opc) <= 1 &&
+                       RI.isSGPRClass(
+                           RI.getPhysRegBaseClass(Src0->getReg())))) ||
+                     (Src0->getReg().isVirtual() &&
+                      (ST.getConstantBusLimit(Opc) <= 1 &&
+                       RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))))
+            return false;
           // VGPR is okay as Src0 - fallthrough
-      }
+        }
 
-      if (Src1->isReg() && !Src0Inlined ) {
-        // We have one slot for inlinable constant so far - try to fill it
-        MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
-        if (Def && Def->isMoveImmediate() &&
-            isInlineConstant(Def->getOperand(1)) &&
-            MRI->hasOneUse(Src1->getReg()) &&
-            commuteInstruction(UseMI)) {
+        if (Src1->isReg() && !Src0Inlined) {
+          // We have one slot for inlinable constant so far - try to fill it
+          MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
+          if (Def && Def->isMoveImmediate() &&
+              isInlineConstant(Def->getOperand(1)) &&
+              MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI)) {
             Src0->ChangeToImmediate(Def->getOperand(1).getImm());
-        } else if ((Src1->getReg().isPhysical() &&
-                    RI.isSGPRClass(RI.getPhysRegBaseClass(Src1->getReg()))) ||
-                   (Src1->getReg().isVirtual() &&
-                    RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
-          return false;
+          } else if ((Src1->getReg().isPhysical() &&
+                      RI.isSGPRClass(RI.getPhysRegBaseClass(Src1->getReg()))) ||
+                     (Src1->getReg().isVirtual() &&
+                      RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
+            return false;
           // VGPR is okay as Src1 - fallthrough
+        }
       }
 
       unsigned NewOpc =
diff --git a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
index 22823c8b6b0a8d5..b1222516786f1c3 100644
--- a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
@@ -323,15 +323,15 @@ define amdgpu_ps float @s_fmaak_f32(float inreg %x, float inreg %y) {
 ;
 ; GFX10-LABEL: s_fmaak_f32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_mov_b32_e32 v0, 0x43800000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x00,0x80,0x43]
-; GFX10-NEXT:    v_fmac_f32_e64 v0, s0, s1 ; encoding: [0x00,0x00,0x2b,0xd5,0x00,0x02,0x00,0x00]
+; GFX10-NEXT:    v_mov_b32_e32 v0, s1 ; encoding: [0x01,0x02,0x00,0x7e]
+; GFX10-NEXT:    v_fmaak_f32 v0, s0, v0, 0x43800000 ; encoding: [0x00,0x00,0x00,0x5a,0x00,0x00,0x80,0x43]
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX1100-LABEL: s_fmaak_f32:
 ; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    v_mov_b32_e32 v0, 0x43800000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x00,0x80,0x43]
+; GFX1100-NEXT:    v_mov_b32_e32 v0, s1 ; encoding: [0x01,0x02,0x00,0x7e]
 ; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
-; GFX1100-NEXT:    v_fmac_f32_e64 v0, s0, s1 ; encoding: [0x00,0x00,0x2b,0xd5,0x00,0x02,0x00,0x00]
+; GFX1100-NEXT:    v_fmaak_f32 v0, s0, v0, 0x43800000 ; encoding: [0x00,0x00,0x00,0x5a,0x00,0x00,0x80,0x43]
 ; GFX1100-NEXT:    ; return to shader part epilog
 ;
 ; GFX1150-LABEL: s_fmaak_f32:
@@ -345,6 +345,6 @@ define amdgpu_ps float @s_fmaak_f32(float inreg %x, float inreg %y) {
 }
 
 ; GFX9: codeLenInByte = 20
-; GFX10: codeLenInByte = 16
-; GFX1100: codeLenInByte = 20
+; GFX10: codeLenInByte = 12
+; GFX1100: codeLenInByte = 16
 ; GFX1150: codeLenInByte = 16
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
index 4ed3abff0ad8515..7894f6bc6797d66 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
@@ -146,10 +146,10 @@ define i32 @test_D139469_f16(half %arg) {
 ; GFX10-SDAG-LABEL: test_D139469_f16:
 ; GFX10-SDAG:       ; %bb.0: ; %bb
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_mov_b32_e32 v1, 0x211e
-; GFX10-SDAG-NEXT:    v_mul_f16_e32 v2, 0x291e, v0
-; GFX10-SDAG-NEXT:    v_fmac_f16_e32 v1, 0x291e, v0
-; GFX10-SDAG-NEXT:    v_min_f16_e32 v0, v2, v1
+; GFX10-SDAG-NEXT:    s_movk_i32 s4, 0x291e
+; GFX10-SDAG-NEXT:    v_mul_f16_e32 v1, 0x291e, v0
+; GFX10-SDAG-NEXT:    v_fmaak_f16 v0, s4, v0, 0x211e
+; GFX10-SDAG-NEXT:    v_min_f16_e32 v0, v1, v0
 ; GFX10-SDAG-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v0
 ; GFX10-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll
index 50a3bb187c4ac54..310328ddb941856 100644
--- a/llvm/test/CodeGen/AMDGPU/madak.ll
+++ b/llvm/test/CodeGen/AMDGPU/madak.ll
@@ -1,28 +1,150 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6_8_9,MAD %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX6_8_9,GFX8_9,GFX8PLUS,MAD %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX6_8_9,GFX8_9,GFX8PLUS,MAD %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX8PLUS,GFX10-MAD %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX8PLUS,FMA,GFX10PLUS-FMA %s
-; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX9,GFX8PLUS,FMA,GFX940-FMA %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX8PLUS,GFX11-MAD %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX8PLUS,FMA,GFX10PLUS-FMA %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-MAD %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-MAD %s
+
+; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GFX940-FMA %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GFX10-FMA %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GFX11-FMA %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 declare float @llvm.fabs.f32(float) nounwind readnone
 
-; GCN-LABEL: {{^}}madak_f32:
-; GFX6:   buffer_load_dword [[VA:v[0-9]+]]
-; GFX6:   buffer_load_dword [[VB:v[0-9]+]]
-; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
-; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]]
-; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
-; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
-; GFX10PLUS: {{flat|global}}_load_{{dword|b32}} [[VA:v[0-9]+]]
-; GFX10PLUS: {{flat|global}}_load_{{dword|b32}} [[VB:v[0-9]+]]
-; MAD:   v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
-; GFX10-MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
-; FMA:   v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
 define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 {
+; GFX6-LABEL: madak_f32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_mov_b32 s2, 0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b64 s[0:1], s[6:7]
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
+; GFX6-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_madak_f32 v2, v2, v3, 0x41200000
+; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT:    s_endpgm
+;
+; GFX8-LABEL: madak_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s7
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s6, v4
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT:    flat_load_dword v5, v[0:1]
+; GFX8-NEXT:    flat_load_dword v2, v[2:3]
+; GFX8-NEXT:    v_mov_b32_e32 v1, s5
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_madak_f32 v2, v5, v2, 0x41200000
+; GFX8-NEXT:    flat_store_dword v[0:1], v2
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: madak_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_madak_f32 v1, v1, v2, 0x41200000
+; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-MAD-LABEL: madak_f32:
+; GFX10-MAD:       ; %bb.0:
+; GFX10-MAD-NEXT:    s_clause 0x1
+; GFX10-MAD-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-MAD-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-MAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-MAD-NEXT:    s_clause 0x1
+; GFX10-MAD-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-MAD-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX10-MAD-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-MAD-NEXT:    v_madak_f32 v1, v1, v2, 0x41200000
+; GFX10-MAD-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-MAD-NEXT:    s_endpgm
+;
+; GFX11-MAD-LABEL: madak_f32:
+; GFX11-MAD:       ; %bb.0:
+; GFX11-MAD-NEXT:    s_clause 0x1
+; GFX11-MAD-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-MAD-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-MAD-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-MAD-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-MAD-NEXT:    s_clause 0x1
+; GFX11-MAD-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-MAD-NEXT:    global_load_b32 v2, v0, s[0:1]
+; GFX11-MAD-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-MAD-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX11-MAD-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-MAD-NEXT:    v_add_f32_e32 v1, 0x41200000, v1
+; GFX11-MAD-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-MAD-NEXT:    s_nop 0
+; GFX11-MAD-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-MAD-NEXT:    s_endpgm
+;
+; GFX940-FMA-LABEL: madak_f32:
+; GFX940-FMA:       ; %bb.0:
+; GFX940-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX940-FMA-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX940-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-FMA-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX940-FMA-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX940-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT:    v_fmaak_f32 v1, v1, v2, 0x41200000
+; GFX940-FMA-NEXT:    global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-FMA-NEXT:    s_endpgm
+;
+; GFX10-FMA-LABEL: madak_f32:
+; GFX10-FMA:       ; %bb.0:
+; GFX10-FMA-NEXT:    s_clause 0x1
+; GFX10-FMA-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-FMA-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-FMA-NEXT:    s_clause 0x1
+; GFX10-FMA-NEXT:    global_load_dword v1, v0, s[6:7]
+; GFX10-FMA-NEXT:    global_load_dword v2, v0, s[2:3]
+; GFX10-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-FMA-NEXT:    v_fmaak_f32 v1, v1, v2, 0x41200000
+; GFX10-FMA-NEXT:    global_store_dword v0, v1, s[4:5]
+; GFX10-FMA-NEXT:    s_endpgm
+;
+; GFX11-FMA-LABEL: madak_f32:
+; GFX11-FMA:       ; %bb.0:
+; GFX11-FMA-NEXT:    s_clause 0x1
+; GFX11-FMA-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT:    s_clause 0x1
+; GFX11-FMA-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX11-FMA-NEXT:    global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT:    v_fmaak_f32 v1, v1, v2, 0x41200000
+; GFX11-FMA-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX11-FMA-NEXT:    s_nop 0
+; GFX11-FMA-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT:    s_endpgm
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
   %in.b.gep = getelementptr float, ptr addrspace(1) %in.b, i32 %tid
@@ -40,25 +162,183 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac
 ; Make sure this is only folded with one use. This is a code size
 ; optimization and if we fold the immediate multiple times, we'll undo
 ; it.
-
-; GCN-LABEL: {{^}}madak_2_use_f32:
-; GFX9:         v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GFX6-DAG:     buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
-; GFX6-DAG:     buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GFX6-DAG:     buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
-; GFX8PLUS:     {{flat|global}}_load_{{dword|b32}} [[VA:v[0-9]+]],
-; GFX8PLUS:     {{flat|global}}_load_{{dword|b32}} [[VB:v[0-9]+]],
-; GFX8PLUS:     {{flat|global}}_load_{{dword|b32}} [[VC:v[0-9]+]],
-; GFX6-DAG:     v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GFX8-DAG:     v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GFX6_8_9-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
-; GFX10-MAD-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
-; FMA-DAG:      v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
-; MAD-DAG:      v_mac_f32_e32 [[VK]], [[VA]], [[VC]]
-; GFX10PLUS-FMA-DAG: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VC]], 0x41200000
-; GFX940-FMA-DAG:v_fmac_f32_e32 [[VK]], [[VA]], [[VC]]
-; GCN:          s_endpgm
 define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
+; GFX6-LABEL: madak_2_use_f32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, 0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_mov_b32_e32 v5, 0x41200000
+; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; GFX6-NEXT:    v_madak_f32 v3, v2, v3, 0x41200000
+; GFX6-NEXT:    v_mac_f32_e32 v5, v2, v4
+; GFX6-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v5, v[0:1], s[4:7], 0 addr64 offset:4
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_endpgm
+;
+; GFX8-LABEL: madak_2_use_f32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 8, v0
+; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v7, v[0:1] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_load_dword v8, v[2:3] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_load_dword v4, v[4:5] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v6
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x41200000
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_madak_f32 v6, v7, v8, 0x41200000
+; GFX8-NEXT:    v_mac_f32_e32 v5, v7, v4
+; GFX8-NEXT:    flat_store_dword v[0:1], v6
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_dword v[2:3], v5
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: madak_2_use_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x41200000
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:4 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_load_dword v3, v0, s[2:3] offset:8 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_madak_f32 v2, v1, v2, 0x41200000
+; GFX9-NEXT:    v_mac_f32_e32 v4, v1, v3
+; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/72266