[llvm] [AMDGPU] Allow folding to FMAAK with SGPR and immediate operand on GFX10+ (PR #72266)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 14 07:13:51 PST 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Jay Foad (jayfoad)
<details>
<summary>Changes</summary>
Allow SIInstrInfo::FoldImmediate to create instructions like:
v_fmaak_f32 v0, s0, v0, 0x42000000
This instruction has two "scalar values" (constant bus operands): the
SGPR s0 and the literal 0x42000000. On GFX10+ this is allowed. This fold
was originally implemented before the compiler supported GFX10, when all
ASICs were limited to one scalar value per instruction.
---
Patch is 76.23 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/72266.diff
4 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+35-33)
- (modified) llvm/test/CodeGen/AMDGPU/code-size-estimate.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/fma.f16.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/madak.ll (+1350-125)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 027b695c3bb1a74..043dfa8ab50116f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3497,43 +3497,45 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
// Added part is the constant: Use v_madak_{f16, f32}.
if (Src2->isReg() && Src2->getReg() == Reg) {
- // Not allowed to use constant bus for another operand.
- // We can however allow an inline immediate as src0.
- bool Src0Inlined = false;
- if (Src0->isReg()) {
- // Try to inline constant if possible.
- // If the Def moves immediate and the use is single
- // We are saving VGPR here.
- MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
- if (Def && Def->isMoveImmediate() &&
- isInlineConstant(Def->getOperand(1)) &&
- MRI->hasOneUse(Src0->getReg())) {
- Src0->ChangeToImmediate(Def->getOperand(1).getImm());
- Src0Inlined = true;
- } else if ((Src0->getReg().isPhysical() &&
- (ST.getConstantBusLimit(Opc) <= 1 &&
- RI.isSGPRClass(RI.getPhysRegBaseClass(Src0->getReg())))) ||
- (Src0->getReg().isVirtual() &&
- (ST.getConstantBusLimit(Opc) <= 1 &&
- RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))))
- return false;
+ if (ST.getConstantBusLimit(Opc) < 2) {
+ // Not allowed to use constant bus for another operand.
+ // We can however allow an inline immediate as src0.
+ bool Src0Inlined = false;
+ if (Src0->isReg()) {
+ // Try to inline constant if possible.
+ // If the Def moves immediate and the use is single
+ // We are saving VGPR here.
+ MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
+ if (Def && Def->isMoveImmediate() &&
+ isInlineConstant(Def->getOperand(1)) &&
+ MRI->hasOneUse(Src0->getReg())) {
+ Src0->ChangeToImmediate(Def->getOperand(1).getImm());
+ Src0Inlined = true;
+ } else if ((Src0->getReg().isPhysical() &&
+ (ST.getConstantBusLimit(Opc) <= 1 &&
+ RI.isSGPRClass(
+ RI.getPhysRegBaseClass(Src0->getReg())))) ||
+ (Src0->getReg().isVirtual() &&
+ (ST.getConstantBusLimit(Opc) <= 1 &&
+ RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))))
+ return false;
// VGPR is okay as Src0 - fallthrough
- }
+ }
- if (Src1->isReg() && !Src0Inlined ) {
- // We have one slot for inlinable constant so far - try to fill it
- MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
- if (Def && Def->isMoveImmediate() &&
- isInlineConstant(Def->getOperand(1)) &&
- MRI->hasOneUse(Src1->getReg()) &&
- commuteInstruction(UseMI)) {
+ if (Src1->isReg() && !Src0Inlined) {
+ // We have one slot for inlinable constant so far - try to fill it
+ MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
+ if (Def && Def->isMoveImmediate() &&
+ isInlineConstant(Def->getOperand(1)) &&
+ MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI)) {
Src0->ChangeToImmediate(Def->getOperand(1).getImm());
- } else if ((Src1->getReg().isPhysical() &&
- RI.isSGPRClass(RI.getPhysRegBaseClass(Src1->getReg()))) ||
- (Src1->getReg().isVirtual() &&
- RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
- return false;
+ } else if ((Src1->getReg().isPhysical() &&
+ RI.isSGPRClass(RI.getPhysRegBaseClass(Src1->getReg()))) ||
+ (Src1->getReg().isVirtual() &&
+ RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
+ return false;
// VGPR is okay as Src1 - fallthrough
+ }
}
unsigned NewOpc =
diff --git a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
index 22823c8b6b0a8d5..b1222516786f1c3 100644
--- a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
@@ -323,15 +323,15 @@ define amdgpu_ps float @s_fmaak_f32(float inreg %x, float inreg %y) {
;
; GFX10-LABEL: s_fmaak_f32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_mov_b32_e32 v0, 0x43800000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x00,0x80,0x43]
-; GFX10-NEXT: v_fmac_f32_e64 v0, s0, s1 ; encoding: [0x00,0x00,0x2b,0xd5,0x00,0x02,0x00,0x00]
+; GFX10-NEXT: v_mov_b32_e32 v0, s1 ; encoding: [0x01,0x02,0x00,0x7e]
+; GFX10-NEXT: v_fmaak_f32 v0, s0, v0, 0x43800000 ; encoding: [0x00,0x00,0x00,0x5a,0x00,0x00,0x80,0x43]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX1100-LABEL: s_fmaak_f32:
; GFX1100: ; %bb.0:
-; GFX1100-NEXT: v_mov_b32_e32 v0, 0x43800000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x00,0x80,0x43]
+; GFX1100-NEXT: v_mov_b32_e32 v0, s1 ; encoding: [0x01,0x02,0x00,0x7e]
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
-; GFX1100-NEXT: v_fmac_f32_e64 v0, s0, s1 ; encoding: [0x00,0x00,0x2b,0xd5,0x00,0x02,0x00,0x00]
+; GFX1100-NEXT: v_fmaak_f32 v0, s0, v0, 0x43800000 ; encoding: [0x00,0x00,0x00,0x5a,0x00,0x00,0x80,0x43]
; GFX1100-NEXT: ; return to shader part epilog
;
; GFX1150-LABEL: s_fmaak_f32:
@@ -345,6 +345,6 @@ define amdgpu_ps float @s_fmaak_f32(float inreg %x, float inreg %y) {
}
; GFX9: codeLenInByte = 20
-; GFX10: codeLenInByte = 16
-; GFX1100: codeLenInByte = 20
+; GFX10: codeLenInByte = 12
+; GFX1100: codeLenInByte = 16
; GFX1150: codeLenInByte = 16
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
index 4ed3abff0ad8515..7894f6bc6797d66 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
@@ -146,10 +146,10 @@ define i32 @test_D139469_f16(half %arg) {
; GFX10-SDAG-LABEL: test_D139469_f16:
; GFX10-SDAG: ; %bb.0: ; %bb
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x211e
-; GFX10-SDAG-NEXT: v_mul_f16_e32 v2, 0x291e, v0
-; GFX10-SDAG-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
-; GFX10-SDAG-NEXT: v_min_f16_e32 v0, v2, v1
+; GFX10-SDAG-NEXT: s_movk_i32 s4, 0x291e
+; GFX10-SDAG-NEXT: v_mul_f16_e32 v1, 0x291e, v0
+; GFX10-SDAG-NEXT: v_fmaak_f16 v0, s4, v0, 0x211e
+; GFX10-SDAG-NEXT: v_min_f16_e32 v0, v1, v0
; GFX10-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll
index 50a3bb187c4ac54..310328ddb941856 100644
--- a/llvm/test/CodeGen/AMDGPU/madak.ll
+++ b/llvm/test/CodeGen/AMDGPU/madak.ll
@@ -1,28 +1,150 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6_8_9,MAD %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX6_8_9,GFX8_9,GFX8PLUS,MAD %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX6_8_9,GFX8_9,GFX8PLUS,MAD %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX8PLUS,GFX10-MAD %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX8PLUS,FMA,GFX10PLUS-FMA %s
-; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX9,GFX8PLUS,FMA,GFX940-FMA %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX8PLUS,GFX11-MAD %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX8PLUS,FMA,GFX10PLUS-FMA %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-MAD %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-MAD %s
+
+; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GFX940-FMA %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GFX10-FMA %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GFX11-FMA %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare float @llvm.fabs.f32(float) nounwind readnone
-; GCN-LABEL: {{^}}madak_f32:
-; GFX6: buffer_load_dword [[VA:v[0-9]+]]
-; GFX6: buffer_load_dword [[VB:v[0-9]+]]
-; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
-; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]]
-; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
-; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
-; GFX10PLUS: {{flat|global}}_load_{{dword|b32}} [[VA:v[0-9]+]]
-; GFX10PLUS: {{flat|global}}_load_{{dword|b32}} [[VB:v[0-9]+]]
-; MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
-; GFX10-MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
-; FMA: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 {
+; GFX6-LABEL: madak_f32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, 0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, 0
+; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3]
+; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
+; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_madak_f32 v2, v2, v3, 0x41200000
+; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: madak_f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v5, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_madak_f32 v2, v5, v2, 0x41200000
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: madak_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_madak_f32 v1, v1, v2, 0x41200000
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-MAD-LABEL: madak_f32:
+; GFX10-MAD: ; %bb.0:
+; GFX10-MAD-NEXT: s_clause 0x1
+; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-MAD-NEXT: s_clause 0x1
+; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX10-MAD-NEXT: v_madak_f32 v1, v1, v2, 0x41200000
+; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX10-MAD-NEXT: s_endpgm
+;
+; GFX11-MAD-LABEL: madak_f32:
+; GFX11-MAD: ; %bb.0:
+; GFX11-MAD-NEXT: s_clause 0x1
+; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-MAD-NEXT: s_clause 0x1
+; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX11-MAD-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1
+; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-MAD-NEXT: s_nop 0
+; GFX11-MAD-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-MAD-NEXT: s_endpgm
+;
+; GFX940-FMA-LABEL: madak_f32:
+; GFX940-FMA: ; %bb.0:
+; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX940-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000
+; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-FMA-NEXT: s_endpgm
+;
+; GFX10-FMA-LABEL: madak_f32:
+; GFX10-FMA: ; %bb.0:
+; GFX10-FMA-NEXT: s_clause 0x1
+; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FMA-NEXT: s_clause 0x1
+; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000
+; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX10-FMA-NEXT: s_endpgm
+;
+; GFX11-FMA-LABEL: madak_f32:
+; GFX11-FMA: ; %bb.0:
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000
+; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-FMA-NEXT: s_nop 0
+; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
%in.b.gep = getelementptr float, ptr addrspace(1) %in.b, i32 %tid
@@ -40,25 +162,183 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac
; Make sure this is only folded with one use. This is a code size
; optimization and if we fold the immediate multiple times, we'll undo
; it.
-
-; GCN-LABEL: {{^}}madak_2_use_f32:
-; GFX9: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GFX6-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
-; GFX6-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GFX6-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
-; GFX8PLUS: {{flat|global}}_load_{{dword|b32}} [[VA:v[0-9]+]],
-; GFX8PLUS: {{flat|global}}_load_{{dword|b32}} [[VB:v[0-9]+]],
-; GFX8PLUS: {{flat|global}}_load_{{dword|b32}} [[VC:v[0-9]+]],
-; GFX6-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GFX8-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GFX6_8_9-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
-; GFX10-MAD-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
-; FMA-DAG: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
-; MAD-DAG: v_mac_f32_e32 [[VK]], [[VA]], [[VC]]
-; GFX10PLUS-FMA-DAG: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VC]], 0x41200000
-; GFX940-FMA-DAG:v_fmac_f32_e32 [[VK]], [[VA]], [[VC]]
-; GCN: s_endpgm
define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
+; GFX6-LABEL: madak_2_use_f32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT: v_mov_b32_e32 v1, 0
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v5, 0x41200000
+; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GFX6-NEXT: v_madak_f32 v3, v2, v3, 0x41200000
+; GFX6-NEXT: v_mac_f32_e32 v5, v2, v4
+; GFX6-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v5, v[0:1], s[4:7], 0 addr64 offset:4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: madak_2_use_f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 2, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 8, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v7, v[0:1] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: flat_load_dword v8, v[2:3] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: flat_load_dword v4, v[4:5] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x41200000
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_madak_f32 v6, v7, v8, 0x41200000
+; GFX8-NEXT: v_mac_f32_e32 v5, v7, v4
+; GFX8-NEXT: flat_store_dword v[0:1], v6
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: flat_store_dword v[2:3], v5
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: madak_2_use_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x41200000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_madak_f32 v2, v1, v2, 0x41200000
+; GFX9-NEXT: v_mac_f32_e32 v4, v1, v3
+; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/72266
More information about the llvm-commits
mailing list