[llvm] [AMDGPU] Allow folding to FMAAK with SGPR and immediate operand on GFX10+ (PR #72266)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 14 07:13:22 PST 2023
https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/72266
Allow foldImmediate to create instructions like:
v_fmaak_f32 v0, s0, v0, 0x42000000
This instruction reads two "scalar values": the SGPR s0 and the literal
0x42000000. This is allowed on GFX10+. The fold was originally
implemented before the compiler supported GFX10, when all ASICs were
limited to one scalar value per instruction.
From 7d1344ee23e69a4cf626ad5b459adc5544f2e105 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Tue, 14 Nov 2023 14:51:42 +0000
Subject: [PATCH 1/2] [AMDGPU] Generate checks in madak.ll
---
llvm/test/CodeGen/AMDGPU/madak.ll | 1475 ++++++++++++++++++++++++++---
1 file changed, 1350 insertions(+), 125 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll
index 50a3bb187c4ac54..a43e23e4aeeada1 100644
--- a/llvm/test/CodeGen/AMDGPU/madak.ll
+++ b/llvm/test/CodeGen/AMDGPU/madak.ll
@@ -1,28 +1,150 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6_8_9,MAD %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX6_8_9,GFX8_9,GFX8PLUS,MAD %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX6_8_9,GFX8_9,GFX8PLUS,MAD %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX8PLUS,GFX10-MAD %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX8PLUS,FMA,GFX10PLUS-FMA %s
-; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX9,GFX8PLUS,FMA,GFX940-FMA %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX8PLUS,GFX11-MAD %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX8PLUS,FMA,GFX10PLUS-FMA %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-MAD %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-MAD %s
+
+; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GFX940-FMA %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GFX10-FMA %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefixes=GFX11-FMA %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare float @llvm.fabs.f32(float) nounwind readnone
-; GCN-LABEL: {{^}}madak_f32:
-; GFX6: buffer_load_dword [[VA:v[0-9]+]]
-; GFX6: buffer_load_dword [[VB:v[0-9]+]]
-; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
-; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]]
-; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
-; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
-; GFX10PLUS: {{flat|global}}_load_{{dword|b32}} [[VA:v[0-9]+]]
-; GFX10PLUS: {{flat|global}}_load_{{dword|b32}} [[VB:v[0-9]+]]
-; MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
-; GFX10-MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
-; FMA: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 {
+; GFX6-LABEL: madak_f32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, 0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, 0
+; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3]
+; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
+; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_madak_f32 v2, v2, v3, 0x41200000
+; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: madak_f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v5, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_madak_f32 v2, v5, v2, 0x41200000
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: madak_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_madak_f32 v1, v1, v2, 0x41200000
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-MAD-LABEL: madak_f32:
+; GFX10-MAD: ; %bb.0:
+; GFX10-MAD-NEXT: s_clause 0x1
+; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-MAD-NEXT: s_clause 0x1
+; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX10-MAD-NEXT: v_madak_f32 v1, v1, v2, 0x41200000
+; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX10-MAD-NEXT: s_endpgm
+;
+; GFX11-MAD-LABEL: madak_f32:
+; GFX11-MAD: ; %bb.0:
+; GFX11-MAD-NEXT: s_clause 0x1
+; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-MAD-NEXT: s_clause 0x1
+; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX11-MAD-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1
+; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-MAD-NEXT: s_nop 0
+; GFX11-MAD-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-MAD-NEXT: s_endpgm
+;
+; GFX940-FMA-LABEL: madak_f32:
+; GFX940-FMA: ; %bb.0:
+; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX940-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000
+; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-FMA-NEXT: s_endpgm
+;
+; GFX10-FMA-LABEL: madak_f32:
+; GFX10-FMA: ; %bb.0:
+; GFX10-FMA-NEXT: s_clause 0x1
+; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FMA-NEXT: s_clause 0x1
+; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000
+; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX10-FMA-NEXT: s_endpgm
+;
+; GFX11-FMA-LABEL: madak_f32:
+; GFX11-FMA: ; %bb.0:
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000
+; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-FMA-NEXT: s_nop 0
+; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
%in.b.gep = getelementptr float, ptr addrspace(1) %in.b, i32 %tid
@@ -40,25 +162,183 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac
; Make sure this is only folded with one use. This is a code size
; optimization and if we fold the immediate multiple times, we'll undo
; it.
-
-; GCN-LABEL: {{^}}madak_2_use_f32:
-; GFX9: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GFX6-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
-; GFX6-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GFX6-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
-; GFX8PLUS: {{flat|global}}_load_{{dword|b32}} [[VA:v[0-9]+]],
-; GFX8PLUS: {{flat|global}}_load_{{dword|b32}} [[VB:v[0-9]+]],
-; GFX8PLUS: {{flat|global}}_load_{{dword|b32}} [[VC:v[0-9]+]],
-; GFX6-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GFX8-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GFX6_8_9-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
-; GFX10-MAD-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
-; FMA-DAG: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
-; MAD-DAG: v_mac_f32_e32 [[VK]], [[VA]], [[VC]]
-; GFX10PLUS-FMA-DAG: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VC]], 0x41200000
-; GFX940-FMA-DAG:v_fmac_f32_e32 [[VK]], [[VA]], [[VC]]
-; GCN: s_endpgm
define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 {
+; GFX6-LABEL: madak_2_use_f32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT: v_mov_b32_e32 v1, 0
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v5, 0x41200000
+; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GFX6-NEXT: v_madak_f32 v3, v2, v3, 0x41200000
+; GFX6-NEXT: v_mac_f32_e32 v5, v2, v4
+; GFX6-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_store_dword v5, v[0:1], s[4:7], 0 addr64 offset:4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: madak_2_use_f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 2, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 8, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v7, v[0:1] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: flat_load_dword v8, v[2:3] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: flat_load_dword v4, v[4:5] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; GFX8-NEXT: v_mov_b32_e32 v5, 0x41200000
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_madak_f32 v6, v7, v8, 0x41200000
+; GFX8-NEXT: v_mac_f32_e32 v5, v7, v4
+; GFX8-NEXT: flat_store_dword v[0:1], v6
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: flat_store_dword v[2:3], v5
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: madak_2_use_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x41200000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_madak_f32 v2, v1, v2, 0x41200000
+; GFX9-NEXT: v_mac_f32_e32 v4, v1, v3
+; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v4, s[2:3] offset:4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-MAD-LABEL: madak_2_use_f32:
+; GFX10-MAD: ; %bb.0:
+; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX10-MAD-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc
+; GFX10-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX10-MAD-NEXT: v_madak_f32 v2, v1, v2, 0x41200000
+; GFX10-MAD-NEXT: v_madak_f32 v1, v1, v3, 0x41200000
+; GFX10-MAD-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX10-MAD-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-MAD-NEXT: global_store_dword v0, v1, s[2:3] offset:4
+; GFX10-MAD-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-MAD-NEXT: s_endpgm
+;
+; GFX11-MAD-LABEL: madak_2_use_f32:
+; GFX11-MAD: ; %bb.0:
+; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 glc dlc
+; GFX11-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX11-MAD-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 glc dlc
+; GFX11-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX11-MAD-NEXT: v_mul_f32_e32 v2, v1, v2
+; GFX11-MAD-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-MAD-NEXT: v_dual_add_f32 v1, 0x41200000, v1 :: v_dual_add_f32 v2, 0x41200000, v2
+; GFX11-MAD-NEXT: global_store_b32 v0, v2, s[0:1] dlc
+; GFX11-MAD-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[2:3] offset:4 dlc
+; GFX11-MAD-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-MAD-NEXT: s_nop 0
+; GFX11-MAD-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-MAD-NEXT: s_endpgm
+;
+; GFX940-FMA-LABEL: madak_2_use_f32:
+; GFX940-FMA: ; %bb.0:
+; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-FMA-NEXT: v_mov_b32_e32 v4, 0x41200000
+; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1
+; GFX940-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3] offset:4 sc0 sc1
+; GFX940-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT: global_load_dword v3, v0, s[2:3] offset:8 sc0 sc1
+; GFX940-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT: v_fmaak_f32 v2, v1, v2, 0x41200000
+; GFX940-FMA-NEXT: v_fmac_f32_e32 v4, v1, v3
+; GFX940-FMA-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1
+; GFX940-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT: global_store_dword v0, v4, s[2:3] offset:4 sc0 sc1
+; GFX940-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT: s_endpgm
+;
+; GFX10-FMA-LABEL: madak_2_use_f32:
+; GFX10-FMA: ; %bb.0:
+; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FMA-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc
+; GFX10-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FMA-NEXT: v_fmaak_f32 v2, v1, v2, 0x41200000
+; GFX10-FMA-NEXT: v_fmaak_f32 v1, v1, v3, 0x41200000
+; GFX10-FMA-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX10-FMA-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-FMA-NEXT: global_store_dword v0, v1, s[2:3] offset:4
+; GFX10-FMA-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-FMA-NEXT: s_endpgm
+;
+; GFX11-FMA-LABEL: madak_2_use_f32:
+; GFX11-FMA: ; %bb.0:
+; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 glc dlc
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 glc dlc
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: v_fmaak_f32 v2, v1, v2, 0x41200000
+; GFX11-FMA-NEXT: v_fmaak_f32 v1, v1, v3, 0x41200000
+; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[0:1] dlc
+; GFX11-FMA-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[2:3] offset:4 dlc
+; GFX11-FMA-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FMA-NEXT: s_nop 0
+; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
@@ -82,12 +362,111 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad
ret void
}
-; GCN-LABEL: {{^}}madak_m_inline_imm_f32:
-; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[VA:v[0-9]+]]
-; MAD: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
-; GFX10-MAD: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
-; FMA: v_fmaak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a) #0 {
+; GFX6-LABEL: madak_m_inline_imm_f32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT: v_mov_b32_e32 v1, 0
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
+; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_madak_f32 v2, 4.0, v2, 0x41200000
+; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: madak_m_inline_imm_f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_madak_f32 v2, 4.0, v3, 0x41200000
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: madak_m_inline_imm_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_madak_f32 v1, 4.0, v1, 0x41200000
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-MAD-LABEL: madak_m_inline_imm_f32:
+; GFX10-MAD: ; %bb.0:
+; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX10-MAD-NEXT: v_madak_f32 v1, 4.0, v1, 0x41200000
+; GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-MAD-NEXT: s_endpgm
+;
+; GFX11-MAD-LABEL: madak_m_inline_imm_f32:
+; GFX11-MAD: ; %bb.0:
+; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX11-MAD-NEXT: v_mul_f32_e32 v1, 4.0, v1
+; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1
+; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-MAD-NEXT: s_nop 0
+; GFX11-MAD-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-MAD-NEXT: s_endpgm
+;
+; GFX940-FMA-LABEL: madak_m_inline_imm_f32:
+; GFX940-FMA: ; %bb.0:
+; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX940-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT: v_fmaak_f32 v1, 4.0, v1, 0x41200000
+; GFX940-FMA-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
+; GFX940-FMA-NEXT: s_endpgm
+;
+; GFX10-FMA-LABEL: madak_m_inline_imm_f32:
+; GFX10-FMA: ; %bb.0:
+; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FMA-NEXT: v_fmaak_f32 v1, 4.0, v1, 0x41200000
+; GFX10-FMA-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-FMA-NEXT: s_endpgm
+;
+; GFX11-FMA-LABEL: madak_m_inline_imm_f32:
+; GFX11-FMA: ; %bb.0:
+; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: v_fmaak_f32 v1, 4.0, v1, 0x41200000
+; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-FMA-NEXT: s_nop 0
+; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
%out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
@@ -102,20 +481,139 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out,
; Make sure nothing weird happens with a value that is also allowed as
; an inline immediate.
-
-; GCN-LABEL: {{^}}madak_inline_imm_f32:
-; GFX6: buffer_load_dword [[VA:v[0-9]+]]
-; GFX6: buffer_load_dword [[VB:v[0-9]+]]
-; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
-; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]]
-; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
-; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
-; GFX10PLUS: {{flat|global}}_load_{{dword|b32}} [[VA:v[0-9]+]]
-; GFX10PLUS: {{flat|global}}_load_{{dword|b32}} [[VB:v[0-9]+]]
-; MAD: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
-; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
-; FMA: v_fma_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 {
+; GFX6-LABEL: madak_inline_imm_f32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, 0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, 0
+; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3]
+; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
+; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mad_f32 v2, v2, v3, 4.0
+; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: madak_inline_imm_f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v5, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_f32 v2, v5, v2, 4.0
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: madak_inline_imm_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mad_f32 v1, v1, v2, 4.0
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-MAD-LABEL: madak_inline_imm_f32:
+; GFX10-MAD: ; %bb.0:
+; GFX10-MAD-NEXT: s_clause 0x1
+; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-MAD-NEXT: s_clause 0x1
+; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX10-MAD-NEXT: v_mad_f32 v1, v1, v2, 4.0
+; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX10-MAD-NEXT: s_endpgm
+;
+; GFX11-MAD-LABEL: madak_inline_imm_f32:
+; GFX11-MAD: ; %bb.0:
+; GFX11-MAD-NEXT: s_clause 0x1
+; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-MAD-NEXT: s_clause 0x1
+; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX11-MAD-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-MAD-NEXT: v_add_f32_e32 v1, 4.0, v1
+; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-MAD-NEXT: s_nop 0
+; GFX11-MAD-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-MAD-NEXT: s_endpgm
+;
+; GFX940-FMA-LABEL: madak_inline_imm_f32:
+; GFX940-FMA: ; %bb.0:
+; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX940-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT: v_fma_f32 v1, v1, v2, 4.0
+; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-FMA-NEXT: s_endpgm
+;
+; GFX10-FMA-LABEL: madak_inline_imm_f32:
+; GFX10-FMA: ; %bb.0:
+; GFX10-FMA-NEXT: s_clause 0x1
+; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FMA-NEXT: s_clause 0x1
+; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FMA-NEXT: v_fma_f32 v1, v1, v2, 4.0
+; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX10-FMA-NEXT: s_endpgm
+;
+; GFX11-FMA-LABEL: madak_inline_imm_f32:
+; GFX11-FMA: ; %bb.0:
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, 4.0
+; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-FMA-NEXT: s_nop 0
+; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
%in.b.gep = getelementptr float, ptr addrspace(1) %in.b, i32 %tid
@@ -131,18 +629,125 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p
}
; We can't use an SGPR when forming madak
-; GCN-LABEL: {{^}}s_v_madak_f32:
-; GCN-DAG: s_load_{{dword|b32}} [[SB:s[0-9]+]]
-; GFX6_8_9-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GCN-DAG: {{buffer|flat|global}}_load_{{dword|b32}}{{(_addtid)?}} [[VA:v[0-9]+]]
-; GCN-NOT: v_madak_f32
-; GFX6_8_9: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
-; GFX10-MAD: v_madak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
-; GFX10PLUS-FMA: v_fmaak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
-; GFX940-FMA: v_fmac_f32_e32 v{{[0-9]+}}, [[SB]], [[VA]]
-; GFX11-MAD: v_mul_f32_e32 [[VMUL:v[0-9]+]], [[SB]], [[VA]]
-; GFX11-MAD: v_add_f32_e32 {{v[0-9]+}}, 0x41200000, [[VMUL]]
define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, float %b) #0 {
+; GFX6-LABEL: s_v_madak_f32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, 0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, 0
+; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; GFX6-NEXT: v_mov_b32_e32 v3, 0x41200000
+; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mac_f32_e32 v3, s8, v2
+; GFX6-NEXT: buffer_store_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: s_v_madak_f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x41200000
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mac_f32_e32 v2, s0, v3
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_v_madak_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x41200000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mac_f32_e32 v2, s2, v1
+; GFX9-NEXT: global_store_dword v0, v2, s[4:5]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-MAD-LABEL: s_v_madak_f32:
+; GFX10-MAD: ; %bb.0:
+; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-MAD-NEXT: s_load_dword s0, s[0:1], 0x34
+; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX10-MAD-NEXT: v_madak_f32 v1, s0, v1, 0x41200000
+; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX10-MAD-NEXT: s_endpgm
+;
+; GFX11-MAD-LABEL: s_v_madak_f32:
+; GFX11-MAD: ; %bb.0:
+; GFX11-MAD-NEXT: s_clause 0x1
+; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-MAD-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s0, v1
+; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1
+; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-MAD-NEXT: s_nop 0
+; GFX11-MAD-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-MAD-NEXT: s_endpgm
+;
+; GFX940-FMA-LABEL: s_v_madak_f32:
+; GFX940-FMA: ; %bb.0:
+; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX940-FMA-NEXT: s_load_dword s2, s[0:1], 0x34
+; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-FMA-NEXT: v_mov_b32_e32 v2, 0x41200000
+; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX940-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT: v_fmac_f32_e32 v2, s2, v1
+; GFX940-FMA-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1
+; GFX940-FMA-NEXT: s_endpgm
+;
+; GFX10-FMA-LABEL: s_v_madak_f32:
+; GFX10-FMA: ; %bb.0:
+; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FMA-NEXT: s_load_dword s0, s[0:1], 0x34
+; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FMA-NEXT: v_fmaak_f32 v1, s0, v1, 0x41200000
+; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX10-FMA-NEXT: s_endpgm
+;
+; GFX11-FMA-LABEL: s_v_madak_f32:
+; GFX11-FMA: ; %bb.0:
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT: s_load_b32 s0, s[0:1], 0x34
+; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: v_fmaak_f32 v1, s0, v1, 0x41200000
+; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-FMA-NEXT: s_nop 0
+; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
%out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
@@ -155,18 +760,136 @@ define amdgpu_kernel void @s_v_madak_f32(ptr addrspace(1) noalias %out, ptr addr
ret void
}
-; GCN-LABEL: @v_s_madak_f32
-; GCN-DAG: s_load_{{dword|b32}} [[SB:s[0-9]+]]
-; GFX6_8_9-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GCN-DAG: {{buffer|flat|global}}_load_{{dword|b32}}{{(_addtid)?}} [[VA:v[0-9]+]]
-; GFX6_8_9-NOT: v_madak_f32
-; GFX6_8_9: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
-; GFX10-MAD: v_madak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
-; GFX10PLUS-FMA: v_fmaak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
-; GFX940-FMA: v_fmac_f32_e32 v{{[0-9]+}}, [[SB]], [[VA]]
-; GFX11-MAD: v_mul_f32_e32 [[VMUL:v[0-9]+]], [[SB]], [[VA]]
-; GFX11-MAD: v_add_f32_e32 {{v[0-9]+}}, 0x41200000, [[VMUL]]
define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a, ptr addrspace(1) noalias %in.b) #0 {
+; GFX6-LABEL: v_s_madak_f32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT: v_mov_b32_e32 v1, 0
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb
+; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GFX6-NEXT: v_mov_b32_e32 v3, 0x41200000
+; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mac_f32_e32 v3, s2, v2
+; GFX6-NEXT: buffer_store_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: v_s_madak_f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x41200000
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mac_f32_e32 v2, s0, v3
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_s_madak_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0x41200000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mac_f32_e32 v2, s4, v1
+; GFX9-NEXT: global_store_dword v0, v2, s[2:3]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-MAD-LABEL: v_s_madak_f32:
+; GFX10-MAD: ; %bb.0:
+; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-MAD-NEXT: s_clause 0x1
+; GFX10-MAD-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX10-MAD-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-MAD-NEXT: v_madak_f32 v1, s4, v1, 0x41200000
+; GFX10-MAD-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX10-MAD-NEXT: s_endpgm
+;
+; GFX11-MAD-LABEL: v_s_madak_f32:
+; GFX11-MAD: ; %bb.0:
+; GFX11-MAD-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-MAD-NEXT: s_clause 0x1
+; GFX11-MAD-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s2, v1
+; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1
+; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-MAD-NEXT: s_nop 0
+; GFX11-MAD-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-MAD-NEXT: s_endpgm
+;
+; GFX940-FMA-LABEL: v_s_madak_f32:
+; GFX940-FMA: ; %bb.0:
+; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-FMA-NEXT: v_mov_b32_e32 v2, 0x41200000
+; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX940-FMA-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-FMA-NEXT: v_fmac_f32_e32 v2, s4, v1
+; GFX940-FMA-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1
+; GFX940-FMA-NEXT: s_endpgm
+;
+; GFX10-FMA-LABEL: v_s_madak_f32:
+; GFX10-FMA: ; %bb.0:
+; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-FMA-NEXT: s_clause 0x1
+; GFX10-FMA-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX10-FMA-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-FMA-NEXT: v_fmaak_f32 v1, s4, v1, 0x41200000
+; GFX10-FMA-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX10-FMA-NEXT: s_endpgm
+;
+; GFX11-FMA-LABEL: v_s_madak_f32:
+; GFX11-FMA: ; %bb.0:
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-FMA-NEXT: v_fmaak_f32 v1, s2, v1, 0x41200000
+; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-FMA-NEXT: s_nop 0
+; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.b.gep = getelementptr float, ptr addrspace(1) %in.b, i32 %tid
%out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
@@ -179,34 +902,241 @@ define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a
ret void
}
-; GCN-LABEL: {{^}}s_s_madak_f32:
-; GCN-NOT: v_madak_f32
-; GFX8_9: v_mac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GFX10-MAD: v_mac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-; GFX10PLUS-FMA: v_fmac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-; GFX940-FMA: v_fmac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-; GFX11-MAD: v_mul_f32_e64 [[VMUL:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}}
-; GFX11-MAD: v_dual_mov_b32 {{v[0-9]+}}, 0 :: v_dual_add_f32 {{v[0-9]+}}, 0x41200000, [[VMUL]]
define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float %b) #0 {
+; GFX6-LABEL: s_s_madak_f32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX6-NEXT: v_mov_b32_e32 v0, 0x41200000
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s6, -1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v1, s3
+; GFX6-NEXT: s_mov_b32 s4, s0
+; GFX6-NEXT: s_mov_b32 s5, s1
+; GFX6-NEXT: v_mac_f32_e32 v0, s2, v1
+; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: s_s_madak_f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: v_mov_b32_e32 v2, 0x41200000
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: v_mac_f32_e32 v2, s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: s_s_madak_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NEXT: v_mac_f32_e32 v1, s2, v2
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-MAD-LABEL: s_s_madak_f32:
+; GFX10-MAD: ; %bb.0:
+; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-MAD-NEXT: v_mov_b32_e32 v0, 0x41200000
+; GFX10-MAD-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-MAD-NEXT: v_mac_f32_e64 v0, s2, s3
+; GFX10-MAD-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-MAD-NEXT: s_endpgm
+;
+; GFX11-MAD-LABEL: s_s_madak_f32:
+; GFX11-MAD: ; %bb.0:
+; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-MAD-NEXT: v_mul_f32_e64 v0, s2, s3
+; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-MAD-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, 0x41200000, v0
+; GFX11-MAD-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-MAD-NEXT: s_nop 0
+; GFX11-MAD-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-MAD-NEXT: s_endpgm
+;
+; GFX940-FMA-LABEL: s_s_madak_f32:
+; GFX940-FMA: ; %bb.0:
+; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-FMA-NEXT: v_mov_b32_e32 v1, 0x41200000
+; GFX940-FMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-FMA-NEXT: v_mov_b32_e32 v2, s3
+; GFX940-FMA-NEXT: v_fmac_f32_e32 v1, s2, v2
+; GFX940-FMA-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
+; GFX940-FMA-NEXT: s_endpgm
+;
+; GFX10-FMA-LABEL: s_s_madak_f32:
+; GFX10-FMA: ; %bb.0:
+; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FMA-NEXT: v_mov_b32_e32 v0, 0x41200000
+; GFX10-FMA-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FMA-NEXT: v_fmac_f32_e64 v0, s2, s3
+; GFX10-FMA-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-FMA-NEXT: s_endpgm
+;
+; GFX11-FMA-LABEL: s_s_madak_f32:
+; GFX11-FMA: ; %bb.0:
+; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FMA-NEXT: v_dual_mov_b32 v0, 0x41200000 :: v_dual_mov_b32 v1, 0
+; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FMA-NEXT: v_fmac_f32_e64 v0, s2, s3
+; GFX11-FMA-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-FMA-NEXT: s_nop 0
+; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT: s_endpgm
%mul = fmul float %a, %b
%madak = fadd float %mul, 10.0
store float %madak, ptr addrspace(1) %out, align 4
ret void
}
-; GCN-LABEL: {{^}}no_madak_src0_modifier_f32:
-; GFX6: buffer_load_dword [[VA:v[0-9]+]]
-; GFX6: buffer_load_dword [[VB:v[0-9]+]]
-; GFX8PLUS: {{flat|global}}_load_{{dword|b32}} [[VB:v[0-9]+]]
-; GFX8PLUS: {{flat|global}}_load_{{dword|b32}} [[VA:v[0-9]+]]
-; GFX6_8_9: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
-; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000
-; GFX10PLUS-FMA: v_fma_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000
-; GFX940-FMA: v_fma_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{s[0-9]+}}
-; GFX11-MAD: v_mul_f32_e64 [[VMUL:v[0-9]+]], |{{v[0-9]+}}|, {{v[0-9]+}}
-; GFX11-MAD: v_add_f32_e32 {{v[0-9]+}}, 0x41200000, [[VMUL]]
-; GCN: s_endpgm
define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 {
+; GFX6-LABEL: no_madak_src0_modifier_f32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, 0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, 0
+; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3]
+; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
+; GFX6-NEXT: s_mov_b32 s0, 0x41200000
+; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mad_f32 v2, |v2|, v3, s0
+; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: no_madak_src0_modifier_f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v5, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4
+; GFX8-NEXT: s_mov_b32 s0, 0x41200000
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_f32 v2, |v5|, v2, s0
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: no_madak_src0_modifier_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_mov_b32 s0, 0x41200000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mad_f32 v1, |v1|, v2, s0
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-MAD-LABEL: no_madak_src0_modifier_f32:
+; GFX10-MAD: ; %bb.0:
+; GFX10-MAD-NEXT: s_clause 0x1
+; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-MAD-NEXT: s_clause 0x1
+; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX10-MAD-NEXT: v_mad_f32 v1, |v1|, v2, 0x41200000
+; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX10-MAD-NEXT: s_endpgm
+;
+; GFX11-MAD-LABEL: no_madak_src0_modifier_f32:
+; GFX11-MAD: ; %bb.0:
+; GFX11-MAD-NEXT: s_clause 0x1
+; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-MAD-NEXT: s_clause 0x1
+; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX11-MAD-NEXT: v_mul_f32_e64 v1, |v1|, v2
+; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1
+; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-MAD-NEXT: s_nop 0
+; GFX11-MAD-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-MAD-NEXT: s_endpgm
+;
+; GFX940-FMA-LABEL: no_madak_src0_modifier_f32:
+; GFX940-FMA: ; %bb.0:
+; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-FMA-NEXT: s_mov_b32 s0, 0x41200000
+; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX940-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT: v_fma_f32 v1, |v1|, v2, s0
+; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-FMA-NEXT: s_endpgm
+;
+; GFX10-FMA-LABEL: no_madak_src0_modifier_f32:
+; GFX10-FMA: ; %bb.0:
+; GFX10-FMA-NEXT: s_clause 0x1
+; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FMA-NEXT: s_clause 0x1
+; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FMA-NEXT: v_fma_f32 v1, |v1|, v2, 0x41200000
+; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX10-FMA-NEXT: s_endpgm
+;
+; GFX11-FMA-LABEL: no_madak_src0_modifier_f32:
+; GFX11-FMA: ; %bb.0:
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: v_fma_f32 v1, |v1|, v2, 0x41200000
+; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-FMA-NEXT: s_nop 0
+; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
%in.b.gep = getelementptr float, ptr addrspace(1) %in.b, i32 %tid
@@ -223,19 +1153,143 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias %
ret void
}
-; GCN-LABEL: {{^}}no_madak_src1_modifier_f32:
-; GFX6: buffer_load_dword [[VA:v[0-9]+]]
-; GFX6: buffer_load_dword [[VB:v[0-9]+]]
-; GFX8PLUS: {{flat|global}}_load_{{dword|b32}} [[VB:v[0-9]+]]
-; GFX8PLUS: {{flat|global}}_load_{{dword|b32}} [[VA:v[0-9]+]]
-; GFX6_8_9: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}}
-; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000
-; GFX10PLUS-FMA: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000
-; GFX940-FMA: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{s[0-9]+}}
-; GFX11-MAD: v_mul_f32_e64 [[VMUL:v[0-9]+]], {{v[0-9]+}}, |{{v[0-9]+}}|
-; GFX11-MAD: v_add_f32_e32 {{v[0-9]+}}, 0x41200000, [[VMUL]]
-; GCN: s_endpgm
define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in.a, ptr addrspace(1) noalias %in.b) #0 {
+; GFX6-LABEL: no_madak_src1_modifier_f32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, 0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v1, 0
+; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3]
+; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64
+; GFX6-NEXT: s_mov_b32 s0, 0x41200000
+; GFX6-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mad_f32 v2, v2, |v3|, s0
+; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: no_madak_src1_modifier_f32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT: flat_load_dword v5, v[0:1]
+; GFX8-NEXT: flat_load_dword v2, v[2:3]
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4
+; GFX8-NEXT: s_mov_b32 s0, 0x41200000
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mad_f32 v2, v5, |v2|, s0
+; GFX8-NEXT: flat_store_dword v[0:1], v2
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: no_madak_src1_modifier_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_mov_b32 s0, 0x41200000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mad_f32 v1, v1, |v2|, s0
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-MAD-LABEL: no_madak_src1_modifier_f32:
+; GFX10-MAD: ; %bb.0:
+; GFX10-MAD-NEXT: s_clause 0x1
+; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-MAD-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-MAD-NEXT: s_clause 0x1
+; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX10-MAD-NEXT: v_mad_f32 v1, v1, |v2|, 0x41200000
+; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX10-MAD-NEXT: s_endpgm
+;
+; GFX11-MAD-LABEL: no_madak_src1_modifier_f32:
+; GFX11-MAD: ; %bb.0:
+; GFX11-MAD-NEXT: s_clause 0x1
+; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-MAD-NEXT: s_clause 0x1
+; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX11-MAD-NEXT: v_mul_f32_e64 v1, v1, |v2|
+; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1
+; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-MAD-NEXT: s_nop 0
+; GFX11-MAD-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-MAD-NEXT: s_endpgm
+;
+; GFX940-FMA-LABEL: no_madak_src1_modifier_f32:
+; GFX940-FMA: ; %bb.0:
+; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX940-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX940-FMA-NEXT: s_mov_b32 s0, 0x41200000
+; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX940-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT: v_fma_f32 v1, v1, |v2|, s0
+; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1
+; GFX940-FMA-NEXT: s_endpgm
+;
+; GFX10-FMA-LABEL: no_madak_src1_modifier_f32:
+; GFX10-FMA: ; %bb.0:
+; GFX10-FMA-NEXT: s_clause 0x1
+; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-FMA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FMA-NEXT: s_clause 0x1
+; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX10-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FMA-NEXT: v_fma_f32 v1, v1, |v2|, 0x41200000
+; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX10-FMA-NEXT: s_endpgm
+;
+; GFX11-FMA-LABEL: no_madak_src1_modifier_f32:
+; GFX11-FMA: ; %bb.0:
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: s_clause 0x1
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: v_fma_f32 v1, v1, |v2|, 0x41200000
+; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-FMA-NEXT: s_nop 0
+; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, ptr addrspace(1) %in.a, i32 %tid
%in.b.gep = getelementptr float, ptr addrspace(1) %in.b, i32 %tid
@@ -255,22 +1309,193 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias %
; SIFoldOperands should not fold the SGPR copy into the instruction before GFX10
; because the implicit immediate already uses the constant bus.
; On GFX10+ we can use two scalar operands.
-; GCN-LABEL: {{^}}madak_constant_bus_violation:
-; GCN: {{buffer|flat|global}}_load_{{dword|b32}} [[VGPR:v[0-9]+]]
-; GCN: s_load_{{dword|b32}} [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}}
-; MAD: v_mov_b32_e32 [[MADAK:v[0-9]+]], 0x42280000
-; MAD: v_mac_f32_e64 [[MADAK]], [[SGPR0]], 0.5
-; GFX10-MAD: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
-; GFX10PLUS-FMA: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
-; GFX10-MAD: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
-; GFX10PLUS-FMA: v_fmaak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
-; GFX940-FMA: v_fmac_f32_e64 [[MADAK:v[0-9]+]], [[SGPR0]], 0.5
-; GFX11-MAD: v_mul_f32_e64 [[VMUL:v[0-9]+]], [[SGPR0]], 0.5
-; GFX11-MAD: v_add_f32_e32 [[MADAK:v[0-9]+]], 0x42280000, [[VMUL]]
-; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]]
-; GFX6: buffer_store_dword [[MUL]]
-; GFX8PLUS: {{flat|global}}_store_{{dword|b32}} v[{{[0-9:]+}}], [[MUL]]
define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], float %sgpr0, float %sgpr1) #0 {
+; GFX6-LABEL: madak_constant_bus_violation:
+; GFX6: ; %bb.0: ; %bb
+; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_cmp_lg_u32 s2, 0
+; GFX6-NEXT: s_cbranch_scc1 .LBB9_2
+; GFX6-NEXT: ; %bb.1: ; %bb3
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: v_mov_b32_e32 v0, 0
+; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: .LBB9_2: ; %bb4
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: s_load_dword s0, s[0:1], 0x12
+; GFX6-NEXT: v_mov_b32_e32 v1, 0x42280000
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: v_mac_f32_e64 v1, s0, 0.5
+; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: s_endpgm
+;
+; GFX8-LABEL: madak_constant_bus_violation:
+; GFX8: ; %bb.0: ; %bb
+; GFX8-NEXT: s_load_dword s2, s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_cmp_lg_u32 s2, 0
+; GFX8-NEXT: s_cbranch_scc1 .LBB9_2
+; GFX8-NEXT: ; %bb.1: ; %bb3
+; GFX8-NEXT: v_mov_b32_e32 v0, 0
+; GFX8-NEXT: flat_store_dword v[0:1], v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: .LBB9_2: ; %bb4
+; GFX8-NEXT: flat_load_dword v0, v[0:1] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_load_dword s0, s[0:1], 0x48
+; GFX8-NEXT: v_mov_b32_e32 v1, 0x42280000
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mac_f32_e64 v1, s0, 0.5
+; GFX8-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX8-NEXT: flat_store_dword v[0:1], v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: madak_constant_bus_violation:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_cmp_lg_u32 s2, 0
+; GFX9-NEXT: s_cbranch_scc1 .LBB9_2
+; GFX9-NEXT: ; %bb.1: ; %bb3
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: global_store_dword v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: .LBB9_2: ; %bb4
+; GFX9-NEXT: global_load_dword v0, v[0:1], off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_load_dword s0, s[0:1], 0x48
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x42280000
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mac_f32_e64 v1, s0, 0.5
+; GFX9-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX9-NEXT: global_store_dword v[0:1], v0, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-MAD-LABEL: madak_constant_bus_violation:
+; GFX10-MAD: ; %bb.0: ; %bb
+; GFX10-MAD-NEXT: s_load_dword s2, s[0:1], 0x24
+; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-MAD-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10-MAD-NEXT: s_cbranch_scc1 .LBB9_2
+; GFX10-MAD-NEXT: ; %bb.1: ; %bb3
+; GFX10-MAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-MAD-NEXT: global_store_dword v[0:1], v0, off
+; GFX10-MAD-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-MAD-NEXT: .LBB9_2: ; %bb4
+; GFX10-MAD-NEXT: global_load_dword v0, v[0:1], off glc dlc
+; GFX10-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX10-MAD-NEXT: s_load_dword s0, s[0:1], 0x48
+; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-MAD-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-MAD-NEXT: v_madak_f32 v1, 0.5, v1, 0x42280000
+; GFX10-MAD-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX10-MAD-NEXT: global_store_dword v[0:1], v0, off
+; GFX10-MAD-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-MAD-NEXT: s_endpgm
+;
+; GFX11-MAD-LABEL: madak_constant_bus_violation:
+; GFX11-MAD: ; %bb.0: ; %bb
+; GFX11-MAD-NEXT: s_load_b32 s2, s[0:1], 0x24
+; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-MAD-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-MAD-NEXT: s_cbranch_scc1 .LBB9_2
+; GFX11-MAD-NEXT: ; %bb.1: ; %bb3
+; GFX11-MAD-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-MAD-NEXT: global_store_b32 v[0:1], v0, off dlc
+; GFX11-MAD-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-MAD-NEXT: .LBB9_2: ; %bb4
+; GFX11-MAD-NEXT: global_load_b32 v0, v[0:1], off glc dlc
+; GFX11-MAD-NEXT: s_waitcnt vmcnt(0)
+; GFX11-MAD-NEXT: s_load_b32 s0, s[0:1], 0x48
+; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-MAD-NEXT: v_mul_f32_e64 v1, s0, 0.5
+; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x42280000, v1
+; GFX11-MAD-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX11-MAD-NEXT: global_store_b32 v[0:1], v0, off dlc
+; GFX11-MAD-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-MAD-NEXT: s_nop 0
+; GFX11-MAD-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-MAD-NEXT: s_endpgm
+;
+; GFX940-FMA-LABEL: madak_constant_bus_violation:
+; GFX940-FMA: ; %bb.0: ; %bb
+; GFX940-FMA-NEXT: s_load_dword s2, s[0:1], 0x24
+; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-FMA-NEXT: s_cmp_lg_u32 s2, 0
+; GFX940-FMA-NEXT: s_cbranch_scc1 .LBB9_2
+; GFX940-FMA-NEXT: ; %bb.1: ; %bb3
+; GFX940-FMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX940-FMA-NEXT: global_store_dword v[0:1], v0, off sc0 sc1
+; GFX940-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT: .LBB9_2: ; %bb4
+; GFX940-FMA-NEXT: global_load_dword v0, v[0:1], off sc0 sc1
+; GFX940-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT: s_load_dword s0, s[0:1], 0x48
+; GFX940-FMA-NEXT: v_mov_b32_e32 v1, 0x42280000
+; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-FMA-NEXT: v_fmac_f32_e64 v1, s0, 0.5
+; GFX940-FMA-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX940-FMA-NEXT: global_store_dword v[0:1], v0, off sc0 sc1
+; GFX940-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX940-FMA-NEXT: s_endpgm
+;
+; GFX10-FMA-LABEL: madak_constant_bus_violation:
+; GFX10-FMA: ; %bb.0: ; %bb
+; GFX10-FMA-NEXT: s_load_dword s2, s[0:1], 0x24
+; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FMA-NEXT: s_cmp_lg_u32 s2, 0
+; GFX10-FMA-NEXT: s_cbranch_scc1 .LBB9_2
+; GFX10-FMA-NEXT: ; %bb.1: ; %bb3
+; GFX10-FMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-FMA-NEXT: global_store_dword v[0:1], v0, off
+; GFX10-FMA-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-FMA-NEXT: .LBB9_2: ; %bb4
+; GFX10-FMA-NEXT: global_load_dword v0, v[0:1], off glc dlc
+; GFX10-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX10-FMA-NEXT: s_load_dword s0, s[0:1], 0x48
+; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-FMA-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-FMA-NEXT: v_fmaak_f32 v1, 0.5, v1, 0x42280000
+; GFX10-FMA-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX10-FMA-NEXT: global_store_dword v[0:1], v0, off
+; GFX10-FMA-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-FMA-NEXT: s_endpgm
+;
+; GFX11-FMA-LABEL: madak_constant_bus_violation:
+; GFX11-FMA: ; %bb.0: ; %bb
+; GFX11-FMA-NEXT: s_load_b32 s2, s[0:1], 0x24
+; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-FMA-NEXT: s_cbranch_scc1 .LBB9_2
+; GFX11-FMA-NEXT: ; %bb.1: ; %bb3
+; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FMA-NEXT: global_store_b32 v[0:1], v0, off dlc
+; GFX11-FMA-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FMA-NEXT: .LBB9_2: ; %bb4
+; GFX11-FMA-NEXT: global_load_b32 v0, v[0:1], off glc dlc
+; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FMA-NEXT: s_load_b32 s0, s[0:1], 0x48
+; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FMA-NEXT: v_fmaak_f32 v1, 0.5, v1, 0x42280000
+; GFX11-FMA-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX11-FMA-NEXT: global_store_b32 v[0:1], v0, off dlc
+; GFX11-FMA-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FMA-NEXT: s_nop 0
+; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-FMA-NEXT: s_endpgm
bb:
%tmp = icmp eq i32 %arg1, 0
br i1 %tmp, label %bb3, label %bb4
>From 00d577c0dd3a1bf65ba4815f920da158bfb38674 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Tue, 14 Nov 2023 14:52:41 +0000
Subject: [PATCH 2/2] [AMDGPU] Allow folding to FMAAK with SGPR and immediate
operand on GFX10+
Allow SIInstrInfo::FoldImmediate to create instructions like:
v_fmaak_f32 v0, s0, v0, 0x42000000
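This instruction has two "scalar values" that both use the constant bus:
the SGPR s0 and the immediate 0x42000000. On GFX10+ this is allowed,
because the constant bus limit is two. This fold was originally
implemented before the compiler supported GFX10, when all ASICs were
limited to one scalar value per instruction.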
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 68 ++++++++++---------
.../test/CodeGen/AMDGPU/code-size-estimate.ll | 12 ++--
llvm/test/CodeGen/AMDGPU/fma.f16.ll | 8 +--
llvm/test/CodeGen/AMDGPU/madak.ll | 24 +++----
4 files changed, 57 insertions(+), 55 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 027b695c3bb1a74..043dfa8ab50116f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3497,43 +3497,45 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
// Added part is the constant: Use v_madak_{f16, f32}.
if (Src2->isReg() && Src2->getReg() == Reg) {
- // Not allowed to use constant bus for another operand.
- // We can however allow an inline immediate as src0.
- bool Src0Inlined = false;
- if (Src0->isReg()) {
- // Try to inline constant if possible.
- // If the Def moves immediate and the use is single
- // We are saving VGPR here.
- MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
- if (Def && Def->isMoveImmediate() &&
- isInlineConstant(Def->getOperand(1)) &&
- MRI->hasOneUse(Src0->getReg())) {
- Src0->ChangeToImmediate(Def->getOperand(1).getImm());
- Src0Inlined = true;
- } else if ((Src0->getReg().isPhysical() &&
- (ST.getConstantBusLimit(Opc) <= 1 &&
- RI.isSGPRClass(RI.getPhysRegBaseClass(Src0->getReg())))) ||
- (Src0->getReg().isVirtual() &&
- (ST.getConstantBusLimit(Opc) <= 1 &&
- RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))))
- return false;
+ if (ST.getConstantBusLimit(Opc) < 2) {
+ // Not allowed to use constant bus for another operand.
+ // We can however allow an inline immediate as src0.
+ bool Src0Inlined = false;
+ if (Src0->isReg()) {
+ // Try to inline constant if possible.
+ // If the Def moves immediate and the use is single
+ // We are saving VGPR here.
+ MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
+ if (Def && Def->isMoveImmediate() &&
+ isInlineConstant(Def->getOperand(1)) &&
+ MRI->hasOneUse(Src0->getReg())) {
+ Src0->ChangeToImmediate(Def->getOperand(1).getImm());
+ Src0Inlined = true;
+ } else if ((Src0->getReg().isPhysical() &&
+ (ST.getConstantBusLimit(Opc) <= 1 &&
+ RI.isSGPRClass(
+ RI.getPhysRegBaseClass(Src0->getReg())))) ||
+ (Src0->getReg().isVirtual() &&
+ (ST.getConstantBusLimit(Opc) <= 1 &&
+ RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))))
+ return false;
// VGPR is okay as Src0 - fallthrough
- }
+ }
- if (Src1->isReg() && !Src0Inlined ) {
- // We have one slot for inlinable constant so far - try to fill it
- MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
- if (Def && Def->isMoveImmediate() &&
- isInlineConstant(Def->getOperand(1)) &&
- MRI->hasOneUse(Src1->getReg()) &&
- commuteInstruction(UseMI)) {
+ if (Src1->isReg() && !Src0Inlined) {
+ // We have one slot for inlinable constant so far - try to fill it
+ MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
+ if (Def && Def->isMoveImmediate() &&
+ isInlineConstant(Def->getOperand(1)) &&
+ MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI)) {
Src0->ChangeToImmediate(Def->getOperand(1).getImm());
- } else if ((Src1->getReg().isPhysical() &&
- RI.isSGPRClass(RI.getPhysRegBaseClass(Src1->getReg()))) ||
- (Src1->getReg().isVirtual() &&
- RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
- return false;
+ } else if ((Src1->getReg().isPhysical() &&
+ RI.isSGPRClass(RI.getPhysRegBaseClass(Src1->getReg()))) ||
+ (Src1->getReg().isVirtual() &&
+ RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
+ return false;
// VGPR is okay as Src1 - fallthrough
+ }
}
unsigned NewOpc =
diff --git a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
index 22823c8b6b0a8d5..b1222516786f1c3 100644
--- a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
@@ -323,15 +323,15 @@ define amdgpu_ps float @s_fmaak_f32(float inreg %x, float inreg %y) {
;
; GFX10-LABEL: s_fmaak_f32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_mov_b32_e32 v0, 0x43800000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x00,0x80,0x43]
-; GFX10-NEXT: v_fmac_f32_e64 v0, s0, s1 ; encoding: [0x00,0x00,0x2b,0xd5,0x00,0x02,0x00,0x00]
+; GFX10-NEXT: v_mov_b32_e32 v0, s1 ; encoding: [0x01,0x02,0x00,0x7e]
+; GFX10-NEXT: v_fmaak_f32 v0, s0, v0, 0x43800000 ; encoding: [0x00,0x00,0x00,0x5a,0x00,0x00,0x80,0x43]
; GFX10-NEXT: ; return to shader part epilog
;
; GFX1100-LABEL: s_fmaak_f32:
; GFX1100: ; %bb.0:
-; GFX1100-NEXT: v_mov_b32_e32 v0, 0x43800000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x00,0x80,0x43]
+; GFX1100-NEXT: v_mov_b32_e32 v0, s1 ; encoding: [0x01,0x02,0x00,0x7e]
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
-; GFX1100-NEXT: v_fmac_f32_e64 v0, s0, s1 ; encoding: [0x00,0x00,0x2b,0xd5,0x00,0x02,0x00,0x00]
+; GFX1100-NEXT: v_fmaak_f32 v0, s0, v0, 0x43800000 ; encoding: [0x00,0x00,0x00,0x5a,0x00,0x00,0x80,0x43]
; GFX1100-NEXT: ; return to shader part epilog
;
; GFX1150-LABEL: s_fmaak_f32:
@@ -345,6 +345,6 @@ define amdgpu_ps float @s_fmaak_f32(float inreg %x, float inreg %y) {
}
; GFX9: codeLenInByte = 20
-; GFX10: codeLenInByte = 16
-; GFX1100: codeLenInByte = 20
+; GFX10: codeLenInByte = 12
+; GFX1100: codeLenInByte = 16
; GFX1150: codeLenInByte = 16
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
index 4ed3abff0ad8515..7894f6bc6797d66 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
@@ -146,10 +146,10 @@ define i32 @test_D139469_f16(half %arg) {
; GFX10-SDAG-LABEL: test_D139469_f16:
; GFX10-SDAG: ; %bb.0: ; %bb
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0x211e
-; GFX10-SDAG-NEXT: v_mul_f16_e32 v2, 0x291e, v0
-; GFX10-SDAG-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
-; GFX10-SDAG-NEXT: v_min_f16_e32 v0, v2, v1
+; GFX10-SDAG-NEXT: s_movk_i32 s4, 0x291e
+; GFX10-SDAG-NEXT: v_mul_f16_e32 v1, 0x291e, v0
+; GFX10-SDAG-NEXT: v_fmaak_f16 v0, s4, v0, 0x211e
+; GFX10-SDAG-NEXT: v_min_f16_e32 v0, v1, v0
; GFX10-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll
index a43e23e4aeeada1..310328ddb941856 100644
--- a/llvm/test/CodeGen/AMDGPU/madak.ll
+++ b/llvm/test/CodeGen/AMDGPU/madak.ll
@@ -943,10 +943,10 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float
; GFX10-MAD-LABEL: s_s_madak_f32:
; GFX10-MAD: ; %bb.0:
; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-MAD-NEXT: v_mov_b32_e32 v0, 0x41200000
; GFX10-MAD-NEXT: v_mov_b32_e32 v1, 0
; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-MAD-NEXT: v_mac_f32_e64 v0, s2, s3
+; GFX10-MAD-NEXT: v_mov_b32_e32 v0, s3
+; GFX10-MAD-NEXT: v_madak_f32 v0, s2, v0, 0x41200000
; GFX10-MAD-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-MAD-NEXT: s_endpgm
;
@@ -976,20 +976,20 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float
; GFX10-FMA-LABEL: s_s_madak_f32:
; GFX10-FMA: ; %bb.0:
; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-FMA-NEXT: v_mov_b32_e32 v0, 0x41200000
; GFX10-FMA-NEXT: v_mov_b32_e32 v1, 0
; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FMA-NEXT: v_fmac_f32_e64 v0, s2, s3
+; GFX10-FMA-NEXT: v_mov_b32_e32 v0, s3
+; GFX10-FMA-NEXT: v_fmaak_f32 v0, s2, v0, 0x41200000
; GFX10-FMA-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-FMA-NEXT: s_endpgm
;
; GFX11-FMA-LABEL: s_s_madak_f32:
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-FMA-NEXT: v_dual_mov_b32 v0, 0x41200000 :: v_dual_mov_b32 v1, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FMA-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FMA-NEXT: v_fmac_f32_e64 v0, s2, s3
+; GFX11-FMA-NEXT: v_fmaak_f32 v0, s2, v0, 0x41200000
; GFX11-FMA-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-FMA-NEXT: s_nop 0
; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1395,9 +1395,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl
; GFX10-MAD-NEXT: global_load_dword v0, v[0:1], off glc dlc
; GFX10-MAD-NEXT: s_waitcnt vmcnt(0)
; GFX10-MAD-NEXT: s_load_dword s0, s[0:1], 0x48
+; GFX10-MAD-NEXT: v_mov_b32_e32 v1, 0.5
; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-MAD-NEXT: v_mov_b32_e32 v1, s0
-; GFX10-MAD-NEXT: v_madak_f32 v1, 0.5, v1, 0x42280000
+; GFX10-MAD-NEXT: v_madak_f32 v1, s0, v1, 0x42280000
; GFX10-MAD-NEXT: v_mul_f32_e32 v0, v1, v0
; GFX10-MAD-NEXT: global_store_dword v[0:1], v0, off
; GFX10-MAD-NEXT: s_waitcnt_vscnt null, 0x0
@@ -1464,9 +1464,9 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl
; GFX10-FMA-NEXT: global_load_dword v0, v[0:1], off glc dlc
; GFX10-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX10-FMA-NEXT: s_load_dword s0, s[0:1], 0x48
+; GFX10-FMA-NEXT: v_mov_b32_e32 v1, 0.5
; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FMA-NEXT: v_mov_b32_e32 v1, s0
-; GFX10-FMA-NEXT: v_fmaak_f32 v1, 0.5, v1, 0x42280000
+; GFX10-FMA-NEXT: v_fmaak_f32 v1, s0, v1, 0x42280000
; GFX10-FMA-NEXT: v_mul_f32_e32 v0, v1, v0
; GFX10-FMA-NEXT: global_store_dword v[0:1], v0, off
; GFX10-FMA-NEXT: s_waitcnt_vscnt null, 0x0
@@ -1486,10 +1486,10 @@ define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], fl
; GFX11-FMA-NEXT: global_load_b32 v0, v[0:1], off glc dlc
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: s_load_b32 s0, s[0:1], 0x48
+; GFX11-FMA-NEXT: v_mov_b32_e32 v1, 0.5
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FMA-NEXT: v_mov_b32_e32 v1, s0
; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FMA-NEXT: v_fmaak_f32 v1, 0.5, v1, 0x42280000
+; GFX11-FMA-NEXT: v_fmaak_f32 v1, s0, v1, 0x42280000
; GFX11-FMA-NEXT: v_mul_f32_e32 v0, v1, v0
; GFX11-FMA-NEXT: global_store_b32 v[0:1], v0, off dlc
; GFX11-FMA-NEXT: s_waitcnt_vscnt null, 0x0
More information about the llvm-commits
mailing list