[llvm] [AMDGPU] Provide control to force VGPR MFMA form (PR #148079)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 14 10:39:25 PDT 2025
https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/148079
>From 554f486933c61c617d06b5c0c2e12dd49f75116c Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Thu, 10 Jul 2025 16:08:13 -0700
Subject: [PATCH 1/5] [AMDGPU] Provide control over AGPR/VGPR MFMA form
Change-Id: Ife390264aef869c61a25d032f47301fab5554b4f
---
.../Target/AMDGPU/SIMachineFunctionInfo.cpp | 23 +-
.../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 1982 +++++++++++++++++
2 files changed, 2004 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 67ad28661da43..a6bd8db1c65f4 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -29,6 +29,24 @@ enum { MAX_LANES = 64 };
using namespace llvm;
+namespace {
+enum MFMARegClass {
+ Unspecified,
+ VGPR,
+ AGPR,
+};
+}
+
+static cl::opt<MFMARegClass>
+ MFMAForm("amdgpu-mfma-form", cl::Hidden,
+ cl::desc("Register class to use for Opc and Dest of MFMA. If "
+ "unspecified, default to compiler heuristics"),
+ cl::init(MFMARegClass::Unspecified),
+ cl::values(clEnumValN(MFMARegClass::VGPR, "vgpr",
+ "Use the VGPR MFMA form."),
+ clEnumValN(MFMARegClass::AGPR, "agpr",
+ "Use the AGPR MFMA form.")));
+
const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
const SITargetLowering *TLI = STI->getTargetLowering();
return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine());
@@ -70,11 +88,14 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
}
MayNeedAGPRs = ST.hasMAIInsts();
- if (ST.hasGFX90AInsts() &&
+ if (MFMAForm == MFMARegClass::Unspecified && ST.hasGFX90AInsts() &&
ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
!mayUseAGPRs(F))
MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
+ else if (MFMAForm != MFMARegClass::Unspecified)
+ MayNeedAGPRs = MFMAForm == MFMARegClass::AGPR;
+
if (AMDGPU::isChainCC(CC)) {
// Chain functions don't receive an SP from their caller, but are free to
// set one up. For now, we can use s32 to match what amdgpu_gfx functions
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 4628a9c15391b..98d2104f584b0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 --amdgpu-mfma-form=agpr < %s | FileCheck -enable-var-scope --check-prefixes=AGPR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 --amdgpu-mfma-form=vgpr < %s | FileCheck -enable-var-scope --check-prefixes=VGPR %s
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg)
@@ -25,6 +27,28 @@ define <4 x float> @test_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
; GCN-NEXT: v_accvgpr_read_b32 v3, a3
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; AGPR-LABEL: test_mfma_f32_16x16x32_f16:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; AGPR-NEXT: v_accvgpr_write_b32 a0, v8
+; AGPR-NEXT: v_accvgpr_write_b32 a1, v9
+; AGPR-NEXT: v_accvgpr_write_b32 a2, v10
+; AGPR-NEXT: v_accvgpr_write_b32 a3, v11
+; AGPR-NEXT: s_nop 1
+; AGPR-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
+; AGPR-NEXT: s_nop 7
+; AGPR-NEXT: v_accvgpr_read_b32 v0, a0
+; AGPR-NEXT: v_accvgpr_read_b32 v1, a1
+; AGPR-NEXT: v_accvgpr_read_b32 v2, a2
+; AGPR-NEXT: v_accvgpr_read_b32 v3, a3
+; AGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; VGPR-LABEL: test_mfma_f32_16x16x32_f16:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VGPR-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
ret <4 x float> %result
}
@@ -45,6 +69,28 @@ define <4 x float> @test_mfma_f32_16x16x32_f16__flags(<8 x half> %arg0, <8 x hal
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
; GCN-NEXT: v_accvgpr_read_b32 v3, a3
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; AGPR-LABEL: test_mfma_f32_16x16x32_f16__flags:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; AGPR-NEXT: v_accvgpr_write_b32 a0, v8
+; AGPR-NEXT: v_accvgpr_write_b32 a1, v9
+; AGPR-NEXT: v_accvgpr_write_b32 a2, v10
+; AGPR-NEXT: v_accvgpr_write_b32 a3, v11
+; AGPR-NEXT: s_nop 1
+; AGPR-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; AGPR-NEXT: s_nop 7
+; AGPR-NEXT: v_accvgpr_read_b32 v0, a0
+; AGPR-NEXT: v_accvgpr_read_b32 v1, a1
+; AGPR-NEXT: v_accvgpr_read_b32 v2, a2
+; AGPR-NEXT: v_accvgpr_read_b32 v3, a3
+; AGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; VGPR-LABEL: test_mfma_f32_16x16x32_f16__flags:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VGPR-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1
+; VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 1, i32 1, i32 1)
ret <4 x float> %result
}
@@ -91,6 +137,46 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
; GISEL-NEXT: s_nop 6
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
+;
+; AGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; AGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; AGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; AGPR-NEXT: v_mov_b32_e32 v8, 0
+; AGPR-NEXT: s_waitcnt lgkmcnt(0)
+; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; AGPR-NEXT: v_accvgpr_write_b32 a0, s0
+; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; AGPR-NEXT: v_accvgpr_write_b32 a1, s1
+; AGPR-NEXT: v_accvgpr_write_b32 a2, s2
+; AGPR-NEXT: v_accvgpr_write_b32 a3, s3
+; AGPR-NEXT: s_nop 1
+; AGPR-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
+; AGPR-NEXT: s_nop 7
+; AGPR-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; AGPR-NEXT: s_endpgm
+;
+; VGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; VGPR-NEXT: v_mov_b32_e32 v12, 0
+; VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; VGPR-NEXT: s_nop 1
+; VGPR-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPR-NEXT: s_nop 7
+; VGPR-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
+; VGPR-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
store <4 x float> %result, ptr addrspace(1) %out
ret void
@@ -138,6 +224,46 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
; GISEL-NEXT: s_nop 6
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
+;
+; AGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; AGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; AGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; AGPR-NEXT: v_mov_b32_e32 v8, 0
+; AGPR-NEXT: s_waitcnt lgkmcnt(0)
+; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; AGPR-NEXT: v_accvgpr_write_b32 a0, s0
+; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; AGPR-NEXT: v_accvgpr_write_b32 a1, s1
+; AGPR-NEXT: v_accvgpr_write_b32 a2, s2
+; AGPR-NEXT: v_accvgpr_write_b32 a3, s3
+; AGPR-NEXT: s_nop 1
+; AGPR-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; AGPR-NEXT: s_nop 7
+; AGPR-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; AGPR-NEXT: s_endpgm
+;
+; VGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; VGPR-NEXT: v_mov_b32_e32 v12, 0
+; VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; VGPR-NEXT: s_nop 1
+; VGPR-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; VGPR-NEXT: s_nop 7
+; VGPR-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
+; VGPR-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1)
store <4 x float> %result, ptr addrspace(1) %out
ret void
@@ -271,6 +397,133 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; GISEL-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
+;
+; AGPR-LABEL: test_mfma_f32_32x32x16_f16:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; AGPR-NEXT: v_mov_b64_e32 v[12:13], 48
+; AGPR-NEXT: v_mov_b64_e32 v[14:15], 32
+; AGPR-NEXT: v_mov_b64_e32 v[16:17], 16
+; AGPR-NEXT: s_waitcnt lgkmcnt(0)
+; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; AGPR-NEXT: v_accvgpr_write_b32 a0, s8
+; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; AGPR-NEXT: v_accvgpr_write_b32 a1, s9
+; AGPR-NEXT: v_accvgpr_write_b32 a2, s10
+; AGPR-NEXT: v_accvgpr_write_b32 a3, s11
+; AGPR-NEXT: v_accvgpr_write_b32 a4, s12
+; AGPR-NEXT: v_accvgpr_write_b32 a5, s13
+; AGPR-NEXT: v_accvgpr_write_b32 a6, s14
+; AGPR-NEXT: v_accvgpr_write_b32 a7, s15
+; AGPR-NEXT: v_accvgpr_write_b32 a8, s16
+; AGPR-NEXT: v_accvgpr_write_b32 a9, s17
+; AGPR-NEXT: v_accvgpr_write_b32 a10, s18
+; AGPR-NEXT: v_accvgpr_write_b32 a11, s19
+; AGPR-NEXT: v_accvgpr_write_b32 a12, s20
+; AGPR-NEXT: v_accvgpr_write_b32 a13, s21
+; AGPR-NEXT: v_accvgpr_write_b32 a14, s22
+; AGPR-NEXT: v_accvgpr_write_b32 a15, s23
+; AGPR-NEXT: v_mov_b64_e32 v[18:19], 0
+; AGPR-NEXT: v_mov_b32_e32 v8, s16
+; AGPR-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
+; AGPR-NEXT: v_mov_b32_e32 v0, s20
+; AGPR-NEXT: v_mov_b32_e32 v1, s21
+; AGPR-NEXT: v_mov_b32_e32 v2, s22
+; AGPR-NEXT: v_mov_b32_e32 v3, s23
+; AGPR-NEXT: v_mov_b32_e32 v9, s17
+; AGPR-NEXT: v_mov_b32_e32 v10, s18
+; AGPR-NEXT: v_mov_b32_e32 v11, s19
+; AGPR-NEXT: s_nop 4
+; AGPR-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b32_e32 v0, s8
+; AGPR-NEXT: v_mov_b32_e32 v1, s9
+; AGPR-NEXT: v_mov_b32_e32 v2, s10
+; AGPR-NEXT: v_mov_b32_e32 v3, s11
+; AGPR-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b32_e32 v0, s12
+; AGPR-NEXT: v_mov_b32_e32 v1, s13
+; AGPR-NEXT: v_mov_b32_e32 v2, s14
+; AGPR-NEXT: v_mov_b32_e32 v3, s15
+; AGPR-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_endpgm
+;
+; VGPR-LABEL: test_mfma_f32_32x32x16_f16:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPR-NEXT: v_mov_b64_e32 v[44:45], 48
+; VGPR-NEXT: v_mov_b64_e32 v[46:47], 32
+; VGPR-NEXT: v_mov_b64_e32 v[48:49], 16
+; VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; VGPR-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; VGPR-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; VGPR-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPR-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; VGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; VGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; VGPR-NEXT: v_mov_b64_e32 v[50:51], 0
+; VGPR-NEXT: v_mov_b32_e32 v40, s16
+; VGPR-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15]
+; VGPR-NEXT: v_mov_b32_e32 v41, s17
+; VGPR-NEXT: v_mov_b32_e32 v42, s18
+; VGPR-NEXT: v_mov_b32_e32 v43, s19
+; VGPR-NEXT: s_nop 7
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: global_store_dwordx4 v[44:45], v[28:31], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v[46:47], v[24:27], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v[48:49], v[20:23], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v[50:51], v[16:19], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v[46:47], v[40:43], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: v_mov_b32_e32 v0, s20
+; VGPR-NEXT: v_mov_b32_e32 v1, s21
+; VGPR-NEXT: v_mov_b32_e32 v2, s22
+; VGPR-NEXT: v_mov_b32_e32 v3, s23
+; VGPR-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b32_e32 v0, s8
+; VGPR-NEXT: v_mov_b32_e32 v1, s9
+; VGPR-NEXT: v_mov_b32_e32 v2, s10
+; VGPR-NEXT: v_mov_b32_e32 v3, s11
+; VGPR-NEXT: global_store_dwordx4 v[50:51], v[0:3], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b32_e32 v0, s12
+; VGPR-NEXT: v_mov_b32_e32 v1, s13
+; VGPR-NEXT: v_mov_b32_e32 v2, s14
+; VGPR-NEXT: v_mov_b32_e32 v3, s15
+; VGPR-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
store volatile <16 x float> %result, ptr addrspace(1) null
store volatile <16 x float> %arg2, ptr addrspace(1) null
@@ -401,6 +654,133 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; GISEL-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
+;
+; AGPR-LABEL: test_mfma_f32_32x32x16_f16__flags:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; AGPR-NEXT: v_mov_b64_e32 v[12:13], 48
+; AGPR-NEXT: v_mov_b64_e32 v[14:15], 32
+; AGPR-NEXT: v_mov_b64_e32 v[16:17], 16
+; AGPR-NEXT: s_waitcnt lgkmcnt(0)
+; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; AGPR-NEXT: v_accvgpr_write_b32 a0, s8
+; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; AGPR-NEXT: v_accvgpr_write_b32 a1, s9
+; AGPR-NEXT: v_accvgpr_write_b32 a2, s10
+; AGPR-NEXT: v_accvgpr_write_b32 a3, s11
+; AGPR-NEXT: v_accvgpr_write_b32 a4, s12
+; AGPR-NEXT: v_accvgpr_write_b32 a5, s13
+; AGPR-NEXT: v_accvgpr_write_b32 a6, s14
+; AGPR-NEXT: v_accvgpr_write_b32 a7, s15
+; AGPR-NEXT: v_accvgpr_write_b32 a8, s16
+; AGPR-NEXT: v_accvgpr_write_b32 a9, s17
+; AGPR-NEXT: v_accvgpr_write_b32 a10, s18
+; AGPR-NEXT: v_accvgpr_write_b32 a11, s19
+; AGPR-NEXT: v_accvgpr_write_b32 a12, s20
+; AGPR-NEXT: v_accvgpr_write_b32 a13, s21
+; AGPR-NEXT: v_accvgpr_write_b32 a14, s22
+; AGPR-NEXT: v_accvgpr_write_b32 a15, s23
+; AGPR-NEXT: v_mov_b64_e32 v[18:19], 0
+; AGPR-NEXT: v_mov_b32_e32 v8, s16
+; AGPR-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
+; AGPR-NEXT: v_mov_b32_e32 v0, s20
+; AGPR-NEXT: v_mov_b32_e32 v1, s21
+; AGPR-NEXT: v_mov_b32_e32 v2, s22
+; AGPR-NEXT: v_mov_b32_e32 v3, s23
+; AGPR-NEXT: v_mov_b32_e32 v9, s17
+; AGPR-NEXT: v_mov_b32_e32 v10, s18
+; AGPR-NEXT: v_mov_b32_e32 v11, s19
+; AGPR-NEXT: s_nop 4
+; AGPR-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b32_e32 v0, s8
+; AGPR-NEXT: v_mov_b32_e32 v1, s9
+; AGPR-NEXT: v_mov_b32_e32 v2, s10
+; AGPR-NEXT: v_mov_b32_e32 v3, s11
+; AGPR-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b32_e32 v0, s12
+; AGPR-NEXT: v_mov_b32_e32 v1, s13
+; AGPR-NEXT: v_mov_b32_e32 v2, s14
+; AGPR-NEXT: v_mov_b32_e32 v3, s15
+; AGPR-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_endpgm
+;
+; VGPR-LABEL: test_mfma_f32_32x32x16_f16__flags:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPR-NEXT: v_mov_b64_e32 v[44:45], 48
+; VGPR-NEXT: v_mov_b64_e32 v[46:47], 32
+; VGPR-NEXT: v_mov_b64_e32 v[48:49], 16
+; VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; VGPR-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; VGPR-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; VGPR-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPR-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; VGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; VGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; VGPR-NEXT: v_mov_b64_e32 v[50:51], 0
+; VGPR-NEXT: v_mov_b32_e32 v40, s16
+; VGPR-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1
+; VGPR-NEXT: v_mov_b32_e32 v41, s17
+; VGPR-NEXT: v_mov_b32_e32 v42, s18
+; VGPR-NEXT: v_mov_b32_e32 v43, s19
+; VGPR-NEXT: s_nop 7
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: global_store_dwordx4 v[44:45], v[28:31], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v[46:47], v[24:27], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v[48:49], v[20:23], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v[50:51], v[16:19], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v[46:47], v[40:43], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: v_mov_b32_e32 v0, s20
+; VGPR-NEXT: v_mov_b32_e32 v1, s21
+; VGPR-NEXT: v_mov_b32_e32 v2, s22
+; VGPR-NEXT: v_mov_b32_e32 v3, s23
+; VGPR-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b32_e32 v0, s8
+; VGPR-NEXT: v_mov_b32_e32 v1, s9
+; VGPR-NEXT: v_mov_b32_e32 v2, s10
+; VGPR-NEXT: v_mov_b32_e32 v3, s11
+; VGPR-NEXT: global_store_dwordx4 v[50:51], v[0:3], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b32_e32 v0, s12
+; VGPR-NEXT: v_mov_b32_e32 v1, s13
+; VGPR-NEXT: v_mov_b32_e32 v2, s14
+; VGPR-NEXT: v_mov_b32_e32 v3, s15
+; VGPR-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 2, i32 3, i32 1)
store volatile <16 x float> %result, ptr addrspace(1) null
store volatile <16 x float> %arg2, ptr addrspace(1) null
@@ -448,6 +828,71 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half
; GCN-NEXT: v_accvgpr_read_b32 v14, a14
; GCN-NEXT: v_accvgpr_read_b32 v15, a15
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; AGPR-LABEL: test_mfma_f32_32x32x16_f16__mac:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; AGPR-NEXT: v_accvgpr_write_b32 a0, v8
+; AGPR-NEXT: v_accvgpr_write_b32 a1, v9
+; AGPR-NEXT: v_accvgpr_write_b32 a2, v10
+; AGPR-NEXT: v_accvgpr_write_b32 a3, v11
+; AGPR-NEXT: v_accvgpr_write_b32 a4, v12
+; AGPR-NEXT: v_accvgpr_write_b32 a5, v13
+; AGPR-NEXT: v_accvgpr_write_b32 a6, v14
+; AGPR-NEXT: v_accvgpr_write_b32 a7, v15
+; AGPR-NEXT: v_accvgpr_write_b32 a8, v16
+; AGPR-NEXT: v_accvgpr_write_b32 a9, v17
+; AGPR-NEXT: v_accvgpr_write_b32 a10, v18
+; AGPR-NEXT: v_accvgpr_write_b32 a11, v19
+; AGPR-NEXT: v_accvgpr_write_b32 a12, v20
+; AGPR-NEXT: v_accvgpr_write_b32 a13, v21
+; AGPR-NEXT: v_accvgpr_write_b32 a14, v22
+; AGPR-NEXT: v_accvgpr_write_b32 a15, v23
+; AGPR-NEXT: s_nop 1
+; AGPR-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
+; AGPR-NEXT: s_nop 7
+; AGPR-NEXT: s_nop 3
+; AGPR-NEXT: v_accvgpr_read_b32 v0, a0
+; AGPR-NEXT: v_accvgpr_read_b32 v1, a1
+; AGPR-NEXT: v_accvgpr_read_b32 v2, a2
+; AGPR-NEXT: v_accvgpr_read_b32 v3, a3
+; AGPR-NEXT: v_accvgpr_read_b32 v4, a4
+; AGPR-NEXT: v_accvgpr_read_b32 v5, a5
+; AGPR-NEXT: v_accvgpr_read_b32 v6, a6
+; AGPR-NEXT: v_accvgpr_read_b32 v7, a7
+; AGPR-NEXT: v_accvgpr_read_b32 v8, a8
+; AGPR-NEXT: v_accvgpr_read_b32 v9, a9
+; AGPR-NEXT: v_accvgpr_read_b32 v10, a10
+; AGPR-NEXT: v_accvgpr_read_b32 v11, a11
+; AGPR-NEXT: v_accvgpr_read_b32 v12, a12
+; AGPR-NEXT: v_accvgpr_read_b32 v13, a13
+; AGPR-NEXT: v_accvgpr_read_b32 v14, a14
+; AGPR-NEXT: v_accvgpr_read_b32 v15, a15
+; AGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; VGPR-LABEL: test_mfma_f32_32x32x16_f16__mac:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VGPR-NEXT: v_mfma_f32_32x32x16_f16 v[8:23], v[0:3], v[4:7], v[8:23]
+; VGPR-NEXT: s_nop 7
+; VGPR-NEXT: s_nop 3
+; VGPR-NEXT: v_mov_b32_e32 v0, v8
+; VGPR-NEXT: v_mov_b32_e32 v1, v9
+; VGPR-NEXT: v_mov_b32_e32 v2, v10
+; VGPR-NEXT: v_mov_b32_e32 v3, v11
+; VGPR-NEXT: v_mov_b32_e32 v4, v12
+; VGPR-NEXT: v_mov_b32_e32 v5, v13
+; VGPR-NEXT: v_mov_b32_e32 v6, v14
+; VGPR-NEXT: v_mov_b32_e32 v7, v15
+; VGPR-NEXT: v_mov_b32_e32 v8, v16
+; VGPR-NEXT: v_mov_b32_e32 v9, v17
+; VGPR-NEXT: v_mov_b32_e32 v10, v18
+; VGPR-NEXT: v_mov_b32_e32 v11, v19
+; VGPR-NEXT: v_mov_b32_e32 v12, v20
+; VGPR-NEXT: v_mov_b32_e32 v13, v21
+; VGPR-NEXT: v_mov_b32_e32 v14, v22
+; VGPR-NEXT: v_mov_b32_e32 v15, v23
+; VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
@@ -493,6 +938,71 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8
; GCN-NEXT: v_accvgpr_read_b32 v14, a14
; GCN-NEXT: v_accvgpr_read_b32 v15, a15
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; AGPR-LABEL: test_mfma_f32_32x32x16_f16__mac__flags:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; AGPR-NEXT: v_accvgpr_write_b32 a0, v8
+; AGPR-NEXT: v_accvgpr_write_b32 a1, v9
+; AGPR-NEXT: v_accvgpr_write_b32 a2, v10
+; AGPR-NEXT: v_accvgpr_write_b32 a3, v11
+; AGPR-NEXT: v_accvgpr_write_b32 a4, v12
+; AGPR-NEXT: v_accvgpr_write_b32 a5, v13
+; AGPR-NEXT: v_accvgpr_write_b32 a6, v14
+; AGPR-NEXT: v_accvgpr_write_b32 a7, v15
+; AGPR-NEXT: v_accvgpr_write_b32 a8, v16
+; AGPR-NEXT: v_accvgpr_write_b32 a9, v17
+; AGPR-NEXT: v_accvgpr_write_b32 a10, v18
+; AGPR-NEXT: v_accvgpr_write_b32 a11, v19
+; AGPR-NEXT: v_accvgpr_write_b32 a12, v20
+; AGPR-NEXT: v_accvgpr_write_b32 a13, v21
+; AGPR-NEXT: v_accvgpr_write_b32 a14, v22
+; AGPR-NEXT: v_accvgpr_write_b32 a15, v23
+; AGPR-NEXT: s_nop 1
+; AGPR-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
+; AGPR-NEXT: s_nop 7
+; AGPR-NEXT: s_nop 3
+; AGPR-NEXT: v_accvgpr_read_b32 v0, a0
+; AGPR-NEXT: v_accvgpr_read_b32 v1, a1
+; AGPR-NEXT: v_accvgpr_read_b32 v2, a2
+; AGPR-NEXT: v_accvgpr_read_b32 v3, a3
+; AGPR-NEXT: v_accvgpr_read_b32 v4, a4
+; AGPR-NEXT: v_accvgpr_read_b32 v5, a5
+; AGPR-NEXT: v_accvgpr_read_b32 v6, a6
+; AGPR-NEXT: v_accvgpr_read_b32 v7, a7
+; AGPR-NEXT: v_accvgpr_read_b32 v8, a8
+; AGPR-NEXT: v_accvgpr_read_b32 v9, a9
+; AGPR-NEXT: v_accvgpr_read_b32 v10, a10
+; AGPR-NEXT: v_accvgpr_read_b32 v11, a11
+; AGPR-NEXT: v_accvgpr_read_b32 v12, a12
+; AGPR-NEXT: v_accvgpr_read_b32 v13, a13
+; AGPR-NEXT: v_accvgpr_read_b32 v14, a14
+; AGPR-NEXT: v_accvgpr_read_b32 v15, a15
+; AGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; VGPR-LABEL: test_mfma_f32_32x32x16_f16__mac__flags:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VGPR-NEXT: v_mfma_f32_32x32x16_f16 v[8:23], v[0:3], v[4:7], v[8:23] cbsz:1 abid:1 blgp:1
+; VGPR-NEXT: s_nop 7
+; VGPR-NEXT: s_nop 3
+; VGPR-NEXT: v_mov_b32_e32 v0, v8
+; VGPR-NEXT: v_mov_b32_e32 v1, v9
+; VGPR-NEXT: v_mov_b32_e32 v2, v10
+; VGPR-NEXT: v_mov_b32_e32 v3, v11
+; VGPR-NEXT: v_mov_b32_e32 v4, v12
+; VGPR-NEXT: v_mov_b32_e32 v5, v13
+; VGPR-NEXT: v_mov_b32_e32 v6, v14
+; VGPR-NEXT: v_mov_b32_e32 v7, v15
+; VGPR-NEXT: v_mov_b32_e32 v8, v16
+; VGPR-NEXT: v_mov_b32_e32 v9, v17
+; VGPR-NEXT: v_mov_b32_e32 v10, v18
+; VGPR-NEXT: v_mov_b32_e32 v11, v19
+; VGPR-NEXT: v_mov_b32_e32 v12, v20
+; VGPR-NEXT: v_mov_b32_e32 v13, v21
+; VGPR-NEXT: v_mov_b32_e32 v14, v22
+; VGPR-NEXT: v_mov_b32_e32 v15, v23
+; VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 1, i32 1, i32 1)
ret <16 x float> %result
}
@@ -615,6 +1125,127 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
+;
+; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; AGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; AGPR-NEXT: v_mov_b32_e32 v12, 0
+; AGPR-NEXT: s_waitcnt lgkmcnt(0)
+; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; AGPR-NEXT: v_accvgpr_write_b32 a31, s23
+; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; AGPR-NEXT: v_accvgpr_write_b32 a30, s22
+; AGPR-NEXT: v_accvgpr_write_b32 a29, s21
+; AGPR-NEXT: v_accvgpr_write_b32 a28, s20
+; AGPR-NEXT: v_accvgpr_write_b32 a27, s19
+; AGPR-NEXT: v_accvgpr_write_b32 a26, s18
+; AGPR-NEXT: v_accvgpr_write_b32 a25, s17
+; AGPR-NEXT: v_accvgpr_write_b32 a24, s16
+; AGPR-NEXT: v_accvgpr_write_b32 a23, s15
+; AGPR-NEXT: v_accvgpr_write_b32 a22, s14
+; AGPR-NEXT: v_accvgpr_write_b32 a21, s13
+; AGPR-NEXT: v_accvgpr_write_b32 a20, s12
+; AGPR-NEXT: v_accvgpr_write_b32 a19, s11
+; AGPR-NEXT: v_accvgpr_write_b32 a18, s10
+; AGPR-NEXT: v_accvgpr_write_b32 a17, s9
+; AGPR-NEXT: v_accvgpr_write_b32 a16, s8
+; AGPR-NEXT: v_mov_b32_e32 v8, s20
+; AGPR-NEXT: v_mov_b32_e32 v9, s21
+; AGPR-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31]
+; AGPR-NEXT: v_mov_b32_e32 v10, s22
+; AGPR-NEXT: v_mov_b32_e32 v11, s23
+; AGPR-NEXT: v_mov_b32_e32 v0, s16
+; AGPR-NEXT: v_mov_b32_e32 v1, s17
+; AGPR-NEXT: v_mov_b32_e32 v2, s18
+; AGPR-NEXT: v_mov_b32_e32 v3, s19
+; AGPR-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b32_e32 v0, s12
+; AGPR-NEXT: v_mov_b32_e32 v1, s13
+; AGPR-NEXT: v_mov_b32_e32 v2, s14
+; AGPR-NEXT: v_mov_b32_e32 v3, s15
+; AGPR-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b32_e32 v0, s8
+; AGPR-NEXT: v_mov_b32_e32 v1, s9
+; AGPR-NEXT: v_mov_b32_e32 v2, s10
+; AGPR-NEXT: v_mov_b32_e32 v3, s11
+; AGPR-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_endpgm
+;
+; VGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; VGPR-NEXT: v_mov_b32_e32 v44, 0
+; VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; VGPR-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; VGPR-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; VGPR-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; VGPR-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; VGPR-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; VGPR-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; VGPR-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; VGPR-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; VGPR-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; VGPR-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; VGPR-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; VGPR-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; VGPR-NEXT: v_mov_b32_e32 v40, s20
+; VGPR-NEXT: v_mov_b32_e32 v41, s21
+; VGPR-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
+; VGPR-NEXT: v_mov_b32_e32 v42, s22
+; VGPR-NEXT: v_mov_b32_e32 v43, s23
+; VGPR-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 2
+; VGPR-NEXT: v_mov_b32_e32 v16, s16
+; VGPR-NEXT: v_mov_b32_e32 v17, s17
+; VGPR-NEXT: v_mov_b32_e32 v18, s18
+; VGPR-NEXT: v_mov_b32_e32 v19, s19
+; VGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b32_e32 v16, s12
+; VGPR-NEXT: v_mov_b32_e32 v17, s13
+; VGPR-NEXT: v_mov_b32_e32 v18, s14
+; VGPR-NEXT: v_mov_b32_e32 v19, s15
+; VGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b32_e32 v16, s8
+; VGPR-NEXT: v_mov_b32_e32 v17, s9
+; VGPR-NEXT: v_mov_b32_e32 v18, s10
+; VGPR-NEXT: v_mov_b32_e32 v19, s11
+; VGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
store volatile <16 x float> %arg2, ptr addrspace(1) %out
store volatile <16 x float> %result, ptr addrspace(1) %out
@@ -739,6 +1370,127 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
+;
+; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd__flags:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; AGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; AGPR-NEXT: v_mov_b32_e32 v12, 0
+; AGPR-NEXT: s_waitcnt lgkmcnt(0)
+; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; AGPR-NEXT: v_accvgpr_write_b32 a31, s23
+; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; AGPR-NEXT: v_accvgpr_write_b32 a30, s22
+; AGPR-NEXT: v_accvgpr_write_b32 a29, s21
+; AGPR-NEXT: v_accvgpr_write_b32 a28, s20
+; AGPR-NEXT: v_accvgpr_write_b32 a27, s19
+; AGPR-NEXT: v_accvgpr_write_b32 a26, s18
+; AGPR-NEXT: v_accvgpr_write_b32 a25, s17
+; AGPR-NEXT: v_accvgpr_write_b32 a24, s16
+; AGPR-NEXT: v_accvgpr_write_b32 a23, s15
+; AGPR-NEXT: v_accvgpr_write_b32 a22, s14
+; AGPR-NEXT: v_accvgpr_write_b32 a21, s13
+; AGPR-NEXT: v_accvgpr_write_b32 a20, s12
+; AGPR-NEXT: v_accvgpr_write_b32 a19, s11
+; AGPR-NEXT: v_accvgpr_write_b32 a18, s10
+; AGPR-NEXT: v_accvgpr_write_b32 a17, s9
+; AGPR-NEXT: v_accvgpr_write_b32 a16, s8
+; AGPR-NEXT: v_mov_b32_e32 v8, s20
+; AGPR-NEXT: v_mov_b32_e32 v9, s21
+; AGPR-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
+; AGPR-NEXT: v_mov_b32_e32 v10, s22
+; AGPR-NEXT: v_mov_b32_e32 v11, s23
+; AGPR-NEXT: v_mov_b32_e32 v0, s16
+; AGPR-NEXT: v_mov_b32_e32 v1, s17
+; AGPR-NEXT: v_mov_b32_e32 v2, s18
+; AGPR-NEXT: v_mov_b32_e32 v3, s19
+; AGPR-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b32_e32 v0, s12
+; AGPR-NEXT: v_mov_b32_e32 v1, s13
+; AGPR-NEXT: v_mov_b32_e32 v2, s14
+; AGPR-NEXT: v_mov_b32_e32 v3, s15
+; AGPR-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b32_e32 v0, s8
+; AGPR-NEXT: v_mov_b32_e32 v1, s9
+; AGPR-NEXT: v_mov_b32_e32 v2, s10
+; AGPR-NEXT: v_mov_b32_e32 v3, s11
+; AGPR-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_endpgm
+;
+; VGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd__flags:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; VGPR-NEXT: v_mov_b32_e32 v44, 0
+; VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; VGPR-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; VGPR-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; VGPR-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; VGPR-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; VGPR-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; VGPR-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; VGPR-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; VGPR-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; VGPR-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; VGPR-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; VGPR-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; VGPR-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; VGPR-NEXT: v_mov_b32_e32 v40, s20
+; VGPR-NEXT: v_mov_b32_e32 v41, s21
+; VGPR-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; VGPR-NEXT: v_mov_b32_e32 v42, s22
+; VGPR-NEXT: v_mov_b32_e32 v43, s23
+; VGPR-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 2
+; VGPR-NEXT: v_mov_b32_e32 v16, s16
+; VGPR-NEXT: v_mov_b32_e32 v17, s17
+; VGPR-NEXT: v_mov_b32_e32 v18, s18
+; VGPR-NEXT: v_mov_b32_e32 v19, s19
+; VGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b32_e32 v16, s12
+; VGPR-NEXT: v_mov_b32_e32 v17, s13
+; VGPR-NEXT: v_mov_b32_e32 v18, s14
+; VGPR-NEXT: v_mov_b32_e32 v19, s15
+; VGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b32_e32 v16, s8
+; VGPR-NEXT: v_mov_b32_e32 v17, s9
+; VGPR-NEXT: v_mov_b32_e32 v18, s10
+; VGPR-NEXT: v_mov_b32_e32 v19, s11
+; VGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 1, i32 2, i32 3)
store volatile <16 x float> %arg2, ptr addrspace(1) %out
store volatile <16 x float> %result, ptr addrspace(1) %out
@@ -819,6 +1571,72 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
+;
+; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; AGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; AGPR-NEXT: s_waitcnt lgkmcnt(0)
+; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; AGPR-NEXT: v_accvgpr_write_b32 a0, s8
+; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; AGPR-NEXT: v_accvgpr_write_b32 a1, s9
+; AGPR-NEXT: v_accvgpr_write_b32 a2, s10
+; AGPR-NEXT: v_accvgpr_write_b32 a3, s11
+; AGPR-NEXT: v_accvgpr_write_b32 a4, s12
+; AGPR-NEXT: v_accvgpr_write_b32 a5, s13
+; AGPR-NEXT: v_accvgpr_write_b32 a6, s14
+; AGPR-NEXT: v_accvgpr_write_b32 a7, s15
+; AGPR-NEXT: v_accvgpr_write_b32 a8, s16
+; AGPR-NEXT: v_accvgpr_write_b32 a9, s17
+; AGPR-NEXT: v_accvgpr_write_b32 a10, s18
+; AGPR-NEXT: v_accvgpr_write_b32 a11, s19
+; AGPR-NEXT: v_accvgpr_write_b32 a12, s20
+; AGPR-NEXT: v_accvgpr_write_b32 a13, s21
+; AGPR-NEXT: v_accvgpr_write_b32 a14, s22
+; AGPR-NEXT: v_accvgpr_write_b32 a15, s23
+; AGPR-NEXT: s_nop 1
+; AGPR-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
+; AGPR-NEXT: v_mov_b32_e32 v0, 0
+; AGPR-NEXT: s_nop 7
+; AGPR-NEXT: s_nop 2
+; AGPR-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; AGPR-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; AGPR-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; AGPR-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; AGPR-NEXT: s_endpgm
+;
+; VGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; VGPR-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; VGPR-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; VGPR-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPR-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; VGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; VGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; VGPR-NEXT: s_nop 1
+; VGPR-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
+; VGPR-NEXT: v_mov_b32_e32 v16, 0
+; VGPR-NEXT: s_nop 7
+; VGPR-NEXT: s_nop 2
+; VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; VGPR-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
store <16 x float> %result, ptr addrspace(1) %out
ret void
@@ -898,6 +1716,72 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
+;
+; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; AGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; AGPR-NEXT: s_waitcnt lgkmcnt(0)
+; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; AGPR-NEXT: v_accvgpr_write_b32 a0, s8
+; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; AGPR-NEXT: v_accvgpr_write_b32 a1, s9
+; AGPR-NEXT: v_accvgpr_write_b32 a2, s10
+; AGPR-NEXT: v_accvgpr_write_b32 a3, s11
+; AGPR-NEXT: v_accvgpr_write_b32 a4, s12
+; AGPR-NEXT: v_accvgpr_write_b32 a5, s13
+; AGPR-NEXT: v_accvgpr_write_b32 a6, s14
+; AGPR-NEXT: v_accvgpr_write_b32 a7, s15
+; AGPR-NEXT: v_accvgpr_write_b32 a8, s16
+; AGPR-NEXT: v_accvgpr_write_b32 a9, s17
+; AGPR-NEXT: v_accvgpr_write_b32 a10, s18
+; AGPR-NEXT: v_accvgpr_write_b32 a11, s19
+; AGPR-NEXT: v_accvgpr_write_b32 a12, s20
+; AGPR-NEXT: v_accvgpr_write_b32 a13, s21
+; AGPR-NEXT: v_accvgpr_write_b32 a14, s22
+; AGPR-NEXT: v_accvgpr_write_b32 a15, s23
+; AGPR-NEXT: s_nop 1
+; AGPR-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
+; AGPR-NEXT: v_mov_b32_e32 v0, 0
+; AGPR-NEXT: s_nop 7
+; AGPR-NEXT: s_nop 2
+; AGPR-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; AGPR-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; AGPR-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; AGPR-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; AGPR-NEXT: s_endpgm
+;
+; VGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; VGPR-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; VGPR-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; VGPR-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPR-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; VGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; VGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; VGPR-NEXT: s_nop 1
+; VGPR-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; VGPR-NEXT: v_mov_b32_e32 v16, 0
+; VGPR-NEXT: s_nop 7
+; VGPR-NEXT: s_nop 2
+; VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; VGPR-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 3, i32 2, i32 1)
store <16 x float> %result, ptr addrspace(1) %out
ret void
@@ -925,6 +1809,28 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
; GCN-NEXT: v_accvgpr_read_b32 v3, a3
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; AGPR-LABEL: test_mfma_i32_16x16x64_i8:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; AGPR-NEXT: v_accvgpr_write_b32 a0, v8
+; AGPR-NEXT: v_accvgpr_write_b32 a1, v9
+; AGPR-NEXT: v_accvgpr_write_b32 a2, v10
+; AGPR-NEXT: v_accvgpr_write_b32 a3, v11
+; AGPR-NEXT: s_nop 1
+; AGPR-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
+; AGPR-NEXT: s_nop 7
+; AGPR-NEXT: v_accvgpr_read_b32 v0, a0
+; AGPR-NEXT: v_accvgpr_read_b32 v1, a1
+; AGPR-NEXT: v_accvgpr_read_b32 v2, a2
+; AGPR-NEXT: v_accvgpr_read_b32 v3, a3
+; AGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; VGPR-LABEL: test_mfma_i32_16x16x64_i8:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VGPR-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 0, i32 0, i32 0)
ret <4 x i32> %result
}
@@ -945,6 +1851,28 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8__flags(<4 x i32> %arg0, <4 x i32> %a
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
; GCN-NEXT: v_accvgpr_read_b32 v3, a3
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; AGPR-LABEL: test_mfma_i32_16x16x64_i8__flags:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; AGPR-NEXT: v_accvgpr_write_b32 a0, v8
+; AGPR-NEXT: v_accvgpr_write_b32 a1, v9
+; AGPR-NEXT: v_accvgpr_write_b32 a2, v10
+; AGPR-NEXT: v_accvgpr_write_b32 a3, v11
+; AGPR-NEXT: s_nop 1
+; AGPR-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; AGPR-NEXT: s_nop 7
+; AGPR-NEXT: v_accvgpr_read_b32 v0, a0
+; AGPR-NEXT: v_accvgpr_read_b32 v1, a1
+; AGPR-NEXT: v_accvgpr_read_b32 v2, a2
+; AGPR-NEXT: v_accvgpr_read_b32 v3, a3
+; AGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; VGPR-LABEL: test_mfma_i32_16x16x64_i8__flags:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VGPR-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1
+; VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 1, i32 1, i32 1)
ret <4 x i32> %result
}
@@ -995,6 +1923,56 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; GISEL-NEXT: s_nop 6
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
+;
+; AGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; AGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; AGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; AGPR-NEXT: v_mov_b32_e32 v8, 0
+; AGPR-NEXT: s_waitcnt lgkmcnt(0)
+; AGPR-NEXT: v_mov_b32_e32 v0, s8
+; AGPR-NEXT: v_mov_b32_e32 v1, s9
+; AGPR-NEXT: v_mov_b32_e32 v2, s10
+; AGPR-NEXT: v_mov_b32_e32 v3, s11
+; AGPR-NEXT: v_mov_b32_e32 v4, s12
+; AGPR-NEXT: v_mov_b32_e32 v5, s13
+; AGPR-NEXT: v_mov_b32_e32 v6, s14
+; AGPR-NEXT: v_mov_b32_e32 v7, s15
+; AGPR-NEXT: v_accvgpr_write_b32 a0, s0
+; AGPR-NEXT: v_accvgpr_write_b32 a1, s1
+; AGPR-NEXT: v_accvgpr_write_b32 a2, s2
+; AGPR-NEXT: v_accvgpr_write_b32 a3, s3
+; AGPR-NEXT: s_nop 1
+; AGPR-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
+; AGPR-NEXT: s_nop 7
+; AGPR-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; AGPR-NEXT: s_endpgm
+;
+; VGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; VGPR-NEXT: v_mov_b32_e32 v12, 0
+; VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; VGPR-NEXT: v_mov_b32_e32 v0, s8
+; VGPR-NEXT: v_mov_b32_e32 v1, s9
+; VGPR-NEXT: v_mov_b32_e32 v2, s10
+; VGPR-NEXT: v_mov_b32_e32 v3, s11
+; VGPR-NEXT: v_mov_b32_e32 v4, s12
+; VGPR-NEXT: v_mov_b32_e32 v5, s13
+; VGPR-NEXT: v_mov_b32_e32 v6, s14
+; VGPR-NEXT: v_mov_b32_e32 v7, s15
+; VGPR-NEXT: v_mov_b32_e32 v8, s0
+; VGPR-NEXT: v_mov_b32_e32 v9, s1
+; VGPR-NEXT: v_mov_b32_e32 v10, s2
+; VGPR-NEXT: v_mov_b32_e32 v11, s3
+; VGPR-NEXT: s_nop 1
+; VGPR-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPR-NEXT: s_nop 7
+; VGPR-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
+; VGPR-NEXT: s_endpgm
%result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 0, i32 0, i32 0)
store <4 x i32> %result, ptr addrspace(1) %out
ret void
@@ -1046,6 +2024,56 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; GISEL-NEXT: s_nop 6
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
+;
+; AGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; AGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; AGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; AGPR-NEXT: v_mov_b32_e32 v8, 0
+; AGPR-NEXT: s_waitcnt lgkmcnt(0)
+; AGPR-NEXT: v_mov_b32_e32 v0, s8
+; AGPR-NEXT: v_mov_b32_e32 v1, s9
+; AGPR-NEXT: v_mov_b32_e32 v2, s10
+; AGPR-NEXT: v_mov_b32_e32 v3, s11
+; AGPR-NEXT: v_mov_b32_e32 v4, s12
+; AGPR-NEXT: v_mov_b32_e32 v5, s13
+; AGPR-NEXT: v_mov_b32_e32 v6, s14
+; AGPR-NEXT: v_mov_b32_e32 v7, s15
+; AGPR-NEXT: v_accvgpr_write_b32 a0, s0
+; AGPR-NEXT: v_accvgpr_write_b32 a1, s1
+; AGPR-NEXT: v_accvgpr_write_b32 a2, s2
+; AGPR-NEXT: v_accvgpr_write_b32 a3, s3
+; AGPR-NEXT: s_nop 1
+; AGPR-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; AGPR-NEXT: s_nop 7
+; AGPR-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; AGPR-NEXT: s_endpgm
+;
+; VGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; VGPR-NEXT: v_mov_b32_e32 v12, 0
+; VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; VGPR-NEXT: v_mov_b32_e32 v0, s8
+; VGPR-NEXT: v_mov_b32_e32 v1, s9
+; VGPR-NEXT: v_mov_b32_e32 v2, s10
+; VGPR-NEXT: v_mov_b32_e32 v3, s11
+; VGPR-NEXT: v_mov_b32_e32 v4, s12
+; VGPR-NEXT: v_mov_b32_e32 v5, s13
+; VGPR-NEXT: v_mov_b32_e32 v6, s14
+; VGPR-NEXT: v_mov_b32_e32 v7, s15
+; VGPR-NEXT: v_mov_b32_e32 v8, s0
+; VGPR-NEXT: v_mov_b32_e32 v9, s1
+; VGPR-NEXT: v_mov_b32_e32 v10, s2
+; VGPR-NEXT: v_mov_b32_e32 v11, s3
+; VGPR-NEXT: s_nop 1
+; VGPR-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; VGPR-NEXT: s_nop 7
+; VGPR-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
+; VGPR-NEXT: s_endpgm
%result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 3, i32 2, i32 1)
store <4 x i32> %result, ptr addrspace(1) %out
ret void
@@ -1187,6 +2215,145 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; GISEL-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
+;
+; AGPR-LABEL: test_mfma_i32_32x32x32_i8:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; AGPR-NEXT: v_mov_b64_e32 v[8:9], 48
+; AGPR-NEXT: v_mov_b64_e32 v[10:11], 32
+; AGPR-NEXT: v_mov_b64_e32 v[12:13], 16
+; AGPR-NEXT: s_waitcnt lgkmcnt(0)
+; AGPR-NEXT: v_mov_b32_e32 v0, s24
+; AGPR-NEXT: v_mov_b32_e32 v1, s25
+; AGPR-NEXT: v_mov_b32_e32 v2, s26
+; AGPR-NEXT: v_mov_b32_e32 v3, s27
+; AGPR-NEXT: v_accvgpr_write_b32 a0, s8
+; AGPR-NEXT: v_mov_b32_e32 v4, s28
+; AGPR-NEXT: v_mov_b32_e32 v5, s29
+; AGPR-NEXT: v_mov_b32_e32 v6, s30
+; AGPR-NEXT: v_mov_b32_e32 v7, s31
+; AGPR-NEXT: v_accvgpr_write_b32 a1, s9
+; AGPR-NEXT: v_accvgpr_write_b32 a2, s10
+; AGPR-NEXT: v_accvgpr_write_b32 a3, s11
+; AGPR-NEXT: v_accvgpr_write_b32 a4, s12
+; AGPR-NEXT: v_accvgpr_write_b32 a5, s13
+; AGPR-NEXT: v_accvgpr_write_b32 a6, s14
+; AGPR-NEXT: v_accvgpr_write_b32 a7, s15
+; AGPR-NEXT: v_accvgpr_write_b32 a8, s16
+; AGPR-NEXT: v_accvgpr_write_b32 a9, s17
+; AGPR-NEXT: v_accvgpr_write_b32 a10, s18
+; AGPR-NEXT: v_accvgpr_write_b32 a11, s19
+; AGPR-NEXT: v_accvgpr_write_b32 a12, s20
+; AGPR-NEXT: v_accvgpr_write_b32 a13, s21
+; AGPR-NEXT: v_accvgpr_write_b32 a14, s22
+; AGPR-NEXT: v_accvgpr_write_b32 a15, s23
+; AGPR-NEXT: v_mov_b64_e32 v[14:15], 0
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15]
+; AGPR-NEXT: v_mov_b32_e32 v0, s16
+; AGPR-NEXT: v_mov_b32_e32 v1, s17
+; AGPR-NEXT: v_mov_b32_e32 v2, s18
+; AGPR-NEXT: v_mov_b32_e32 v3, s19
+; AGPR-NEXT: s_nop 7
+; AGPR-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b32_e32 v0, s20
+; AGPR-NEXT: v_mov_b32_e32 v1, s21
+; AGPR-NEXT: v_mov_b32_e32 v2, s22
+; AGPR-NEXT: v_mov_b32_e32 v3, s23
+; AGPR-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b32_e32 v0, s8
+; AGPR-NEXT: v_mov_b32_e32 v1, s9
+; AGPR-NEXT: v_mov_b32_e32 v2, s10
+; AGPR-NEXT: v_mov_b32_e32 v3, s11
+; AGPR-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b32_e32 v0, s12
+; AGPR-NEXT: v_mov_b32_e32 v1, s13
+; AGPR-NEXT: v_mov_b32_e32 v2, s14
+; AGPR-NEXT: v_mov_b32_e32 v3, s15
+; AGPR-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_endpgm
+;
+; VGPR-LABEL: test_mfma_i32_32x32x32_i8:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPR-NEXT: v_mov_b64_e32 v[40:41], 48
+; VGPR-NEXT: v_mov_b64_e32 v[42:43], 32
+; VGPR-NEXT: v_mov_b64_e32 v[44:45], 16
+; VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; VGPR-NEXT: v_mov_b32_e32 v32, s24
+; VGPR-NEXT: v_mov_b32_e32 v33, s25
+; VGPR-NEXT: v_mov_b32_e32 v34, s26
+; VGPR-NEXT: v_mov_b32_e32 v35, s27
+; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPR-NEXT: v_mov_b32_e32 v36, s28
+; VGPR-NEXT: v_mov_b32_e32 v37, s29
+; VGPR-NEXT: v_mov_b32_e32 v38, s30
+; VGPR-NEXT: v_mov_b32_e32 v39, s31
+; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; VGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; VGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; VGPR-NEXT: v_mov_b64_e32 v[46:47], 0
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15]
+; VGPR-NEXT: s_nop 7
+; VGPR-NEXT: s_nop 3
+; VGPR-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: v_mov_b32_e32 v0, s16
+; VGPR-NEXT: v_mov_b32_e32 v1, s17
+; VGPR-NEXT: v_mov_b32_e32 v2, s18
+; VGPR-NEXT: v_mov_b32_e32 v3, s19
+; VGPR-NEXT: global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b32_e32 v0, s20
+; VGPR-NEXT: v_mov_b32_e32 v1, s21
+; VGPR-NEXT: v_mov_b32_e32 v2, s22
+; VGPR-NEXT: v_mov_b32_e32 v3, s23
+; VGPR-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b32_e32 v0, s8
+; VGPR-NEXT: v_mov_b32_e32 v1, s9
+; VGPR-NEXT: v_mov_b32_e32 v2, s10
+; VGPR-NEXT: v_mov_b32_e32 v3, s11
+; VGPR-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b32_e32 v0, s12
+; VGPR-NEXT: v_mov_b32_e32 v1, s13
+; VGPR-NEXT: v_mov_b32_e32 v2, s14
+; VGPR-NEXT: v_mov_b32_e32 v3, s15
+; VGPR-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_endpgm
%result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 0, i32 0, i32 0)
store volatile <16 x i32> %result, ptr addrspace(1) null
store volatile <16 x i32> %arg2, ptr addrspace(1) null
@@ -1323,6 +2490,145 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; GISEL-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
+;
+; AGPR-LABEL: test_mfma_i32_32x32x32_i8__flags:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; AGPR-NEXT: v_mov_b64_e32 v[8:9], 48
+; AGPR-NEXT: v_mov_b64_e32 v[10:11], 32
+; AGPR-NEXT: v_mov_b64_e32 v[12:13], 16
+; AGPR-NEXT: s_waitcnt lgkmcnt(0)
+; AGPR-NEXT: v_mov_b32_e32 v0, s24
+; AGPR-NEXT: v_mov_b32_e32 v1, s25
+; AGPR-NEXT: v_mov_b32_e32 v2, s26
+; AGPR-NEXT: v_mov_b32_e32 v3, s27
+; AGPR-NEXT: v_accvgpr_write_b32 a0, s8
+; AGPR-NEXT: v_mov_b32_e32 v4, s28
+; AGPR-NEXT: v_mov_b32_e32 v5, s29
+; AGPR-NEXT: v_mov_b32_e32 v6, s30
+; AGPR-NEXT: v_mov_b32_e32 v7, s31
+; AGPR-NEXT: v_accvgpr_write_b32 a1, s9
+; AGPR-NEXT: v_accvgpr_write_b32 a2, s10
+; AGPR-NEXT: v_accvgpr_write_b32 a3, s11
+; AGPR-NEXT: v_accvgpr_write_b32 a4, s12
+; AGPR-NEXT: v_accvgpr_write_b32 a5, s13
+; AGPR-NEXT: v_accvgpr_write_b32 a6, s14
+; AGPR-NEXT: v_accvgpr_write_b32 a7, s15
+; AGPR-NEXT: v_accvgpr_write_b32 a8, s16
+; AGPR-NEXT: v_accvgpr_write_b32 a9, s17
+; AGPR-NEXT: v_accvgpr_write_b32 a10, s18
+; AGPR-NEXT: v_accvgpr_write_b32 a11, s19
+; AGPR-NEXT: v_accvgpr_write_b32 a12, s20
+; AGPR-NEXT: v_accvgpr_write_b32 a13, s21
+; AGPR-NEXT: v_accvgpr_write_b32 a14, s22
+; AGPR-NEXT: v_accvgpr_write_b32 a15, s23
+; AGPR-NEXT: v_mov_b64_e32 v[14:15], 0
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
+; AGPR-NEXT: v_mov_b32_e32 v0, s16
+; AGPR-NEXT: v_mov_b32_e32 v1, s17
+; AGPR-NEXT: v_mov_b32_e32 v2, s18
+; AGPR-NEXT: v_mov_b32_e32 v3, s19
+; AGPR-NEXT: s_nop 7
+; AGPR-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b32_e32 v0, s20
+; AGPR-NEXT: v_mov_b32_e32 v1, s21
+; AGPR-NEXT: v_mov_b32_e32 v2, s22
+; AGPR-NEXT: v_mov_b32_e32 v3, s23
+; AGPR-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b32_e32 v0, s8
+; AGPR-NEXT: v_mov_b32_e32 v1, s9
+; AGPR-NEXT: v_mov_b32_e32 v2, s10
+; AGPR-NEXT: v_mov_b32_e32 v3, s11
+; AGPR-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b32_e32 v0, s12
+; AGPR-NEXT: v_mov_b32_e32 v1, s13
+; AGPR-NEXT: v_mov_b32_e32 v2, s14
+; AGPR-NEXT: v_mov_b32_e32 v3, s15
+; AGPR-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_endpgm
+;
+; VGPR-LABEL: test_mfma_i32_32x32x32_i8__flags:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPR-NEXT: v_mov_b64_e32 v[40:41], 48
+; VGPR-NEXT: v_mov_b64_e32 v[42:43], 32
+; VGPR-NEXT: v_mov_b64_e32 v[44:45], 16
+; VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; VGPR-NEXT: v_mov_b32_e32 v32, s24
+; VGPR-NEXT: v_mov_b32_e32 v33, s25
+; VGPR-NEXT: v_mov_b32_e32 v34, s26
+; VGPR-NEXT: v_mov_b32_e32 v35, s27
+; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPR-NEXT: v_mov_b32_e32 v36, s28
+; VGPR-NEXT: v_mov_b32_e32 v37, s29
+; VGPR-NEXT: v_mov_b32_e32 v38, s30
+; VGPR-NEXT: v_mov_b32_e32 v39, s31
+; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; VGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; VGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; VGPR-NEXT: v_mov_b64_e32 v[46:47], 0
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1
+; VGPR-NEXT: s_nop 7
+; VGPR-NEXT: s_nop 3
+; VGPR-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: v_mov_b32_e32 v0, s16
+; VGPR-NEXT: v_mov_b32_e32 v1, s17
+; VGPR-NEXT: v_mov_b32_e32 v2, s18
+; VGPR-NEXT: v_mov_b32_e32 v3, s19
+; VGPR-NEXT: global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b32_e32 v0, s20
+; VGPR-NEXT: v_mov_b32_e32 v1, s21
+; VGPR-NEXT: v_mov_b32_e32 v2, s22
+; VGPR-NEXT: v_mov_b32_e32 v3, s23
+; VGPR-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b32_e32 v0, s8
+; VGPR-NEXT: v_mov_b32_e32 v1, s9
+; VGPR-NEXT: v_mov_b32_e32 v2, s10
+; VGPR-NEXT: v_mov_b32_e32 v3, s11
+; VGPR-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b32_e32 v0, s12
+; VGPR-NEXT: v_mov_b32_e32 v1, s13
+; VGPR-NEXT: v_mov_b32_e32 v2, s14
+; VGPR-NEXT: v_mov_b32_e32 v3, s15
+; VGPR-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_endpgm
%result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 2, i32 3, i32 1)
store volatile <16 x i32> %result, ptr addrspace(1) null
store volatile <16 x i32> %arg2, ptr addrspace(1) null
@@ -1370,6 +2676,71 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar
; GCN-NEXT: v_accvgpr_read_b32 v14, a14
; GCN-NEXT: v_accvgpr_read_b32 v15, a15
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; AGPR-LABEL: test_mfma_i32_32x32x32_i8__mac:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; AGPR-NEXT: v_accvgpr_write_b32 a0, v8
+; AGPR-NEXT: v_accvgpr_write_b32 a1, v9
+; AGPR-NEXT: v_accvgpr_write_b32 a2, v10
+; AGPR-NEXT: v_accvgpr_write_b32 a3, v11
+; AGPR-NEXT: v_accvgpr_write_b32 a4, v12
+; AGPR-NEXT: v_accvgpr_write_b32 a5, v13
+; AGPR-NEXT: v_accvgpr_write_b32 a6, v14
+; AGPR-NEXT: v_accvgpr_write_b32 a7, v15
+; AGPR-NEXT: v_accvgpr_write_b32 a8, v16
+; AGPR-NEXT: v_accvgpr_write_b32 a9, v17
+; AGPR-NEXT: v_accvgpr_write_b32 a10, v18
+; AGPR-NEXT: v_accvgpr_write_b32 a11, v19
+; AGPR-NEXT: v_accvgpr_write_b32 a12, v20
+; AGPR-NEXT: v_accvgpr_write_b32 a13, v21
+; AGPR-NEXT: v_accvgpr_write_b32 a14, v22
+; AGPR-NEXT: v_accvgpr_write_b32 a15, v23
+; AGPR-NEXT: s_nop 1
+; AGPR-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
+; AGPR-NEXT: s_nop 7
+; AGPR-NEXT: s_nop 3
+; AGPR-NEXT: v_accvgpr_read_b32 v0, a0
+; AGPR-NEXT: v_accvgpr_read_b32 v1, a1
+; AGPR-NEXT: v_accvgpr_read_b32 v2, a2
+; AGPR-NEXT: v_accvgpr_read_b32 v3, a3
+; AGPR-NEXT: v_accvgpr_read_b32 v4, a4
+; AGPR-NEXT: v_accvgpr_read_b32 v5, a5
+; AGPR-NEXT: v_accvgpr_read_b32 v6, a6
+; AGPR-NEXT: v_accvgpr_read_b32 v7, a7
+; AGPR-NEXT: v_accvgpr_read_b32 v8, a8
+; AGPR-NEXT: v_accvgpr_read_b32 v9, a9
+; AGPR-NEXT: v_accvgpr_read_b32 v10, a10
+; AGPR-NEXT: v_accvgpr_read_b32 v11, a11
+; AGPR-NEXT: v_accvgpr_read_b32 v12, a12
+; AGPR-NEXT: v_accvgpr_read_b32 v13, a13
+; AGPR-NEXT: v_accvgpr_read_b32 v14, a14
+; AGPR-NEXT: v_accvgpr_read_b32 v15, a15
+; AGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; VGPR-LABEL: test_mfma_i32_32x32x32_i8__mac:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VGPR-NEXT: v_mfma_i32_32x32x32_i8 v[8:23], v[0:3], v[4:7], v[8:23]
+; VGPR-NEXT: s_nop 7
+; VGPR-NEXT: s_nop 3
+; VGPR-NEXT: v_mov_b32_e32 v0, v8
+; VGPR-NEXT: v_mov_b32_e32 v1, v9
+; VGPR-NEXT: v_mov_b32_e32 v2, v10
+; VGPR-NEXT: v_mov_b32_e32 v3, v11
+; VGPR-NEXT: v_mov_b32_e32 v4, v12
+; VGPR-NEXT: v_mov_b32_e32 v5, v13
+; VGPR-NEXT: v_mov_b32_e32 v6, v14
+; VGPR-NEXT: v_mov_b32_e32 v7, v15
+; VGPR-NEXT: v_mov_b32_e32 v8, v16
+; VGPR-NEXT: v_mov_b32_e32 v9, v17
+; VGPR-NEXT: v_mov_b32_e32 v10, v18
+; VGPR-NEXT: v_mov_b32_e32 v11, v19
+; VGPR-NEXT: v_mov_b32_e32 v12, v20
+; VGPR-NEXT: v_mov_b32_e32 v13, v21
+; VGPR-NEXT: v_mov_b32_e32 v14, v22
+; VGPR-NEXT: v_mov_b32_e32 v15, v23
+; VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 0, i32 0, i32 0)
ret <16 x i32> %result
}
@@ -1415,6 +2786,71 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i
; GCN-NEXT: v_accvgpr_read_b32 v14, a14
; GCN-NEXT: v_accvgpr_read_b32 v15, a15
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; AGPR-LABEL: test_mfma_i32_32x32x32_i8__mac__flags:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; AGPR-NEXT: v_accvgpr_write_b32 a0, v8
+; AGPR-NEXT: v_accvgpr_write_b32 a1, v9
+; AGPR-NEXT: v_accvgpr_write_b32 a2, v10
+; AGPR-NEXT: v_accvgpr_write_b32 a3, v11
+; AGPR-NEXT: v_accvgpr_write_b32 a4, v12
+; AGPR-NEXT: v_accvgpr_write_b32 a5, v13
+; AGPR-NEXT: v_accvgpr_write_b32 a6, v14
+; AGPR-NEXT: v_accvgpr_write_b32 a7, v15
+; AGPR-NEXT: v_accvgpr_write_b32 a8, v16
+; AGPR-NEXT: v_accvgpr_write_b32 a9, v17
+; AGPR-NEXT: v_accvgpr_write_b32 a10, v18
+; AGPR-NEXT: v_accvgpr_write_b32 a11, v19
+; AGPR-NEXT: v_accvgpr_write_b32 a12, v20
+; AGPR-NEXT: v_accvgpr_write_b32 a13, v21
+; AGPR-NEXT: v_accvgpr_write_b32 a14, v22
+; AGPR-NEXT: v_accvgpr_write_b32 a15, v23
+; AGPR-NEXT: s_nop 1
+; AGPR-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
+; AGPR-NEXT: s_nop 7
+; AGPR-NEXT: s_nop 3
+; AGPR-NEXT: v_accvgpr_read_b32 v0, a0
+; AGPR-NEXT: v_accvgpr_read_b32 v1, a1
+; AGPR-NEXT: v_accvgpr_read_b32 v2, a2
+; AGPR-NEXT: v_accvgpr_read_b32 v3, a3
+; AGPR-NEXT: v_accvgpr_read_b32 v4, a4
+; AGPR-NEXT: v_accvgpr_read_b32 v5, a5
+; AGPR-NEXT: v_accvgpr_read_b32 v6, a6
+; AGPR-NEXT: v_accvgpr_read_b32 v7, a7
+; AGPR-NEXT: v_accvgpr_read_b32 v8, a8
+; AGPR-NEXT: v_accvgpr_read_b32 v9, a9
+; AGPR-NEXT: v_accvgpr_read_b32 v10, a10
+; AGPR-NEXT: v_accvgpr_read_b32 v11, a11
+; AGPR-NEXT: v_accvgpr_read_b32 v12, a12
+; AGPR-NEXT: v_accvgpr_read_b32 v13, a13
+; AGPR-NEXT: v_accvgpr_read_b32 v14, a14
+; AGPR-NEXT: v_accvgpr_read_b32 v15, a15
+; AGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; VGPR-LABEL: test_mfma_i32_32x32x32_i8__mac__flags:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VGPR-NEXT: v_mfma_i32_32x32x32_i8 v[8:23], v[0:3], v[4:7], v[8:23] cbsz:1 abid:1 blgp:1
+; VGPR-NEXT: s_nop 7
+; VGPR-NEXT: s_nop 3
+; VGPR-NEXT: v_mov_b32_e32 v0, v8
+; VGPR-NEXT: v_mov_b32_e32 v1, v9
+; VGPR-NEXT: v_mov_b32_e32 v2, v10
+; VGPR-NEXT: v_mov_b32_e32 v3, v11
+; VGPR-NEXT: v_mov_b32_e32 v4, v12
+; VGPR-NEXT: v_mov_b32_e32 v5, v13
+; VGPR-NEXT: v_mov_b32_e32 v6, v14
+; VGPR-NEXT: v_mov_b32_e32 v7, v15
+; VGPR-NEXT: v_mov_b32_e32 v8, v16
+; VGPR-NEXT: v_mov_b32_e32 v9, v17
+; VGPR-NEXT: v_mov_b32_e32 v10, v18
+; VGPR-NEXT: v_mov_b32_e32 v11, v19
+; VGPR-NEXT: v_mov_b32_e32 v12, v20
+; VGPR-NEXT: v_mov_b32_e32 v13, v21
+; VGPR-NEXT: v_mov_b32_e32 v14, v22
+; VGPR-NEXT: v_mov_b32_e32 v15, v23
+; VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 1, i32 1, i32 1)
ret <16 x i32> %result
}
@@ -1544,6 +2980,141 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
+;
+; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
+; AGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; AGPR-NEXT: v_mov_b32_e32 v8, 0
+; AGPR-NEXT: s_waitcnt lgkmcnt(0)
+; AGPR-NEXT: v_mov_b32_e32 v0, s20
+; AGPR-NEXT: v_mov_b32_e32 v1, s21
+; AGPR-NEXT: v_mov_b32_e32 v2, s22
+; AGPR-NEXT: v_mov_b32_e32 v3, s23
+; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; AGPR-NEXT: v_mov_b32_e32 v4, s24
+; AGPR-NEXT: v_mov_b32_e32 v5, s25
+; AGPR-NEXT: v_mov_b32_e32 v6, s26
+; AGPR-NEXT: v_mov_b32_e32 v7, s27
+; AGPR-NEXT: s_waitcnt lgkmcnt(0)
+; AGPR-NEXT: v_accvgpr_write_b32 a31, s23
+; AGPR-NEXT: v_accvgpr_write_b32 a30, s22
+; AGPR-NEXT: v_accvgpr_write_b32 a29, s21
+; AGPR-NEXT: v_accvgpr_write_b32 a28, s20
+; AGPR-NEXT: v_accvgpr_write_b32 a27, s19
+; AGPR-NEXT: v_accvgpr_write_b32 a26, s18
+; AGPR-NEXT: v_accvgpr_write_b32 a25, s17
+; AGPR-NEXT: v_accvgpr_write_b32 a24, s16
+; AGPR-NEXT: v_accvgpr_write_b32 a23, s15
+; AGPR-NEXT: v_accvgpr_write_b32 a22, s14
+; AGPR-NEXT: v_accvgpr_write_b32 a21, s13
+; AGPR-NEXT: v_accvgpr_write_b32 a20, s12
+; AGPR-NEXT: v_accvgpr_write_b32 a19, s11
+; AGPR-NEXT: v_accvgpr_write_b32 a18, s10
+; AGPR-NEXT: v_accvgpr_write_b32 a17, s9
+; AGPR-NEXT: v_accvgpr_write_b32 a16, s8
+; AGPR-NEXT: s_nop 1
+; AGPR-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31]
+; AGPR-NEXT: v_mov_b32_e32 v0, s20
+; AGPR-NEXT: v_mov_b32_e32 v1, s21
+; AGPR-NEXT: v_mov_b32_e32 v2, s22
+; AGPR-NEXT: v_mov_b32_e32 v3, s23
+; AGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b32_e32 v0, s16
+; AGPR-NEXT: v_mov_b32_e32 v1, s17
+; AGPR-NEXT: v_mov_b32_e32 v2, s18
+; AGPR-NEXT: v_mov_b32_e32 v3, s19
+; AGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b32_e32 v0, s12
+; AGPR-NEXT: v_mov_b32_e32 v1, s13
+; AGPR-NEXT: v_mov_b32_e32 v2, s14
+; AGPR-NEXT: v_mov_b32_e32 v3, s15
+; AGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b32_e32 v0, s8
+; AGPR-NEXT: v_mov_b32_e32 v1, s9
+; AGPR-NEXT: v_mov_b32_e32 v2, s10
+; AGPR-NEXT: v_mov_b32_e32 v3, s11
+; AGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_endpgm
+;
+; VGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
+; VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; VGPR-NEXT: v_mov_b32_e32 v40, 0
+; VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; VGPR-NEXT: v_mov_b32_e32 v32, s20
+; VGPR-NEXT: v_mov_b32_e32 v33, s21
+; VGPR-NEXT: v_mov_b32_e32 v34, s22
+; VGPR-NEXT: v_mov_b32_e32 v35, s23
+; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPR-NEXT: v_mov_b32_e32 v36, s24
+; VGPR-NEXT: v_mov_b32_e32 v37, s25
+; VGPR-NEXT: v_mov_b32_e32 v38, s26
+; VGPR-NEXT: v_mov_b32_e32 v39, s27
+; VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; VGPR-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; VGPR-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; VGPR-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; VGPR-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; VGPR-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; VGPR-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; VGPR-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; VGPR-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; VGPR-NEXT: s_nop 1
+; VGPR-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
+; VGPR-NEXT: s_nop 6
+; VGPR-NEXT: v_mov_b32_e32 v16, s20
+; VGPR-NEXT: v_mov_b32_e32 v17, s21
+; VGPR-NEXT: v_mov_b32_e32 v18, s22
+; VGPR-NEXT: v_mov_b32_e32 v19, s23
+; VGPR-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b32_e32 v16, s16
+; VGPR-NEXT: v_mov_b32_e32 v17, s17
+; VGPR-NEXT: v_mov_b32_e32 v18, s18
+; VGPR-NEXT: v_mov_b32_e32 v19, s19
+; VGPR-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b32_e32 v16, s12
+; VGPR-NEXT: v_mov_b32_e32 v17, s13
+; VGPR-NEXT: v_mov_b32_e32 v18, s14
+; VGPR-NEXT: v_mov_b32_e32 v19, s15
+; VGPR-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b32_e32 v16, s8
+; VGPR-NEXT: v_mov_b32_e32 v17, s9
+; VGPR-NEXT: v_mov_b32_e32 v18, s10
+; VGPR-NEXT: v_mov_b32_e32 v19, s11
+; VGPR-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_endpgm
%result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 0, i32 0, i32 0)
store volatile <16 x i32> %arg2, ptr addrspace(1) %out
store volatile <16 x i32> %result, ptr addrspace(1) %out
@@ -1675,6 +3246,141 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
+;
+; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
+; AGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; AGPR-NEXT: v_mov_b32_e32 v8, 0
+; AGPR-NEXT: s_waitcnt lgkmcnt(0)
+; AGPR-NEXT: v_mov_b32_e32 v0, s20
+; AGPR-NEXT: v_mov_b32_e32 v1, s21
+; AGPR-NEXT: v_mov_b32_e32 v2, s22
+; AGPR-NEXT: v_mov_b32_e32 v3, s23
+; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; AGPR-NEXT: v_mov_b32_e32 v4, s24
+; AGPR-NEXT: v_mov_b32_e32 v5, s25
+; AGPR-NEXT: v_mov_b32_e32 v6, s26
+; AGPR-NEXT: v_mov_b32_e32 v7, s27
+; AGPR-NEXT: s_waitcnt lgkmcnt(0)
+; AGPR-NEXT: v_accvgpr_write_b32 a31, s23
+; AGPR-NEXT: v_accvgpr_write_b32 a30, s22
+; AGPR-NEXT: v_accvgpr_write_b32 a29, s21
+; AGPR-NEXT: v_accvgpr_write_b32 a28, s20
+; AGPR-NEXT: v_accvgpr_write_b32 a27, s19
+; AGPR-NEXT: v_accvgpr_write_b32 a26, s18
+; AGPR-NEXT: v_accvgpr_write_b32 a25, s17
+; AGPR-NEXT: v_accvgpr_write_b32 a24, s16
+; AGPR-NEXT: v_accvgpr_write_b32 a23, s15
+; AGPR-NEXT: v_accvgpr_write_b32 a22, s14
+; AGPR-NEXT: v_accvgpr_write_b32 a21, s13
+; AGPR-NEXT: v_accvgpr_write_b32 a20, s12
+; AGPR-NEXT: v_accvgpr_write_b32 a19, s11
+; AGPR-NEXT: v_accvgpr_write_b32 a18, s10
+; AGPR-NEXT: v_accvgpr_write_b32 a17, s9
+; AGPR-NEXT: v_accvgpr_write_b32 a16, s8
+; AGPR-NEXT: s_nop 1
+; AGPR-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
+; AGPR-NEXT: v_mov_b32_e32 v0, s20
+; AGPR-NEXT: v_mov_b32_e32 v1, s21
+; AGPR-NEXT: v_mov_b32_e32 v2, s22
+; AGPR-NEXT: v_mov_b32_e32 v3, s23
+; AGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b32_e32 v0, s16
+; AGPR-NEXT: v_mov_b32_e32 v1, s17
+; AGPR-NEXT: v_mov_b32_e32 v2, s18
+; AGPR-NEXT: v_mov_b32_e32 v3, s19
+; AGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b32_e32 v0, s12
+; AGPR-NEXT: v_mov_b32_e32 v1, s13
+; AGPR-NEXT: v_mov_b32_e32 v2, s14
+; AGPR-NEXT: v_mov_b32_e32 v3, s15
+; AGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b32_e32 v0, s8
+; AGPR-NEXT: v_mov_b32_e32 v1, s9
+; AGPR-NEXT: v_mov_b32_e32 v2, s10
+; AGPR-NEXT: v_mov_b32_e32 v3, s11
+; AGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_endpgm
+;
+; VGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
+; VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; VGPR-NEXT: v_mov_b32_e32 v40, 0
+; VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; VGPR-NEXT: v_mov_b32_e32 v32, s20
+; VGPR-NEXT: v_mov_b32_e32 v33, s21
+; VGPR-NEXT: v_mov_b32_e32 v34, s22
+; VGPR-NEXT: v_mov_b32_e32 v35, s23
+; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPR-NEXT: v_mov_b32_e32 v36, s24
+; VGPR-NEXT: v_mov_b32_e32 v37, s25
+; VGPR-NEXT: v_mov_b32_e32 v38, s26
+; VGPR-NEXT: v_mov_b32_e32 v39, s27
+; VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; VGPR-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; VGPR-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; VGPR-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; VGPR-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; VGPR-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; VGPR-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; VGPR-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; VGPR-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; VGPR-NEXT: s_nop 1
+; VGPR-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; VGPR-NEXT: s_nop 6
+; VGPR-NEXT: v_mov_b32_e32 v16, s20
+; VGPR-NEXT: v_mov_b32_e32 v17, s21
+; VGPR-NEXT: v_mov_b32_e32 v18, s22
+; VGPR-NEXT: v_mov_b32_e32 v19, s23
+; VGPR-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b32_e32 v16, s16
+; VGPR-NEXT: v_mov_b32_e32 v17, s17
+; VGPR-NEXT: v_mov_b32_e32 v18, s18
+; VGPR-NEXT: v_mov_b32_e32 v19, s19
+; VGPR-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b32_e32 v16, s12
+; VGPR-NEXT: v_mov_b32_e32 v17, s13
+; VGPR-NEXT: v_mov_b32_e32 v18, s14
+; VGPR-NEXT: v_mov_b32_e32 v19, s15
+; VGPR-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b32_e32 v16, s8
+; VGPR-NEXT: v_mov_b32_e32 v17, s9
+; VGPR-NEXT: v_mov_b32_e32 v18, s10
+; VGPR-NEXT: v_mov_b32_e32 v19, s11
+; VGPR-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_endpgm
%result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 1, i32 2, i32 3)
store volatile <16 x i32> %arg2, ptr addrspace(1) %out
store volatile <16 x i32> %result, ptr addrspace(1) %out
@@ -1760,6 +3466,82 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
+;
+; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
+; AGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; AGPR-NEXT: s_waitcnt lgkmcnt(0)
+; AGPR-NEXT: v_mov_b32_e32 v0, s20
+; AGPR-NEXT: v_mov_b32_e32 v1, s21
+; AGPR-NEXT: v_mov_b32_e32 v2, s22
+; AGPR-NEXT: v_mov_b32_e32 v3, s23
+; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; AGPR-NEXT: v_mov_b32_e32 v4, s24
+; AGPR-NEXT: v_mov_b32_e32 v5, s25
+; AGPR-NEXT: v_mov_b32_e32 v6, s26
+; AGPR-NEXT: v_mov_b32_e32 v7, s27
+; AGPR-NEXT: s_waitcnt lgkmcnt(0)
+; AGPR-NEXT: v_accvgpr_write_b32 a0, s8
+; AGPR-NEXT: v_accvgpr_write_b32 a1, s9
+; AGPR-NEXT: v_accvgpr_write_b32 a2, s10
+; AGPR-NEXT: v_accvgpr_write_b32 a3, s11
+; AGPR-NEXT: v_accvgpr_write_b32 a4, s12
+; AGPR-NEXT: v_accvgpr_write_b32 a5, s13
+; AGPR-NEXT: v_accvgpr_write_b32 a6, s14
+; AGPR-NEXT: v_accvgpr_write_b32 a7, s15
+; AGPR-NEXT: v_accvgpr_write_b32 a8, s16
+; AGPR-NEXT: v_accvgpr_write_b32 a9, s17
+; AGPR-NEXT: v_accvgpr_write_b32 a10, s18
+; AGPR-NEXT: v_accvgpr_write_b32 a11, s19
+; AGPR-NEXT: v_accvgpr_write_b32 a12, s20
+; AGPR-NEXT: v_accvgpr_write_b32 a13, s21
+; AGPR-NEXT: v_accvgpr_write_b32 a14, s22
+; AGPR-NEXT: v_accvgpr_write_b32 a15, s23
+; AGPR-NEXT: s_nop 1
+; AGPR-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
+; AGPR-NEXT: v_mov_b32_e32 v0, 0
+; AGPR-NEXT: s_nop 7
+; AGPR-NEXT: s_nop 2
+; AGPR-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; AGPR-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; AGPR-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; AGPR-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; AGPR-NEXT: s_endpgm
+;
+; VGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
+; VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; VGPR-NEXT: v_mov_b32_e32 v16, s20
+; VGPR-NEXT: v_mov_b32_e32 v17, s21
+; VGPR-NEXT: v_mov_b32_e32 v18, s22
+; VGPR-NEXT: v_mov_b32_e32 v19, s23
+; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPR-NEXT: v_mov_b32_e32 v20, s24
+; VGPR-NEXT: v_mov_b32_e32 v21, s25
+; VGPR-NEXT: v_mov_b32_e32 v22, s26
+; VGPR-NEXT: v_mov_b32_e32 v23, s27
+; VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; VGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; VGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; VGPR-NEXT: s_nop 1
+; VGPR-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
+; VGPR-NEXT: v_mov_b32_e32 v16, 0
+; VGPR-NEXT: s_nop 7
+; VGPR-NEXT: s_nop 2
+; VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; VGPR-NEXT: s_endpgm
%result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 0, i32 0, i32 0)
store <16 x i32> %result, ptr addrspace(1) %out
ret void
@@ -1844,6 +3626,82 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
+;
+; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
+; AGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; AGPR-NEXT: s_waitcnt lgkmcnt(0)
+; AGPR-NEXT: v_mov_b32_e32 v0, s20
+; AGPR-NEXT: v_mov_b32_e32 v1, s21
+; AGPR-NEXT: v_mov_b32_e32 v2, s22
+; AGPR-NEXT: v_mov_b32_e32 v3, s23
+; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; AGPR-NEXT: v_mov_b32_e32 v4, s24
+; AGPR-NEXT: v_mov_b32_e32 v5, s25
+; AGPR-NEXT: v_mov_b32_e32 v6, s26
+; AGPR-NEXT: v_mov_b32_e32 v7, s27
+; AGPR-NEXT: s_waitcnt lgkmcnt(0)
+; AGPR-NEXT: v_accvgpr_write_b32 a0, s8
+; AGPR-NEXT: v_accvgpr_write_b32 a1, s9
+; AGPR-NEXT: v_accvgpr_write_b32 a2, s10
+; AGPR-NEXT: v_accvgpr_write_b32 a3, s11
+; AGPR-NEXT: v_accvgpr_write_b32 a4, s12
+; AGPR-NEXT: v_accvgpr_write_b32 a5, s13
+; AGPR-NEXT: v_accvgpr_write_b32 a6, s14
+; AGPR-NEXT: v_accvgpr_write_b32 a7, s15
+; AGPR-NEXT: v_accvgpr_write_b32 a8, s16
+; AGPR-NEXT: v_accvgpr_write_b32 a9, s17
+; AGPR-NEXT: v_accvgpr_write_b32 a10, s18
+; AGPR-NEXT: v_accvgpr_write_b32 a11, s19
+; AGPR-NEXT: v_accvgpr_write_b32 a12, s20
+; AGPR-NEXT: v_accvgpr_write_b32 a13, s21
+; AGPR-NEXT: v_accvgpr_write_b32 a14, s22
+; AGPR-NEXT: v_accvgpr_write_b32 a15, s23
+; AGPR-NEXT: s_nop 1
+; AGPR-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
+; AGPR-NEXT: v_mov_b32_e32 v0, 0
+; AGPR-NEXT: s_nop 7
+; AGPR-NEXT: s_nop 2
+; AGPR-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; AGPR-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; AGPR-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; AGPR-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; AGPR-NEXT: s_endpgm
+;
+; VGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
+; VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; VGPR-NEXT: v_mov_b32_e32 v16, s20
+; VGPR-NEXT: v_mov_b32_e32 v17, s21
+; VGPR-NEXT: v_mov_b32_e32 v18, s22
+; VGPR-NEXT: v_mov_b32_e32 v19, s23
+; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPR-NEXT: v_mov_b32_e32 v20, s24
+; VGPR-NEXT: v_mov_b32_e32 v21, s25
+; VGPR-NEXT: v_mov_b32_e32 v22, s26
+; VGPR-NEXT: v_mov_b32_e32 v23, s27
+; VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; VGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; VGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; VGPR-NEXT: s_nop 1
+; VGPR-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; VGPR-NEXT: v_mov_b32_e32 v16, 0
+; VGPR-NEXT: s_nop 7
+; VGPR-NEXT: s_nop 2
+; VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; VGPR-NEXT: s_endpgm
%result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 3, i32 2, i32 1)
store <16 x i32> %result, ptr addrspace(1) %out
ret void
@@ -1871,6 +3729,28 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat>
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
; GCN-NEXT: v_accvgpr_read_b32 v3, a3
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; AGPR-LABEL: test_mfma_f32_16x16x32_bf16:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; AGPR-NEXT: v_accvgpr_write_b32 a0, v8
+; AGPR-NEXT: v_accvgpr_write_b32 a1, v9
+; AGPR-NEXT: v_accvgpr_write_b32 a2, v10
+; AGPR-NEXT: v_accvgpr_write_b32 a3, v11
+; AGPR-NEXT: s_nop 1
+; AGPR-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
+; AGPR-NEXT: s_nop 7
+; AGPR-NEXT: v_accvgpr_read_b32 v0, a0
+; AGPR-NEXT: v_accvgpr_read_b32 v1, a1
+; AGPR-NEXT: v_accvgpr_read_b32 v2, a2
+; AGPR-NEXT: v_accvgpr_read_b32 v3, a3
+; AGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; VGPR-LABEL: test_mfma_f32_16x16x32_bf16:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VGPR-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
ret <4 x float> %result
}
@@ -1891,6 +3771,28 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
; GCN-NEXT: v_accvgpr_read_b32 v3, a3
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; AGPR-LABEL: test_mfma_f32_16x16x32_bf16__flags:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; AGPR-NEXT: v_accvgpr_write_b32 a0, v8
+; AGPR-NEXT: v_accvgpr_write_b32 a1, v9
+; AGPR-NEXT: v_accvgpr_write_b32 a2, v10
+; AGPR-NEXT: v_accvgpr_write_b32 a3, v11
+; AGPR-NEXT: s_nop 1
+; AGPR-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; AGPR-NEXT: s_nop 7
+; AGPR-NEXT: v_accvgpr_read_b32 v0, a0
+; AGPR-NEXT: v_accvgpr_read_b32 v1, a1
+; AGPR-NEXT: v_accvgpr_read_b32 v2, a2
+; AGPR-NEXT: v_accvgpr_read_b32 v3, a3
+; AGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; VGPR-LABEL: test_mfma_f32_16x16x32_bf16__flags:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VGPR-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1
+; VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 1, i32 1, i32 1)
ret <4 x float> %result
}
@@ -1916,6 +3818,46 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
; GCN-NEXT: s_nop 7
; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
; GCN-NEXT: s_endpgm
+;
+; AGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; AGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; AGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; AGPR-NEXT: v_mov_b32_e32 v8, 0
+; AGPR-NEXT: s_waitcnt lgkmcnt(0)
+; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; AGPR-NEXT: v_accvgpr_write_b32 a0, s0
+; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; AGPR-NEXT: v_accvgpr_write_b32 a1, s1
+; AGPR-NEXT: v_accvgpr_write_b32 a2, s2
+; AGPR-NEXT: v_accvgpr_write_b32 a3, s3
+; AGPR-NEXT: s_nop 1
+; AGPR-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
+; AGPR-NEXT: s_nop 7
+; AGPR-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; AGPR-NEXT: s_endpgm
+;
+; VGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; VGPR-NEXT: v_mov_b32_e32 v12, 0
+; VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; VGPR-NEXT: s_nop 1
+; VGPR-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPR-NEXT: s_nop 7
+; VGPR-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
+; VGPR-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
store <4 x float> %result, ptr addrspace(1) %out
ret void
@@ -1942,6 +3884,46 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
; GCN-NEXT: s_nop 7
; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
; GCN-NEXT: s_endpgm
+;
+; AGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; AGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; AGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; AGPR-NEXT: v_mov_b32_e32 v8, 0
+; AGPR-NEXT: s_waitcnt lgkmcnt(0)
+; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; AGPR-NEXT: v_accvgpr_write_b32 a0, s0
+; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; AGPR-NEXT: v_accvgpr_write_b32 a1, s1
+; AGPR-NEXT: v_accvgpr_write_b32 a2, s2
+; AGPR-NEXT: v_accvgpr_write_b32 a3, s3
+; AGPR-NEXT: s_nop 1
+; AGPR-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; AGPR-NEXT: s_nop 7
+; AGPR-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; AGPR-NEXT: s_endpgm
+;
+; VGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; VGPR-NEXT: v_mov_b32_e32 v12, 0
+; VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; VGPR-NEXT: s_nop 1
+; VGPR-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; VGPR-NEXT: s_nop 7
+; VGPR-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
+; VGPR-NEXT: s_endpgm
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1)
store <4 x float> %result, ptr addrspace(1) %out
ret void
>From aec8ea7ecbcc5c88f2951f37a6c1054d1cc31310 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 11 Jul 2025 12:29:52 -0700
Subject: [PATCH 2/5] Use binary flag
Change-Id: Ic76c5f834352c2d0fc893332733cbb6f2382f2f7
---
.../Target/AMDGPU/SIMachineFunctionInfo.cpp | 27 +-
.../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 1984 ++++++++++++++++-
2 files changed, 1960 insertions(+), 51 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index a6bd8db1c65f4..52c25c64fd52b 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -29,23 +29,11 @@ enum { MAX_LANES = 64 };
using namespace llvm;
-namespace {
-enum MFMARegClass {
- Unspecified,
- VGPR,
- AGPR,
-};
-}
-
-cl::opt<MFMARegClass>
- MFMAForm("amdgpu-mfma-form", cl::Hidden,
- cl::desc("Register class to use for Opc and Dest of MFMA. If "
+cl::opt<bool>
+ MFMAVGPRForm("amdgpu-mfma-vgpr-form", cl::Hidden,
+ cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
"unspecified, default to compiler heuristics"),
- cl::init(MFMARegClass::Unspecified),
- cl::values(clEnumValN(MFMARegClass::VGPR, "vgpr",
- "Use the VGPR MFMA form."),
- clEnumValN(MFMARegClass::AGPR, "agpr",
- "Use the VGPR MFMA form.")));
+ cl::init(false));
const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
const SITargetLowering *TLI = STI->getTargetLowering();
@@ -87,15 +75,12 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
}
- MayNeedAGPRs = ST.hasMAIInsts();
- if (MFMAForm == MFMARegClass::Unspecified && ST.hasGFX90AInsts() &&
+ MayNeedAGPRs = ST.hasMAIInsts() & !MFMAVGPRForm;
+ if (!MFMAVGPRForm && ST.hasGFX90AInsts() &&
ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
!mayUseAGPRs(F))
MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
- else if (MFMAForm != MFMARegClass::Unspecified)
- MayNeedAGPRs = MFMAForm == MFMARegClass::AGPR;
-
if (AMDGPU::isChainCC(CC)) {
// Chain functions don't receive an SP from their caller, but are free to
// set one up. For now, we can use s32 to match what amdgpu_gfx functions
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 98d2104f584b0..866dba7746565 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 --amdgpu-mfma-form=agpr < %s | FileCheck -enable-var-scope --check-prefixes=AGPR %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 --amdgpu-mfma-form=vgpr < %s | FileCheck -enable-var-scope --check-prefixes=VGPR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 --amdgpu-mfma-vgpr-form=0 < %s | FileCheck -enable-var-scope --check-prefixes=HEURRC %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 --amdgpu-mfma-vgpr-form=1 < %s | FileCheck -enable-var-scope --check-prefixes=VGPRRC %s
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg)
@@ -28,6 +28,27 @@ define <4 x float> @test_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg
; GCN-NEXT: v_accvgpr_read_b32 v3, a3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
+; HEURRC-LABEL: test_mfma_f32_16x16x32_f16:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT: s_nop 1
+; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
+; HEURRC-NEXT: s_nop 7
+; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
+; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
+; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
+; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3
+; HEURRC-NEXT: s_setpc_b64 s[30:31]
+;
+; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT: s_setpc_b64 s[30:31]
; AGPR-LABEL: test_mfma_f32_16x16x32_f16:
; AGPR: ; %bb.0:
; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -43,7 +64,6 @@ define <4 x float> @test_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg
; AGPR-NEXT: v_accvgpr_read_b32 v2, a2
; AGPR-NEXT: v_accvgpr_read_b32 v3, a3
; AGPR-NEXT: s_setpc_b64 s[30:31]
-;
; VGPR-LABEL: test_mfma_f32_16x16x32_f16:
; VGPR: ; %bb.0:
; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -70,6 +90,27 @@ define <4 x float> @test_mfma_f32_16x16x32_f16__flags(<8 x half> %arg0, <8 x hal
; GCN-NEXT: v_accvgpr_read_b32 v3, a3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
+; HEURRC-LABEL: test_mfma_f32_16x16x32_f16__flags:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT: s_nop 1
+; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; HEURRC-NEXT: s_nop 7
+; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
+; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
+; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
+; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3
+; HEURRC-NEXT: s_setpc_b64 s[30:31]
+;
+; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16__flags:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1
+; VGPRRC-NEXT: s_setpc_b64 s[30:31]
; AGPR-LABEL: test_mfma_f32_16x16x32_f16__flags:
; AGPR: ; %bb.0:
; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -85,7 +126,6 @@ define <4 x float> @test_mfma_f32_16x16x32_f16__flags(<8 x half> %arg0, <8 x hal
; AGPR-NEXT: v_accvgpr_read_b32 v2, a2
; AGPR-NEXT: v_accvgpr_read_b32 v3, a3
; AGPR-NEXT: s_setpc_b64 s[30:31]
-;
; VGPR-LABEL: test_mfma_f32_16x16x32_f16__flags:
; VGPR: ; %bb.0:
; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -138,6 +178,45 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
;
+; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: s_nop 1
+; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
+; HEURRC-NEXT: s_nop 7
+; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: s_endpgm
+;
+; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; VGPRRC-NEXT: v_mov_b32_e32 v12, 0
+; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
+; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; VGPRRC-NEXT: s_nop 1
+; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT: s_nop 7
+; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
+; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
; AGPR: ; %bb.0:
; AGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
@@ -158,7 +237,6 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
; AGPR-NEXT: s_nop 7
; AGPR-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
; AGPR-NEXT: s_endpgm
-;
; VGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
; VGPR: ; %bb.0:
; VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
@@ -225,6 +303,45 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
;
+; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: s_nop 1
+; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: s_nop 7
+; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: s_endpgm
+;
+; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; VGPRRC-NEXT: v_mov_b32_e32 v12, 0
+; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
+; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; VGPRRC-NEXT: s_nop 1
+; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; VGPRRC-NEXT: s_nop 7
+; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
+; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
; AGPR: ; %bb.0:
; AGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
@@ -245,7 +362,6 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
; AGPR-NEXT: s_nop 7
; AGPR-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
; AGPR-NEXT: s_endpgm
-;
; VGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
; VGPR: ; %bb.0:
; VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
@@ -398,6 +514,132 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
+; HEURRC-LABEL: test_mfma_f32_32x32x16_f16:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 48
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 32
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], 16
+; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
+; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
+; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
+; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
+; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
+; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
+; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
+; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
+; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
+; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
+; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
+; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
+; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], 0
+; HEURRC-NEXT: v_mov_b32_e32 v8, s16
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
+; HEURRC-NEXT: v_mov_b32_e32 v0, s20
+; HEURRC-NEXT: v_mov_b32_e32 v1, s21
+; HEURRC-NEXT: v_mov_b32_e32 v2, s22
+; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: v_mov_b32_e32 v9, s17
+; HEURRC-NEXT: v_mov_b32_e32 v10, s18
+; HEURRC-NEXT: v_mov_b32_e32 v11, s19
+; HEURRC-NEXT: s_nop 4
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, s8
+; HEURRC-NEXT: v_mov_b32_e32 v1, s9
+; HEURRC-NEXT: v_mov_b32_e32 v2, s10
+; HEURRC-NEXT: v_mov_b32_e32 v3, s11
+; HEURRC-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, s12
+; HEURRC-NEXT: v_mov_b32_e32 v1, s13
+; HEURRC-NEXT: v_mov_b32_e32 v2, s14
+; HEURRC-NEXT: v_mov_b32_e32 v3, s15
+; HEURRC-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_endpgm
+;
+; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 48
+; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 32
+; VGPRRC-NEXT: v_mov_b64_e32 v[48:49], 16
+; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
+; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; VGPRRC-NEXT: v_mov_b64_e32 v[50:51], 0
+; VGPRRC-NEXT: v_mov_b32_e32 v40, s16
+; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15]
+; VGPRRC-NEXT: v_mov_b32_e32 v41, s17
+; VGPRRC-NEXT: v_mov_b32_e32 v42, s18
+; VGPRRC-NEXT: v_mov_b32_e32 v43, s19
+; VGPRRC-NEXT: s_nop 7
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v[48:49], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v[50:51], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[40:43], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
+; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
+; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
+; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
+; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
+; VGPRRC-NEXT: global_store_dwordx4 v[50:51], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
+; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
+; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
+; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
+; VGPRRC-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16:
; AGPR: ; %bb.0:
; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -464,7 +706,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; AGPR-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
; AGPR-NEXT: s_waitcnt vmcnt(0)
; AGPR-NEXT: s_endpgm
-;
; VGPR-LABEL: test_mfma_f32_32x32x16_f16:
; VGPR: ; %bb.0:
; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -655,6 +896,132 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
+; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__flags:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 48
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 32
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], 16
+; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
+; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
+; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
+; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
+; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
+; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
+; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
+; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
+; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
+; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
+; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
+; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
+; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], 0
+; HEURRC-NEXT: v_mov_b32_e32 v8, s16
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
+; HEURRC-NEXT: v_mov_b32_e32 v0, s20
+; HEURRC-NEXT: v_mov_b32_e32 v1, s21
+; HEURRC-NEXT: v_mov_b32_e32 v2, s22
+; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: v_mov_b32_e32 v9, s17
+; HEURRC-NEXT: v_mov_b32_e32 v10, s18
+; HEURRC-NEXT: v_mov_b32_e32 v11, s19
+; HEURRC-NEXT: s_nop 4
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, s8
+; HEURRC-NEXT: v_mov_b32_e32 v1, s9
+; HEURRC-NEXT: v_mov_b32_e32 v2, s10
+; HEURRC-NEXT: v_mov_b32_e32 v3, s11
+; HEURRC-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, s12
+; HEURRC-NEXT: v_mov_b32_e32 v1, s13
+; HEURRC-NEXT: v_mov_b32_e32 v2, s14
+; HEURRC-NEXT: v_mov_b32_e32 v3, s15
+; HEURRC-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_endpgm
+;
+; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__flags:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 48
+; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 32
+; VGPRRC-NEXT: v_mov_b64_e32 v[48:49], 16
+; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
+; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; VGPRRC-NEXT: v_mov_b64_e32 v[50:51], 0
+; VGPRRC-NEXT: v_mov_b32_e32 v40, s16
+; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1
+; VGPRRC-NEXT: v_mov_b32_e32 v41, s17
+; VGPRRC-NEXT: v_mov_b32_e32 v42, s18
+; VGPRRC-NEXT: v_mov_b32_e32 v43, s19
+; VGPRRC-NEXT: s_nop 7
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v[48:49], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v[50:51], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[40:43], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
+; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
+; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
+; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
+; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
+; VGPRRC-NEXT: global_store_dwordx4 v[50:51], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
+; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
+; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
+; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
+; VGPRRC-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16__flags:
; AGPR: ; %bb.0:
; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -721,7 +1088,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; AGPR-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
; AGPR-NEXT: s_waitcnt vmcnt(0)
; AGPR-NEXT: s_endpgm
-;
; VGPR-LABEL: test_mfma_f32_32x32x16_f16__flags:
; VGPR: ; %bb.0:
; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -829,6 +1195,70 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half
; GCN-NEXT: v_accvgpr_read_b32 v15, a15
; GCN-NEXT: s_setpc_b64 s[30:31]
;
+; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__mac:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12
+; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13
+; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14
+; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15
+; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16
+; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17
+; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18
+; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19
+; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20
+; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21
+; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22
+; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23
+; HEURRC-NEXT: s_nop 1
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
+; HEURRC-NEXT: s_nop 7
+; HEURRC-NEXT: s_nop 3
+; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
+; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
+; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
+; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3
+; HEURRC-NEXT: v_accvgpr_read_b32 v4, a4
+; HEURRC-NEXT: v_accvgpr_read_b32 v5, a5
+; HEURRC-NEXT: v_accvgpr_read_b32 v6, a6
+; HEURRC-NEXT: v_accvgpr_read_b32 v7, a7
+; HEURRC-NEXT: v_accvgpr_read_b32 v8, a8
+; HEURRC-NEXT: v_accvgpr_read_b32 v9, a9
+; HEURRC-NEXT: v_accvgpr_read_b32 v10, a10
+; HEURRC-NEXT: v_accvgpr_read_b32 v11, a11
+; HEURRC-NEXT: v_accvgpr_read_b32 v12, a12
+; HEURRC-NEXT: v_accvgpr_read_b32 v13, a13
+; HEURRC-NEXT: v_accvgpr_read_b32 v14, a14
+; HEURRC-NEXT: v_accvgpr_read_b32 v15, a15
+; HEURRC-NEXT: s_setpc_b64 s[30:31]
+;
+; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__mac:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[8:23], v[0:3], v[4:7], v[8:23]
+; VGPRRC-NEXT: s_nop 7
+; VGPRRC-NEXT: s_nop 3
+; VGPRRC-NEXT: v_mov_b32_e32 v0, v8
+; VGPRRC-NEXT: v_mov_b32_e32 v1, v9
+; VGPRRC-NEXT: v_mov_b32_e32 v2, v10
+; VGPRRC-NEXT: v_mov_b32_e32 v3, v11
+; VGPRRC-NEXT: v_mov_b32_e32 v4, v12
+; VGPRRC-NEXT: v_mov_b32_e32 v5, v13
+; VGPRRC-NEXT: v_mov_b32_e32 v6, v14
+; VGPRRC-NEXT: v_mov_b32_e32 v7, v15
+; VGPRRC-NEXT: v_mov_b32_e32 v8, v16
+; VGPRRC-NEXT: v_mov_b32_e32 v9, v17
+; VGPRRC-NEXT: v_mov_b32_e32 v10, v18
+; VGPRRC-NEXT: v_mov_b32_e32 v11, v19
+; VGPRRC-NEXT: v_mov_b32_e32 v12, v20
+; VGPRRC-NEXT: v_mov_b32_e32 v13, v21
+; VGPRRC-NEXT: v_mov_b32_e32 v14, v22
+; VGPRRC-NEXT: v_mov_b32_e32 v15, v23
+; VGPRRC-NEXT: s_setpc_b64 s[30:31]
; AGPR-LABEL: test_mfma_f32_32x32x16_f16__mac:
; AGPR: ; %bb.0:
; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -869,7 +1299,6 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half
; AGPR-NEXT: v_accvgpr_read_b32 v14, a14
; AGPR-NEXT: v_accvgpr_read_b32 v15, a15
; AGPR-NEXT: s_setpc_b64 s[30:31]
-;
; VGPR-LABEL: test_mfma_f32_32x32x16_f16__mac:
; VGPR: ; %bb.0:
; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -939,6 +1368,70 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8
; GCN-NEXT: v_accvgpr_read_b32 v15, a15
; GCN-NEXT: s_setpc_b64 s[30:31]
;
+; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__mac__flags:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12
+; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13
+; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14
+; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15
+; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16
+; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17
+; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18
+; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19
+; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20
+; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21
+; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22
+; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23
+; HEURRC-NEXT: s_nop 1
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
+; HEURRC-NEXT: s_nop 7
+; HEURRC-NEXT: s_nop 3
+; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
+; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
+; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
+; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3
+; HEURRC-NEXT: v_accvgpr_read_b32 v4, a4
+; HEURRC-NEXT: v_accvgpr_read_b32 v5, a5
+; HEURRC-NEXT: v_accvgpr_read_b32 v6, a6
+; HEURRC-NEXT: v_accvgpr_read_b32 v7, a7
+; HEURRC-NEXT: v_accvgpr_read_b32 v8, a8
+; HEURRC-NEXT: v_accvgpr_read_b32 v9, a9
+; HEURRC-NEXT: v_accvgpr_read_b32 v10, a10
+; HEURRC-NEXT: v_accvgpr_read_b32 v11, a11
+; HEURRC-NEXT: v_accvgpr_read_b32 v12, a12
+; HEURRC-NEXT: v_accvgpr_read_b32 v13, a13
+; HEURRC-NEXT: v_accvgpr_read_b32 v14, a14
+; HEURRC-NEXT: v_accvgpr_read_b32 v15, a15
+; HEURRC-NEXT: s_setpc_b64 s[30:31]
+;
+; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__mac__flags:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[8:23], v[0:3], v[4:7], v[8:23] cbsz:1 abid:1 blgp:1
+; VGPRRC-NEXT: s_nop 7
+; VGPRRC-NEXT: s_nop 3
+; VGPRRC-NEXT: v_mov_b32_e32 v0, v8
+; VGPRRC-NEXT: v_mov_b32_e32 v1, v9
+; VGPRRC-NEXT: v_mov_b32_e32 v2, v10
+; VGPRRC-NEXT: v_mov_b32_e32 v3, v11
+; VGPRRC-NEXT: v_mov_b32_e32 v4, v12
+; VGPRRC-NEXT: v_mov_b32_e32 v5, v13
+; VGPRRC-NEXT: v_mov_b32_e32 v6, v14
+; VGPRRC-NEXT: v_mov_b32_e32 v7, v15
+; VGPRRC-NEXT: v_mov_b32_e32 v8, v16
+; VGPRRC-NEXT: v_mov_b32_e32 v9, v17
+; VGPRRC-NEXT: v_mov_b32_e32 v10, v18
+; VGPRRC-NEXT: v_mov_b32_e32 v11, v19
+; VGPRRC-NEXT: v_mov_b32_e32 v12, v20
+; VGPRRC-NEXT: v_mov_b32_e32 v13, v21
+; VGPRRC-NEXT: v_mov_b32_e32 v14, v22
+; VGPRRC-NEXT: v_mov_b32_e32 v15, v23
+; VGPRRC-NEXT: s_setpc_b64 s[30:31]
; AGPR-LABEL: test_mfma_f32_32x32x16_f16__mac__flags:
; AGPR: ; %bb.0:
; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -979,7 +1472,6 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8
; AGPR-NEXT: v_accvgpr_read_b32 v14, a14
; AGPR-NEXT: v_accvgpr_read_b32 v15, a15
; AGPR-NEXT: s_setpc_b64 s[30:31]
-;
; VGPR-LABEL: test_mfma_f32_32x32x16_f16__mac__flags:
; VGPR: ; %bb.0:
; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1126,6 +1618,126 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
+; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
+; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
+; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21
+; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20
+; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19
+; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18
+; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17
+; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16
+; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15
+; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14
+; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13
+; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12
+; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11
+; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
+; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
+; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
+; HEURRC-NEXT: v_mov_b32_e32 v8, s20
+; HEURRC-NEXT: v_mov_b32_e32 v9, s21
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31]
+; HEURRC-NEXT: v_mov_b32_e32 v10, s22
+; HEURRC-NEXT: v_mov_b32_e32 v11, s23
+; HEURRC-NEXT: v_mov_b32_e32 v0, s16
+; HEURRC-NEXT: v_mov_b32_e32 v1, s17
+; HEURRC-NEXT: v_mov_b32_e32 v2, s18
+; HEURRC-NEXT: v_mov_b32_e32 v3, s19
+; HEURRC-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, s12
+; HEURRC-NEXT: v_mov_b32_e32 v1, s13
+; HEURRC-NEXT: v_mov_b32_e32 v2, s14
+; HEURRC-NEXT: v_mov_b32_e32 v3, s15
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, s8
+; HEURRC-NEXT: v_mov_b32_e32 v1, s9
+; HEURRC-NEXT: v_mov_b32_e32 v2, s10
+; HEURRC-NEXT: v_mov_b32_e32 v3, s11
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_endpgm
+;
+; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; VGPRRC-NEXT: v_mov_b32_e32 v44, 0
+; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
+; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; VGPRRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; VGPRRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; VGPRRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; VGPRRC-NEXT: v_mov_b32_e32 v40, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v41, s21
+; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
+; VGPRRC-NEXT: v_mov_b32_e32 v42, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v43, s23
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_nop 2
+; VGPRRC-NEXT: v_mov_b32_e32 v16, s16
+; VGPRRC-NEXT: v_mov_b32_e32 v17, s17
+; VGPRRC-NEXT: v_mov_b32_e32 v18, s18
+; VGPRRC-NEXT: v_mov_b32_e32 v19, s19
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b32_e32 v16, s12
+; VGPRRC-NEXT: v_mov_b32_e32 v17, s13
+; VGPRRC-NEXT: v_mov_b32_e32 v18, s14
+; VGPRRC-NEXT: v_mov_b32_e32 v19, s15
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b32_e32 v16, s8
+; VGPRRC-NEXT: v_mov_b32_e32 v17, s9
+; VGPRRC-NEXT: v_mov_b32_e32 v18, s10
+; VGPRRC-NEXT: v_mov_b32_e32 v19, s11
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd:
; AGPR: ; %bb.0:
; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -1189,7 +1801,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; AGPR-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
; AGPR-NEXT: s_waitcnt vmcnt(0)
; AGPR-NEXT: s_endpgm
-;
; VGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd:
; VGPR: ; %bb.0:
; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -1371,6 +1982,126 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
+; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd__flags:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
+; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
+; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21
+; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20
+; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19
+; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18
+; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17
+; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16
+; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15
+; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14
+; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13
+; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12
+; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11
+; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
+; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
+; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
+; HEURRC-NEXT: v_mov_b32_e32 v8, s20
+; HEURRC-NEXT: v_mov_b32_e32 v9, s21
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
+; HEURRC-NEXT: v_mov_b32_e32 v10, s22
+; HEURRC-NEXT: v_mov_b32_e32 v11, s23
+; HEURRC-NEXT: v_mov_b32_e32 v0, s16
+; HEURRC-NEXT: v_mov_b32_e32 v1, s17
+; HEURRC-NEXT: v_mov_b32_e32 v2, s18
+; HEURRC-NEXT: v_mov_b32_e32 v3, s19
+; HEURRC-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, s12
+; HEURRC-NEXT: v_mov_b32_e32 v1, s13
+; HEURRC-NEXT: v_mov_b32_e32 v2, s14
+; HEURRC-NEXT: v_mov_b32_e32 v3, s15
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, s8
+; HEURRC-NEXT: v_mov_b32_e32 v1, s9
+; HEURRC-NEXT: v_mov_b32_e32 v2, s10
+; HEURRC-NEXT: v_mov_b32_e32 v3, s11
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_endpgm
+;
+; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd__flags:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; VGPRRC-NEXT: v_mov_b32_e32 v44, 0
+; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
+; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; VGPRRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; VGPRRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; VGPRRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; VGPRRC-NEXT: v_mov_b32_e32 v40, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v41, s21
+; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; VGPRRC-NEXT: v_mov_b32_e32 v42, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v43, s23
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_nop 2
+; VGPRRC-NEXT: v_mov_b32_e32 v16, s16
+; VGPRRC-NEXT: v_mov_b32_e32 v17, s17
+; VGPRRC-NEXT: v_mov_b32_e32 v18, s18
+; VGPRRC-NEXT: v_mov_b32_e32 v19, s19
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b32_e32 v16, s12
+; VGPRRC-NEXT: v_mov_b32_e32 v17, s13
+; VGPRRC-NEXT: v_mov_b32_e32 v18, s14
+; VGPRRC-NEXT: v_mov_b32_e32 v19, s15
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b32_e32 v16, s8
+; VGPRRC-NEXT: v_mov_b32_e32 v17, s9
+; VGPRRC-NEXT: v_mov_b32_e32 v18, s10
+; VGPRRC-NEXT: v_mov_b32_e32 v19, s11
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd__flags:
; AGPR: ; %bb.0:
; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -1434,7 +2165,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; AGPR-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1
; AGPR-NEXT: s_waitcnt vmcnt(0)
; AGPR-NEXT: s_endpgm
-;
; VGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd__flags:
; VGPR: ; %bb.0:
; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -1572,6 +2302,71 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
+; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
+; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
+; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
+; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
+; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
+; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
+; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
+; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
+; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
+; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
+; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
+; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
+; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: s_nop 1
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
+; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: s_nop 7
+; HEURRC-NEXT: s_nop 2
+; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; HEURRC-NEXT: s_endpgm
+;
+; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
+; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPRRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; VGPRRC-NEXT: s_nop 1
+; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
+; VGPRRC-NEXT: v_mov_b32_e32 v16, 0
+; VGPRRC-NEXT: s_nop 7
+; VGPRRC-NEXT: s_nop 2
+; VGPRRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; VGPRRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; VGPRRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; VGPRRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac:
; AGPR: ; %bb.0:
; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -1608,7 +2403,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; AGPR-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; AGPR-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; AGPR-NEXT: s_endpgm
-;
; VGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac:
; VGPR: ; %bb.0:
; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -1717,6 +2511,71 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
+; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
+; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
+; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
+; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
+; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
+; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
+; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
+; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
+; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
+; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
+; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
+; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
+; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: s_nop 1
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: s_nop 7
+; HEURRC-NEXT: s_nop 2
+; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; HEURRC-NEXT: s_endpgm
+;
+; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
+; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPRRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; VGPRRC-NEXT: s_nop 1
+; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; VGPRRC-NEXT: v_mov_b32_e32 v16, 0
+; VGPRRC-NEXT: s_nop 7
+; VGPRRC-NEXT: s_nop 2
+; VGPRRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; VGPRRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; VGPRRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; VGPRRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags:
; AGPR: ; %bb.0:
; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -1753,7 +2612,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; AGPR-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; AGPR-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; AGPR-NEXT: s_endpgm
-;
; VGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags:
; VGPR: ; %bb.0:
; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -1810,6 +2668,27 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4
; GCN-NEXT: v_accvgpr_read_b32 v3, a3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
+; HEURRC-LABEL: test_mfma_i32_16x16x64_i8:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT: s_nop 1
+; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
+; HEURRC-NEXT: s_nop 7
+; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
+; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
+; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
+; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3
+; HEURRC-NEXT: s_setpc_b64 s[30:31]
+;
+; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT: s_setpc_b64 s[30:31]
; AGPR-LABEL: test_mfma_i32_16x16x64_i8:
; AGPR: ; %bb.0:
; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1825,7 +2704,6 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4
; AGPR-NEXT: v_accvgpr_read_b32 v2, a2
; AGPR-NEXT: v_accvgpr_read_b32 v3, a3
; AGPR-NEXT: s_setpc_b64 s[30:31]
-;
; VGPR-LABEL: test_mfma_i32_16x16x64_i8:
; VGPR: ; %bb.0:
; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1852,6 +2730,27 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8__flags(<4 x i32> %arg0, <4 x i32> %a
; GCN-NEXT: v_accvgpr_read_b32 v3, a3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
+; HEURRC-LABEL: test_mfma_i32_16x16x64_i8__flags:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT: s_nop 1
+; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; HEURRC-NEXT: s_nop 7
+; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
+; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
+; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
+; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3
+; HEURRC-NEXT: s_setpc_b64 s[30:31]
+;
+; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8__flags:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1
+; VGPRRC-NEXT: s_setpc_b64 s[30:31]
; AGPR-LABEL: test_mfma_i32_16x16x64_i8__flags:
; AGPR: ; %bb.0:
; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1867,7 +2766,6 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8__flags(<4 x i32> %arg0, <4 x i32> %a
; AGPR-NEXT: v_accvgpr_read_b32 v2, a2
; AGPR-NEXT: v_accvgpr_read_b32 v3, a3
; AGPR-NEXT: s_setpc_b64 s[30:31]
-;
; VGPR-LABEL: test_mfma_i32_16x16x64_i8__flags:
; VGPR: ; %bb.0:
; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1924,6 +2822,55 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
;
+; HEURRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
+; HEURRC-NEXT: v_mov_b32_e32 v0, s8
+; HEURRC-NEXT: v_mov_b32_e32 v1, s9
+; HEURRC-NEXT: v_mov_b32_e32 v2, s10
+; HEURRC-NEXT: v_mov_b32_e32 v3, s11
+; HEURRC-NEXT: v_mov_b32_e32 v4, s12
+; HEURRC-NEXT: v_mov_b32_e32 v5, s13
+; HEURRC-NEXT: v_mov_b32_e32 v6, s14
+; HEURRC-NEXT: v_mov_b32_e32 v7, s15
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: s_nop 1
+; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
+; HEURRC-NEXT: s_nop 7
+; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: s_endpgm
+;
+; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; VGPRRC-NEXT: v_mov_b32_e32 v12, 0
+; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
+; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
+; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
+; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
+; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
+; VGPRRC-NEXT: v_mov_b32_e32 v4, s12
+; VGPRRC-NEXT: v_mov_b32_e32 v5, s13
+; VGPRRC-NEXT: v_mov_b32_e32 v6, s14
+; VGPRRC-NEXT: v_mov_b32_e32 v7, s15
+; VGPRRC-NEXT: v_mov_b32_e32 v8, s0
+; VGPRRC-NEXT: v_mov_b32_e32 v9, s1
+; VGPRRC-NEXT: v_mov_b32_e32 v10, s2
+; VGPRRC-NEXT: v_mov_b32_e32 v11, s3
+; VGPRRC-NEXT: s_nop 1
+; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT: s_nop 7
+; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
+; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
; AGPR: ; %bb.0:
; AGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
@@ -1948,7 +2895,6 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; AGPR-NEXT: s_nop 7
; AGPR-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
; AGPR-NEXT: s_endpgm
-;
; VGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
; VGPR: ; %bb.0:
; VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
@@ -2025,6 +2971,55 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
;
+; HEURRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
+; HEURRC-NEXT: v_mov_b32_e32 v0, s8
+; HEURRC-NEXT: v_mov_b32_e32 v1, s9
+; HEURRC-NEXT: v_mov_b32_e32 v2, s10
+; HEURRC-NEXT: v_mov_b32_e32 v3, s11
+; HEURRC-NEXT: v_mov_b32_e32 v4, s12
+; HEURRC-NEXT: v_mov_b32_e32 v5, s13
+; HEURRC-NEXT: v_mov_b32_e32 v6, s14
+; HEURRC-NEXT: v_mov_b32_e32 v7, s15
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: s_nop 1
+; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: s_nop 7
+; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: s_endpgm
+;
+; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; VGPRRC-NEXT: v_mov_b32_e32 v12, 0
+; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
+; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
+; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
+; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
+; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
+; VGPRRC-NEXT: v_mov_b32_e32 v4, s12
+; VGPRRC-NEXT: v_mov_b32_e32 v5, s13
+; VGPRRC-NEXT: v_mov_b32_e32 v6, s14
+; VGPRRC-NEXT: v_mov_b32_e32 v7, s15
+; VGPRRC-NEXT: v_mov_b32_e32 v8, s0
+; VGPRRC-NEXT: v_mov_b32_e32 v9, s1
+; VGPRRC-NEXT: v_mov_b32_e32 v10, s2
+; VGPRRC-NEXT: v_mov_b32_e32 v11, s3
+; VGPRRC-NEXT: s_nop 1
+; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; VGPRRC-NEXT: s_nop 7
+; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
+; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
; AGPR: ; %bb.0:
; AGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
@@ -2049,7 +3044,6 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; AGPR-NEXT: s_nop 7
; AGPR-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
; AGPR-NEXT: s_endpgm
-;
; VGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
; VGPR: ; %bb.0:
; VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
@@ -2216,6 +3210,144 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
+; HEURRC-LABEL: test_mfma_i32_32x32x32_i8:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16
+; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
+; HEURRC-NEXT: v_mov_b32_e32 v0, s24
+; HEURRC-NEXT: v_mov_b32_e32 v1, s25
+; HEURRC-NEXT: v_mov_b32_e32 v2, s26
+; HEURRC-NEXT: v_mov_b32_e32 v3, s27
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
+; HEURRC-NEXT: v_mov_b32_e32 v4, s28
+; HEURRC-NEXT: v_mov_b32_e32 v5, s29
+; HEURRC-NEXT: v_mov_b32_e32 v6, s30
+; HEURRC-NEXT: v_mov_b32_e32 v7, s31
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
+; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
+; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
+; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
+; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
+; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
+; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
+; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
+; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
+; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
+; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
+; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
+; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
+; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15]
+; HEURRC-NEXT: v_mov_b32_e32 v0, s16
+; HEURRC-NEXT: v_mov_b32_e32 v1, s17
+; HEURRC-NEXT: v_mov_b32_e32 v2, s18
+; HEURRC-NEXT: v_mov_b32_e32 v3, s19
+; HEURRC-NEXT: s_nop 7
+; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, s20
+; HEURRC-NEXT: v_mov_b32_e32 v1, s21
+; HEURRC-NEXT: v_mov_b32_e32 v2, s22
+; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, s8
+; HEURRC-NEXT: v_mov_b32_e32 v1, s9
+; HEURRC-NEXT: v_mov_b32_e32 v2, s10
+; HEURRC-NEXT: v_mov_b32_e32 v3, s11
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, s12
+; HEURRC-NEXT: v_mov_b32_e32 v1, s13
+; HEURRC-NEXT: v_mov_b32_e32 v2, s14
+; HEURRC-NEXT: v_mov_b32_e32 v3, s15
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_endpgm
+;
+; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48
+; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32
+; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16
+; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
+; VGPRRC-NEXT: v_mov_b32_e32 v32, s24
+; VGPRRC-NEXT: v_mov_b32_e32 v33, s25
+; VGPRRC-NEXT: v_mov_b32_e32 v34, s26
+; VGPRRC-NEXT: v_mov_b32_e32 v35, s27
+; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPRRC-NEXT: v_mov_b32_e32 v36, s28
+; VGPRRC-NEXT: v_mov_b32_e32 v37, s29
+; VGPRRC-NEXT: v_mov_b32_e32 v38, s30
+; VGPRRC-NEXT: v_mov_b32_e32 v39, s31
+; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15]
+; VGPRRC-NEXT: s_nop 7
+; VGPRRC-NEXT: s_nop 3
+; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: v_mov_b32_e32 v0, s16
+; VGPRRC-NEXT: v_mov_b32_e32 v1, s17
+; VGPRRC-NEXT: v_mov_b32_e32 v2, s18
+; VGPRRC-NEXT: v_mov_b32_e32 v3, s19
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
+; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
+; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
+; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
+; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
+; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
+; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
+; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
+; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8:
; AGPR: ; %bb.0:
; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -2288,7 +3420,6 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; AGPR-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; AGPR-NEXT: s_waitcnt vmcnt(0)
; AGPR-NEXT: s_endpgm
-;
; VGPR-LABEL: test_mfma_i32_32x32x32_i8:
; VGPR: ; %bb.0:
; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -2491,6 +3622,144 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
+; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__flags:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16
+; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
+; HEURRC-NEXT: v_mov_b32_e32 v0, s24
+; HEURRC-NEXT: v_mov_b32_e32 v1, s25
+; HEURRC-NEXT: v_mov_b32_e32 v2, s26
+; HEURRC-NEXT: v_mov_b32_e32 v3, s27
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
+; HEURRC-NEXT: v_mov_b32_e32 v4, s28
+; HEURRC-NEXT: v_mov_b32_e32 v5, s29
+; HEURRC-NEXT: v_mov_b32_e32 v6, s30
+; HEURRC-NEXT: v_mov_b32_e32 v7, s31
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
+; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
+; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
+; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
+; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
+; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
+; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
+; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
+; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
+; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
+; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
+; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
+; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0
+; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
+; HEURRC-NEXT: v_mov_b32_e32 v0, s16
+; HEURRC-NEXT: v_mov_b32_e32 v1, s17
+; HEURRC-NEXT: v_mov_b32_e32 v2, s18
+; HEURRC-NEXT: v_mov_b32_e32 v3, s19
+; HEURRC-NEXT: s_nop 7
+; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, s20
+; HEURRC-NEXT: v_mov_b32_e32 v1, s21
+; HEURRC-NEXT: v_mov_b32_e32 v2, s22
+; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, s8
+; HEURRC-NEXT: v_mov_b32_e32 v1, s9
+; HEURRC-NEXT: v_mov_b32_e32 v2, s10
+; HEURRC-NEXT: v_mov_b32_e32 v3, s11
+; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, s12
+; HEURRC-NEXT: v_mov_b32_e32 v1, s13
+; HEURRC-NEXT: v_mov_b32_e32 v2, s14
+; HEURRC-NEXT: v_mov_b32_e32 v3, s15
+; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_endpgm
+;
+; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__flags:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48
+; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32
+; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16
+; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
+; VGPRRC-NEXT: v_mov_b32_e32 v32, s24
+; VGPRRC-NEXT: v_mov_b32_e32 v33, s25
+; VGPRRC-NEXT: v_mov_b32_e32 v34, s26
+; VGPRRC-NEXT: v_mov_b32_e32 v35, s27
+; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPRRC-NEXT: v_mov_b32_e32 v36, s28
+; VGPRRC-NEXT: v_mov_b32_e32 v37, s29
+; VGPRRC-NEXT: v_mov_b32_e32 v38, s30
+; VGPRRC-NEXT: v_mov_b32_e32 v39, s31
+; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1
+; VGPRRC-NEXT: s_nop 7
+; VGPRRC-NEXT: s_nop 3
+; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: v_mov_b32_e32 v0, s16
+; VGPRRC-NEXT: v_mov_b32_e32 v1, s17
+; VGPRRC-NEXT: v_mov_b32_e32 v2, s18
+; VGPRRC-NEXT: v_mov_b32_e32 v3, s19
+; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b32_e32 v0, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v1, s21
+; VGPRRC-NEXT: v_mov_b32_e32 v2, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v3, s23
+; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
+; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
+; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
+; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
+; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b32_e32 v0, s12
+; VGPRRC-NEXT: v_mov_b32_e32 v1, s13
+; VGPRRC-NEXT: v_mov_b32_e32 v2, s14
+; VGPRRC-NEXT: v_mov_b32_e32 v3, s15
+; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8__flags:
; AGPR: ; %bb.0:
; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -2563,7 +3832,6 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; AGPR-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; AGPR-NEXT: s_waitcnt vmcnt(0)
; AGPR-NEXT: s_endpgm
-;
; VGPR-LABEL: test_mfma_i32_32x32x32_i8__flags:
; VGPR: ; %bb.0:
; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -2677,6 +3945,70 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar
; GCN-NEXT: v_accvgpr_read_b32 v15, a15
; GCN-NEXT: s_setpc_b64 s[30:31]
;
+; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__mac:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12
+; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13
+; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14
+; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15
+; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16
+; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17
+; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18
+; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19
+; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20
+; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21
+; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22
+; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23
+; HEURRC-NEXT: s_nop 1
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
+; HEURRC-NEXT: s_nop 7
+; HEURRC-NEXT: s_nop 3
+; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
+; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
+; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
+; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3
+; HEURRC-NEXT: v_accvgpr_read_b32 v4, a4
+; HEURRC-NEXT: v_accvgpr_read_b32 v5, a5
+; HEURRC-NEXT: v_accvgpr_read_b32 v6, a6
+; HEURRC-NEXT: v_accvgpr_read_b32 v7, a7
+; HEURRC-NEXT: v_accvgpr_read_b32 v8, a8
+; HEURRC-NEXT: v_accvgpr_read_b32 v9, a9
+; HEURRC-NEXT: v_accvgpr_read_b32 v10, a10
+; HEURRC-NEXT: v_accvgpr_read_b32 v11, a11
+; HEURRC-NEXT: v_accvgpr_read_b32 v12, a12
+; HEURRC-NEXT: v_accvgpr_read_b32 v13, a13
+; HEURRC-NEXT: v_accvgpr_read_b32 v14, a14
+; HEURRC-NEXT: v_accvgpr_read_b32 v15, a15
+; HEURRC-NEXT: s_setpc_b64 s[30:31]
+;
+; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__mac:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[8:23], v[0:3], v[4:7], v[8:23]
+; VGPRRC-NEXT: s_nop 7
+; VGPRRC-NEXT: s_nop 3
+; VGPRRC-NEXT: v_mov_b32_e32 v0, v8
+; VGPRRC-NEXT: v_mov_b32_e32 v1, v9
+; VGPRRC-NEXT: v_mov_b32_e32 v2, v10
+; VGPRRC-NEXT: v_mov_b32_e32 v3, v11
+; VGPRRC-NEXT: v_mov_b32_e32 v4, v12
+; VGPRRC-NEXT: v_mov_b32_e32 v5, v13
+; VGPRRC-NEXT: v_mov_b32_e32 v6, v14
+; VGPRRC-NEXT: v_mov_b32_e32 v7, v15
+; VGPRRC-NEXT: v_mov_b32_e32 v8, v16
+; VGPRRC-NEXT: v_mov_b32_e32 v9, v17
+; VGPRRC-NEXT: v_mov_b32_e32 v10, v18
+; VGPRRC-NEXT: v_mov_b32_e32 v11, v19
+; VGPRRC-NEXT: v_mov_b32_e32 v12, v20
+; VGPRRC-NEXT: v_mov_b32_e32 v13, v21
+; VGPRRC-NEXT: v_mov_b32_e32 v14, v22
+; VGPRRC-NEXT: v_mov_b32_e32 v15, v23
+; VGPRRC-NEXT: s_setpc_b64 s[30:31]
; AGPR-LABEL: test_mfma_i32_32x32x32_i8__mac:
; AGPR: ; %bb.0:
; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2717,7 +4049,6 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar
; AGPR-NEXT: v_accvgpr_read_b32 v14, a14
; AGPR-NEXT: v_accvgpr_read_b32 v15, a15
; AGPR-NEXT: s_setpc_b64 s[30:31]
-;
; VGPR-LABEL: test_mfma_i32_32x32x32_i8__mac:
; VGPR: ; %bb.0:
; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2787,6 +4118,70 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i
; GCN-NEXT: v_accvgpr_read_b32 v15, a15
; GCN-NEXT: s_setpc_b64 s[30:31]
;
+; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__mac__flags:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12
+; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13
+; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14
+; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15
+; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16
+; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17
+; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18
+; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19
+; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20
+; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21
+; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22
+; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23
+; HEURRC-NEXT: s_nop 1
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
+; HEURRC-NEXT: s_nop 7
+; HEURRC-NEXT: s_nop 3
+; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
+; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
+; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
+; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3
+; HEURRC-NEXT: v_accvgpr_read_b32 v4, a4
+; HEURRC-NEXT: v_accvgpr_read_b32 v5, a5
+; HEURRC-NEXT: v_accvgpr_read_b32 v6, a6
+; HEURRC-NEXT: v_accvgpr_read_b32 v7, a7
+; HEURRC-NEXT: v_accvgpr_read_b32 v8, a8
+; HEURRC-NEXT: v_accvgpr_read_b32 v9, a9
+; HEURRC-NEXT: v_accvgpr_read_b32 v10, a10
+; HEURRC-NEXT: v_accvgpr_read_b32 v11, a11
+; HEURRC-NEXT: v_accvgpr_read_b32 v12, a12
+; HEURRC-NEXT: v_accvgpr_read_b32 v13, a13
+; HEURRC-NEXT: v_accvgpr_read_b32 v14, a14
+; HEURRC-NEXT: v_accvgpr_read_b32 v15, a15
+; HEURRC-NEXT: s_setpc_b64 s[30:31]
+;
+; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__mac__flags:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[8:23], v[0:3], v[4:7], v[8:23] cbsz:1 abid:1 blgp:1
+; VGPRRC-NEXT: s_nop 7
+; VGPRRC-NEXT: s_nop 3
+; VGPRRC-NEXT: v_mov_b32_e32 v0, v8
+; VGPRRC-NEXT: v_mov_b32_e32 v1, v9
+; VGPRRC-NEXT: v_mov_b32_e32 v2, v10
+; VGPRRC-NEXT: v_mov_b32_e32 v3, v11
+; VGPRRC-NEXT: v_mov_b32_e32 v4, v12
+; VGPRRC-NEXT: v_mov_b32_e32 v5, v13
+; VGPRRC-NEXT: v_mov_b32_e32 v6, v14
+; VGPRRC-NEXT: v_mov_b32_e32 v7, v15
+; VGPRRC-NEXT: v_mov_b32_e32 v8, v16
+; VGPRRC-NEXT: v_mov_b32_e32 v9, v17
+; VGPRRC-NEXT: v_mov_b32_e32 v10, v18
+; VGPRRC-NEXT: v_mov_b32_e32 v11, v19
+; VGPRRC-NEXT: v_mov_b32_e32 v12, v20
+; VGPRRC-NEXT: v_mov_b32_e32 v13, v21
+; VGPRRC-NEXT: v_mov_b32_e32 v14, v22
+; VGPRRC-NEXT: v_mov_b32_e32 v15, v23
+; VGPRRC-NEXT: s_setpc_b64 s[30:31]
; AGPR-LABEL: test_mfma_i32_32x32x32_i8__mac__flags:
; AGPR: ; %bb.0:
; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2827,7 +4222,6 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i
; AGPR-NEXT: v_accvgpr_read_b32 v14, a14
; AGPR-NEXT: v_accvgpr_read_b32 v15, a15
; AGPR-NEXT: s_setpc_b64 s[30:31]
-;
; VGPR-LABEL: test_mfma_i32_32x32x32_i8__mac__flags:
; VGPR: ; %bb.0:
; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2981,6 +4375,140 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
+; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
+; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
+; HEURRC-NEXT: v_mov_b32_e32 v0, s20
+; HEURRC-NEXT: v_mov_b32_e32 v1, s21
+; HEURRC-NEXT: v_mov_b32_e32 v2, s22
+; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; HEURRC-NEXT: v_mov_b32_e32 v4, s24
+; HEURRC-NEXT: v_mov_b32_e32 v5, s25
+; HEURRC-NEXT: v_mov_b32_e32 v6, s26
+; HEURRC-NEXT: v_mov_b32_e32 v7, s27
+; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
+; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
+; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
+; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21
+; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20
+; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19
+; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18
+; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17
+; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16
+; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15
+; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14
+; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13
+; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12
+; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11
+; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
+; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
+; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
+; HEURRC-NEXT: s_nop 1
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31]
+; HEURRC-NEXT: v_mov_b32_e32 v0, s20
+; HEURRC-NEXT: v_mov_b32_e32 v1, s21
+; HEURRC-NEXT: v_mov_b32_e32 v2, s22
+; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, s16
+; HEURRC-NEXT: v_mov_b32_e32 v1, s17
+; HEURRC-NEXT: v_mov_b32_e32 v2, s18
+; HEURRC-NEXT: v_mov_b32_e32 v3, s19
+; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, s12
+; HEURRC-NEXT: v_mov_b32_e32 v1, s13
+; HEURRC-NEXT: v_mov_b32_e32 v2, s14
+; HEURRC-NEXT: v_mov_b32_e32 v3, s15
+; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, s8
+; HEURRC-NEXT: v_mov_b32_e32 v1, s9
+; HEURRC-NEXT: v_mov_b32_e32 v2, s10
+; HEURRC-NEXT: v_mov_b32_e32 v3, s11
+; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_endpgm
+;
+; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
+; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; VGPRRC-NEXT: v_mov_b32_e32 v40, 0
+; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
+; VGPRRC-NEXT: v_mov_b32_e32 v32, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v33, s21
+; VGPRRC-NEXT: v_mov_b32_e32 v34, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v35, s23
+; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPRRC-NEXT: v_mov_b32_e32 v36, s24
+; VGPRRC-NEXT: v_mov_b32_e32 v37, s25
+; VGPRRC-NEXT: v_mov_b32_e32 v38, s26
+; VGPRRC-NEXT: v_mov_b32_e32 v39, s27
+; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
+; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; VGPRRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; VGPRRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; VGPRRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; VGPRRC-NEXT: s_nop 1
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
+; VGPRRC-NEXT: s_nop 6
+; VGPRRC-NEXT: v_mov_b32_e32 v16, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v17, s21
+; VGPRRC-NEXT: v_mov_b32_e32 v18, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v19, s23
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b32_e32 v16, s16
+; VGPRRC-NEXT: v_mov_b32_e32 v17, s17
+; VGPRRC-NEXT: v_mov_b32_e32 v18, s18
+; VGPRRC-NEXT: v_mov_b32_e32 v19, s19
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b32_e32 v16, s12
+; VGPRRC-NEXT: v_mov_b32_e32 v17, s13
+; VGPRRC-NEXT: v_mov_b32_e32 v18, s14
+; VGPRRC-NEXT: v_mov_b32_e32 v19, s15
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b32_e32 v16, s8
+; VGPRRC-NEXT: v_mov_b32_e32 v17, s9
+; VGPRRC-NEXT: v_mov_b32_e32 v18, s10
+; VGPRRC-NEXT: v_mov_b32_e32 v19, s11
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd:
; AGPR: ; %bb.0:
; AGPR-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
@@ -3051,7 +4579,6 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; AGPR-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
; AGPR-NEXT: s_waitcnt vmcnt(0)
; AGPR-NEXT: s_endpgm
-;
; VGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd:
; VGPR: ; %bb.0:
; VGPR-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
@@ -3247,6 +4774,140 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
+; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
+; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
+; HEURRC-NEXT: v_mov_b32_e32 v0, s20
+; HEURRC-NEXT: v_mov_b32_e32 v1, s21
+; HEURRC-NEXT: v_mov_b32_e32 v2, s22
+; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; HEURRC-NEXT: v_mov_b32_e32 v4, s24
+; HEURRC-NEXT: v_mov_b32_e32 v5, s25
+; HEURRC-NEXT: v_mov_b32_e32 v6, s26
+; HEURRC-NEXT: v_mov_b32_e32 v7, s27
+; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
+; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
+; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
+; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21
+; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20
+; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19
+; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18
+; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17
+; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16
+; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15
+; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14
+; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13
+; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12
+; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11
+; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
+; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
+; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
+; HEURRC-NEXT: s_nop 1
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
+; HEURRC-NEXT: v_mov_b32_e32 v0, s20
+; HEURRC-NEXT: v_mov_b32_e32 v1, s21
+; HEURRC-NEXT: v_mov_b32_e32 v2, s22
+; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, s16
+; HEURRC-NEXT: v_mov_b32_e32 v1, s17
+; HEURRC-NEXT: v_mov_b32_e32 v2, s18
+; HEURRC-NEXT: v_mov_b32_e32 v3, s19
+; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, s12
+; HEURRC-NEXT: v_mov_b32_e32 v1, s13
+; HEURRC-NEXT: v_mov_b32_e32 v2, s14
+; HEURRC-NEXT: v_mov_b32_e32 v3, s15
+; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_nop 0
+; HEURRC-NEXT: v_mov_b32_e32 v0, s8
+; HEURRC-NEXT: v_mov_b32_e32 v1, s9
+; HEURRC-NEXT: v_mov_b32_e32 v2, s10
+; HEURRC-NEXT: v_mov_b32_e32 v3, s11
+; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: s_waitcnt vmcnt(0)
+; HEURRC-NEXT: s_endpgm
+;
+; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
+; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; VGPRRC-NEXT: v_mov_b32_e32 v40, 0
+; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
+; VGPRRC-NEXT: v_mov_b32_e32 v32, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v33, s21
+; VGPRRC-NEXT: v_mov_b32_e32 v34, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v35, s23
+; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPRRC-NEXT: v_mov_b32_e32 v36, s24
+; VGPRRC-NEXT: v_mov_b32_e32 v37, s25
+; VGPRRC-NEXT: v_mov_b32_e32 v38, s26
+; VGPRRC-NEXT: v_mov_b32_e32 v39, s27
+; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
+; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; VGPRRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; VGPRRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; VGPRRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; VGPRRC-NEXT: s_nop 1
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; VGPRRC-NEXT: s_nop 6
+; VGPRRC-NEXT: v_mov_b32_e32 v16, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v17, s21
+; VGPRRC-NEXT: v_mov_b32_e32 v18, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v19, s23
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b32_e32 v16, s16
+; VGPRRC-NEXT: v_mov_b32_e32 v17, s17
+; VGPRRC-NEXT: v_mov_b32_e32 v18, s18
+; VGPRRC-NEXT: v_mov_b32_e32 v19, s19
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b32_e32 v16, s12
+; VGPRRC-NEXT: v_mov_b32_e32 v17, s13
+; VGPRRC-NEXT: v_mov_b32_e32 v18, s14
+; VGPRRC-NEXT: v_mov_b32_e32 v19, s15
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_nop 0
+; VGPRRC-NEXT: v_mov_b32_e32 v16, s8
+; VGPRRC-NEXT: v_mov_b32_e32 v17, s9
+; VGPRRC-NEXT: v_mov_b32_e32 v18, s10
+; VGPRRC-NEXT: v_mov_b32_e32 v19, s11
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: s_waitcnt vmcnt(0)
+; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags:
; AGPR: ; %bb.0:
; AGPR-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
@@ -3317,7 +4978,6 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; AGPR-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
; AGPR-NEXT: s_waitcnt vmcnt(0)
; AGPR-NEXT: s_endpgm
-;
; VGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags:
; VGPR: ; %bb.0:
; VGPR-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
@@ -3467,6 +5127,81 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
+; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
+; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
+; HEURRC-NEXT: v_mov_b32_e32 v0, s20
+; HEURRC-NEXT: v_mov_b32_e32 v1, s21
+; HEURRC-NEXT: v_mov_b32_e32 v2, s22
+; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; HEURRC-NEXT: v_mov_b32_e32 v4, s24
+; HEURRC-NEXT: v_mov_b32_e32 v5, s25
+; HEURRC-NEXT: v_mov_b32_e32 v6, s26
+; HEURRC-NEXT: v_mov_b32_e32 v7, s27
+; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
+; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
+; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
+; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
+; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
+; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
+; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
+; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
+; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
+; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
+; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
+; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
+; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: s_nop 1
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
+; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: s_nop 7
+; HEURRC-NEXT: s_nop 2
+; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; HEURRC-NEXT: s_endpgm
+;
+; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
+; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
+; VGPRRC-NEXT: v_mov_b32_e32 v16, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v17, s21
+; VGPRRC-NEXT: v_mov_b32_e32 v18, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v19, s23
+; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPRRC-NEXT: v_mov_b32_e32 v20, s24
+; VGPRRC-NEXT: v_mov_b32_e32 v21, s25
+; VGPRRC-NEXT: v_mov_b32_e32 v22, s26
+; VGPRRC-NEXT: v_mov_b32_e32 v23, s27
+; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
+; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; VGPRRC-NEXT: s_nop 1
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
+; VGPRRC-NEXT: v_mov_b32_e32 v16, 0
+; VGPRRC-NEXT: s_nop 7
+; VGPRRC-NEXT: s_nop 2
+; VGPRRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; VGPRRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; VGPRRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; VGPRRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
; AGPR: ; %bb.0:
; AGPR-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
@@ -3508,7 +5243,6 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; AGPR-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; AGPR-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; AGPR-NEXT: s_endpgm
-;
; VGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
; VGPR: ; %bb.0:
; VGPR-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
@@ -3627,6 +5361,81 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
+; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
+; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
+; HEURRC-NEXT: v_mov_b32_e32 v0, s20
+; HEURRC-NEXT: v_mov_b32_e32 v1, s21
+; HEURRC-NEXT: v_mov_b32_e32 v2, s22
+; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; HEURRC-NEXT: v_mov_b32_e32 v4, s24
+; HEURRC-NEXT: v_mov_b32_e32 v5, s25
+; HEURRC-NEXT: v_mov_b32_e32 v6, s26
+; HEURRC-NEXT: v_mov_b32_e32 v7, s27
+; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
+; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
+; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
+; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
+; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
+; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
+; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
+; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
+; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
+; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
+; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
+; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
+; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: s_nop 1
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: s_nop 7
+; HEURRC-NEXT: s_nop 2
+; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; HEURRC-NEXT: s_endpgm
+;
+; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
+; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
+; VGPRRC-NEXT: v_mov_b32_e32 v16, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v17, s21
+; VGPRRC-NEXT: v_mov_b32_e32 v18, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v19, s23
+; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPRRC-NEXT: v_mov_b32_e32 v20, s24
+; VGPRRC-NEXT: v_mov_b32_e32 v21, s25
+; VGPRRC-NEXT: v_mov_b32_e32 v22, s26
+; VGPRRC-NEXT: v_mov_b32_e32 v23, s27
+; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
+; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; VGPRRC-NEXT: s_nop 1
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; VGPRRC-NEXT: v_mov_b32_e32 v16, 0
+; VGPRRC-NEXT: s_nop 7
+; VGPRRC-NEXT: s_nop 2
+; VGPRRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; VGPRRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; VGPRRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; VGPRRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
; AGPR: ; %bb.0:
; AGPR-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
@@ -3668,7 +5477,6 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; AGPR-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; AGPR-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; AGPR-NEXT: s_endpgm
-;
; VGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
; VGPR: ; %bb.0:
; VGPR-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
@@ -3730,6 +5538,27 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat>
; GCN-NEXT: v_accvgpr_read_b32 v3, a3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
+; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT: s_nop 1
+; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
+; HEURRC-NEXT: s_nop 7
+; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
+; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
+; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
+; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3
+; HEURRC-NEXT: s_setpc_b64 s[30:31]
+;
+; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT: s_setpc_b64 s[30:31]
; AGPR-LABEL: test_mfma_f32_16x16x32_bf16:
; AGPR: ; %bb.0:
; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3745,7 +5574,6 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat>
; AGPR-NEXT: v_accvgpr_read_b32 v2, a2
; AGPR-NEXT: v_accvgpr_read_b32 v3, a3
; AGPR-NEXT: s_setpc_b64 s[30:31]
-;
; VGPR-LABEL: test_mfma_f32_16x16x32_bf16:
; VGPR: ; %bb.0:
; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3772,6 +5600,27 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x
; GCN-NEXT: v_accvgpr_read_b32 v3, a3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
+; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16__flags:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT: s_nop 1
+; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; HEURRC-NEXT: s_nop 7
+; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
+; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
+; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
+; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3
+; HEURRC-NEXT: s_setpc_b64 s[30:31]
+;
+; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16__flags:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1
+; VGPRRC-NEXT: s_setpc_b64 s[30:31]
; AGPR-LABEL: test_mfma_f32_16x16x32_bf16__flags:
; AGPR: ; %bb.0:
; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3787,7 +5636,6 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x
; AGPR-NEXT: v_accvgpr_read_b32 v2, a2
; AGPR-NEXT: v_accvgpr_read_b32 v3, a3
; AGPR-NEXT: s_setpc_b64 s[30:31]
-;
; VGPR-LABEL: test_mfma_f32_16x16x32_bf16__flags:
; VGPR: ; %bb.0:
; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3819,6 +5667,45 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
; GCN-NEXT: s_endpgm
;
+; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: s_nop 1
+; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
+; HEURRC-NEXT: s_nop 7
+; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: s_endpgm
+;
+; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; VGPRRC-NEXT: v_mov_b32_e32 v12, 0
+; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
+; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; VGPRRC-NEXT: s_nop 1
+; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT: s_nop 7
+; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
+; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
; AGPR: ; %bb.0:
; AGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
@@ -3839,7 +5726,6 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
; AGPR-NEXT: s_nop 7
; AGPR-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
; AGPR-NEXT: s_endpgm
-;
; VGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
; VGPR: ; %bb.0:
; VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
@@ -3885,6 +5771,45 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
; GCN-NEXT: s_endpgm
;
+; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: s_nop 1
+; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: s_nop 7
+; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: s_endpgm
+;
+; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; VGPRRC-NEXT: v_mov_b32_e32 v12, 0
+; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
+; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; VGPRRC-NEXT: s_nop 1
+; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; VGPRRC-NEXT: s_nop 7
+; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
+; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
; AGPR: ; %bb.0:
; AGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
@@ -3905,7 +5830,6 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
; AGPR-NEXT: s_nop 7
; AGPR-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
; AGPR-NEXT: s_endpgm
-;
; VGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
; VGPR: ; %bb.0:
; VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
>From a9377e4e702a9a88da8ddfee77ab3911ed1167ec Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 11 Jul 2025 12:54:58 -0700
Subject: [PATCH 3/5] Add test coverage for amdgpu-agpr-alloc vs
amdgpu-mfma-vgpr-form
Change-Id: I219dcb465637cb10e1997846f22415d5e3ab5a49
---
.../Target/AMDGPU/SIMachineFunctionInfo.cpp | 2 +-
.../CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll | 76 +++++++++++++++++++
2 files changed, 77 insertions(+), 1 deletion(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 52c25c64fd52b..429c7ee5bebf0 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -75,7 +75,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
}
- MayNeedAGPRs = ST.hasMAIInsts() & !MFMAVGPRForm;
+ MayNeedAGPRs = ST.hasMAIInsts() && !MFMAVGPRForm;
if (!MFMAVGPRForm && ST.hasGFX90AInsts() &&
ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
!mayUseAGPRs(F))
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll
new file mode 100644
index 0000000000000..87a7c2ef6c95c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 --amdgpu-mfma-vgpr-form=0 < %s | FileCheck -enable-var-scope --check-prefixes=HEURRC %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 --amdgpu-mfma-vgpr-form=1 < %s | FileCheck -enable-var-scope --check-prefixes=VGPRRC %s
+
+declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)
+
+define <4 x float> @default(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) {
+; HEURRC-LABEL: default:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT: s_nop 1
+; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
+; HEURRC-NEXT: s_nop 7
+; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
+; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
+; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
+; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3
+; HEURRC-NEXT: s_setpc_b64 s[30:31]
+;
+; VGPRRC-LABEL: default:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+define <4 x float> @request_agpr(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) #0 {
+; HEURRC-LABEL: request_agpr:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
+; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
+; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
+; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
+; HEURRC-NEXT: s_nop 1
+; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
+; HEURRC-NEXT: s_nop 7
+; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
+; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
+; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
+; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3
+; HEURRC-NEXT: s_setpc_b64 s[30:31]
+;
+; VGPRRC-LABEL: request_agpr:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+define <4 x float> @request_no_agpr(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) #1 {
+; HEURRC-LABEL: request_no_agpr:
+; HEURRC: ; %bb.0:
+; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; HEURRC-NEXT: s_setpc_b64 s[30:31]
+;
+; VGPRRC-LABEL: request_no_agpr:
+; VGPRRC: ; %bb.0:
+; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT: s_setpc_b64 s[30:31]
+ %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
+ ret <4 x float> %result
+}
+
+attributes #0 = { "amdgpu-agpr-alloc"="32,256" }
+attributes #1 = { "amdgpu-agpr-alloc"="0,0" }
>From 4917b9c32de1a2dc491b55135ccf764a9bf2882f Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Fri, 11 Jul 2025 13:00:09 -0700
Subject: [PATCH 4/5] Formatting
Change-Id: I0740caff1b1453902edd3f1037852c38d37d7117
---
llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 429c7ee5bebf0..3e45698772a66 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -29,11 +29,11 @@ enum { MAX_LANES = 64 };
using namespace llvm;
-cl::opt<bool>
- MFMAVGPRForm("amdgpu-mfma-vgpr-form", cl::Hidden,
- cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
- "unspecified, default to compiler heuristics"),
- cl::init(false));
+cl::opt<bool> MFMAVGPRForm(
+ "amdgpu-mfma-vgpr-form", cl::Hidden,
+ cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
+ "unspecified, default to compiler heuristics"),
+ cl::init(false));
const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
const SITargetLowering *TLI = STI->getTargetLowering();
>From 7f89718779b62dab8801ab18cf583b6962b45ecb Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 14 Jul 2025 10:38:25 -0700
Subject: [PATCH 5/5] Add TODO comment
Change-Id: I7c30d3d3384d5a63bea9a8b4f110260165026911
---
llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 3e45698772a66..ead6ca633fc74 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -29,6 +29,10 @@ enum { MAX_LANES = 64 };
using namespace llvm;
+// TODO -- delete this flag once we have more robust mechanisms to allocate the
+// optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases
+// where it is better to produce the VGPR form (e.g. if there are VGPR users
+// of the MFMA result).
cl::opt<bool> MFMAVGPRForm(
"amdgpu-mfma-vgpr-form", cl::Hidden,
cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
More information about the llvm-commits
mailing list