[llvm] [AMDGPU] Post-RA AGPR copy elimination pass (PR #153901)
Lucas Ramirez via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 15 16:18:23 PDT 2025
https://github.com/lucas-rami created https://github.com/llvm/llvm-project/pull/153901
This introduces a post-RA pass to eliminate useless AGPR-to-AVGPR copies, effectively folding them into MFMAs' OpA and OpB when possible. These cross-class copies appear during register allocation when live-range splitting leads to VGPR-to-AVGPR inflation on some parts of the original live range. RA is then unable to fold at least some of these copies.
The pass eliminates copies in the following kind of situations, making the MFMA use `a[0:3]` as `<opA>` instead.
```
v_accvgpr_read_b32 v0, a0
v_accvgpr_read_b32 v1, a1
v_accvgpr_read_b32 v2, a2
v_accvgpr_read_b32 v3, a3
v_mfma_f32_16x16x32_fp8_fp8 <dst>, v[0:3], <opB>, <opC>
```
This is still somewhat WIP, and for now I am just seeking feedback. I have two questions in particular.
1. Should what this pass does be integrated in the `AMDGPURewriteAGPRCopyMFMA` pass instead of being its own thing?
2. This may be a good short-term solution to get rid of this kind of copy, but the pass is somewhat addressing the symptom rather than the cause of the issue, which is in the register allocator. Is this something we should strive to integrate at all?
>From 1df88d26b77b6968050c1dc4961b27ecb8fb6b04 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Fri, 15 Aug 2025 22:50:59 +0000
Subject: [PATCH] AGPR to VGPR copy elimination
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 11 +
.../AMDGPU/AMDGPUEliminateAGPRToVGPRCopy.cpp | 246 +++++
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 +
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 4 +
llvm/test/CodeGen/AMDGPU/mfma-loop.ll | 946 +++++++++++-------
.../CodeGen/AMDGPU/sgpr-regalloc-flags.ll | 4 +
llvm/test/CodeGen/AMDGPU/spill-agpr.ll | 38 +-
.../secondary/llvm/lib/Target/AMDGPU/BUILD.gn | 1 +
10 files changed, 842 insertions(+), 412 deletions(-)
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUEliminateAGPRToVGPRCopy.cpp
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 007b481f84960..b83e62f4ea57c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -541,6 +541,17 @@ extern char &GCNRewritePartialRegUsesID;
void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &);
extern char &AMDGPUWaitSGPRHazardsLegacyID;
+/// New pass manager wrapper for the AGPR-to-VGPR copy elimination pass. The
+/// definition of run() lives in AMDGPUEliminateAGPRToVGPRCopy.cpp.
+class AMDGPUEliminateAGPRToVGPRCopyPass
+ : public PassInfoMixin<AMDGPUEliminateAGPRToVGPRCopyPass> {
+public:
+ AMDGPUEliminateAGPRToVGPRCopyPass() = default;
+ /// Runs the pass on \p MF, fetching the analyses it needs from \p MFAM, and
+ /// reports which analyses remain valid afterwards.
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+};
+
+// Registration hook and pass identifier of the legacy pass manager version of
+// the AGPR-to-VGPR copy elimination pass.
+void initializeAMDGPUEliminateAGPRToVGPRCopyLegacyPass(PassRegistry &);
+extern char &AMDGPUEliminateAGPRToVGPRCopyLegacyID;
+
class AMDGPURewriteAGPRCopyMFMAPass
: public PassInfoMixin<AMDGPURewriteAGPRCopyMFMAPass> {
public:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUEliminateAGPRToVGPRCopy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUEliminateAGPRToVGPRCopy.cpp
new file mode 100644
index 0000000000000..66e00e13149de
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUEliminateAGPRToVGPRCopy.cpp
@@ -0,0 +1,246 @@
+//===-- AMDGPUEliminateAGPRToVGPRCopy.cpp ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
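+/// \file \brief Eliminates full copies from AGPR-assigned virtual registers
+/// to VGPR- or AGPR-assigned virtual registers once physical registers have
+/// been assigned, by making the users of the copy's destination (MFMA
+/// src0/src1 operands or other full copies) read the copy's source directly.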
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-eliminate-agpr-to-vgpr-copy"
+
+STATISTIC(NumEliminated, "Number of copies eliminated");
+
+namespace {
+
+/// Pass implementation shared by the legacy and new pass manager wrappers
+/// defined in this file. It only holds references to the function's target
+/// information and to the register allocation analyses it reads and updates.
+class AMDGPUEliminateAGPRToVGPRCopyImpl {
+ // Note: ST is declared first because TII and TRI are initialized from it.
+ const GCNSubtarget &ST;
+ const SIInstrInfo &TII;
+ const SIRegisterInfo &TRI;
+ MachineRegisterInfo &MRI;
+ VirtRegMap &VRM;
+ LiveRegMatrix &LRM;
+ LiveIntervals &LIS;
+
+public:
+ /// Binds the implementation to \p MF's subtarget and register info and to
+ /// the provided virtual register map, live register matrix and live
+ /// intervals.
+ AMDGPUEliminateAGPRToVGPRCopyImpl(MachineFunction &MF, VirtRegMap &VRM,
+ LiveRegMatrix &LRM, LiveIntervals &LIS)
+ : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
+ TRI(*ST.getRegisterInfo()), MRI(MF.getRegInfo()), VRM(VRM), LRM(LRM),
+ LIS(LIS) {}
+
+ /// Returns true if every use operand of \p Reg is the src0 or src1 operand
+ /// of an MFMA instruction.
+ bool areAllUsesCompatible(Register Reg) const;
+
+ /// Eliminates eligible copies in \p MF; returns true if the function was
+ /// changed.
+ bool run(MachineFunction &MF) const;
+};
+
+/// Returns true when every use operand of \p Reg is the src0 or the src1
+/// operand of an MFMA instruction. A register without uses trivially
+/// satisfies the condition.
+bool AMDGPUEliminateAGPRToVGPRCopyImpl::areAllUsesCompatible(
+    Register Reg) const {
+  auto IsMFMASrc0OrSrc1 = [this](const MachineOperand &UseOp) {
+    const MachineInstr &UserMI = *UseOp.getParent();
+    // Any user that is not an MFMA disqualifies the register.
+    if (!SIInstrInfo::isMFMA(UserMI))
+      return false;
+
+    // Compare operand addresses: the use must be exactly src0 or src1.
+    const MachineOperand *Src0 =
+        TII.getNamedOperand(UserMI, AMDGPU::OpName::src0);
+    if (Src0 == &UseOp)
+      return true;
+    const MachineOperand *Src1 =
+        TII.getNamedOperand(UserMI, AMDGPU::OpName::src1);
+    return Src1 == &UseOp;
+  };
+
+  return all_of(MRI.use_operands(Reg), IsMFMASrc0OrSrc1);
+}
+
+/// Scans \p MF for full copies whose source virtual register is assigned to
+/// an AGPR and whose destination virtual register is assigned to a VGPR or
+/// AGPR, and eliminates each eligible copy by replacing its destination with
+/// its source.
+///
+/// A copy is eligible when every use of its destination
+///  1. is the src0/src1 operand of an MFMA or is a full copy (this check is
+///     skipped when the destination is itself assigned to an AGPR), and
+///  2. observes the same value of the source register as the copy did.
+///
+/// The live register matrix and live intervals are updated for every
+/// eliminated copy.
+///
+/// \returns true if at least one copy was eliminated.
+bool AMDGPUEliminateAGPRToVGPRCopyImpl::run(MachineFunction &MF) const {
+  // This only applies on subtargets that have a configurable AGPR vs. VGPR
+  // allocation.
+  if (!ST.hasGFX90AInsts())
+    return false;
+
+  // Early exit if no AGPRs were assigned.
+  if (!LRM.isPhysRegUsed(AMDGPU::AGPR0))
+    return false;
+
+  bool MadeChange = false;
+
+  for (MachineBasicBlock &MBB : MF) {
+    // Early-increment iteration because eligible copies are erased below.
+    for (MachineInstr &CopyMI : make_early_inc_range(MBB)) {
+      // Find full copies...
+      if (!CopyMI.isFullCopy())
+        continue;
+
+      // ... whose destination was mapped to a VGPR or AGPR...
+      Register DstReg = CopyMI.getOperand(0).getReg();
+      if (!DstReg.isVirtual())
+        continue;
+      Register DstPhysReg = VRM.getPhys(DstReg);
+      if (!DstPhysReg)
+        continue;
+      const TargetRegisterClass *DstRC = TRI.getPhysRegBaseClass(DstPhysReg);
+      if (!TRI.hasVectorRegisters(DstRC) || TRI.hasSGPRs(DstRC))
+        continue;
+
+      // ... and whose source was mapped to an AGPR.
+      Register SrcReg = CopyMI.getOperand(1).getReg();
+      if (!SrcReg.isVirtual() || SrcReg == DstReg)
+        continue;
+      Register SrcPhysReg = VRM.getPhys(SrcReg);
+      if (!SrcPhysReg)
+        continue;
+      const TargetRegisterClass *SrcRC = TRI.getPhysRegBaseClass(SrcPhysReg);
+      if (!TRI.isAGPRClass(SrcRC))
+        continue;
+
+      bool DstIsAGPR = TRI.hasAGPRs(DstRC);
+
+      LLVM_DEBUG({
+        dbgs() << "AGPR->AVGPR copy: " << CopyMI;
+        dbgs() << " "
+               << printReg(DstReg, &TRI, CopyMI.getOperand(0).getSubReg(), &MRI)
+               << " <-> " << printReg(DstPhysReg, &TRI, 0, &MRI) << "\n";
+        dbgs() << " "
+               << printReg(SrcReg, &TRI, CopyMI.getOperand(1).getSubReg(), &MRI)
+               << " <-> " << printReg(SrcPhysReg, &TRI, 0, &MRI) << "\n";
+      });
+
+      // Value of the source register that the copy reads; every rewritten use
+      // must read this very same value.
+      LiveInterval &SrcLI = LIS.getInterval(SrcReg);
+      const VNInfo *SrcVNI = SrcLI.getVNInfoAt(LIS.getInstructionIndex(CopyMI));
+      assert(SrcVNI && "VNI must exist");
+
+      bool AllUsesCompatible =
+          all_of(MRI.use_operands(DstReg), [&](const MachineOperand &MO) {
+            // Destination's use must be src0/src1 operands of an MFMA or
+            // another copy.
+            const MachineInstr &UseMI = *MO.getParent();
+            if (!DstIsAGPR) {
+              if (SIInstrInfo::isMFMA(UseMI)) {
+                if (&MO != TII.getNamedOperand(UseMI, AMDGPU::OpName::src0) &&
+                    &MO != TII.getNamedOperand(UseMI, AMDGPU::OpName::src1)) {
+                  LLVM_DEBUG(dbgs() << " Incompatible MFMA operand: " << UseMI);
+                  return false;
+                }
+              } else if (!UseMI.isFullCopy()) {
+                LLVM_DEBUG(dbgs() << " Incompatible user: " << UseMI);
+                return false;
+              }
+            } else {
+              LLVM_DEBUG(dbgs() << " Skipping user check (dst is AGPR)\n");
+            }
+
+            // Source must be available at use point. If the source holds a
+            // different value (or none) at the use, replacing the destination
+            // with the source would make the use read the wrong value, so the
+            // copy must be kept.
+            const VNInfo *UseVNI =
+                SrcLI.getVNInfoAt(LIS.getInstructionIndex(UseMI));
+            if (SrcVNI != UseVNI) {
+              LLVM_DEBUG(dbgs() << " AGPR no longer available at " << UseMI);
+              return false;
+            }
+            return true;
+          });
+      if (!AllUsesCompatible)
+        continue;
+
+      LLVM_DEBUG(dbgs() << " -> Eliminated\n");
+      ++NumEliminated;
+
+      // Remove the copy's destination register.
+      MRI.replaceRegWith(DstReg, SrcReg);
+      LRM.unassign(LIS.getInterval(DstReg));
+      LIS.removeInterval(DstReg);
+
+      // Delete the copy instruction.
+      LIS.RemoveMachineInstrFromMaps(CopyMI);
+      CopyMI.eraseFromParent();
+
+      // Recompute the source register's interval.
+      // TODO: necessary? It is already live at all uses by construction.
+      LIS.removeInterval(SrcReg);
+      LIS.createAndComputeVirtRegInterval(SrcReg);
+      MadeChange = true;
+    }
+  }
+
+  return MadeChange;
+}
+
+/// Legacy pass manager wrapper around AMDGPUEliminateAGPRToVGPRCopyImpl.
+class AMDGPUEliminateAGPRToVGPRCopyLegacy : public MachineFunctionPass {
+public:
+ static char ID;
+
+ // Registers the pass with the global pass registry on construction.
+ AMDGPUEliminateAGPRToVGPRCopyLegacy() : MachineFunctionPass(ID) {
+ initializeAMDGPUEliminateAGPRToVGPRCopyLegacyPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "AMDGPU Eliminate AGPR-to-VGPR Copy";
+ }
+
+ // Declares the analyses consumed by the implementation (live intervals,
+ // virtual register map, live register matrix) and marks them as preserved.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LiveIntervalsWrapperPass>();
+ AU.addRequired<VirtRegMapWrapperLegacy>();
+ AU.addRequired<LiveRegMatrixWrapperLegacy>();
+
+ AU.addPreserved<LiveIntervalsWrapperPass>();
+ AU.addPreserved<VirtRegMapWrapperLegacy>();
+ AU.addPreserved<LiveRegMatrixWrapperLegacy>();
+ // NOTE(review): the pass erases COPY instructions when it fires, yet every
+ // analysis is declared preserved here -- confirm that this is intended.
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+// Register the legacy pass under the DEBUG_TYPE name, together with the
+// analysis passes it depends on.
+INITIALIZE_PASS_BEGIN(AMDGPUEliminateAGPRToVGPRCopyLegacy, DEBUG_TYPE,
+ "AMDGPU Eliminate AGPR-to-VGPR Copy", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
+INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy)
+INITIALIZE_PASS_END(AMDGPUEliminateAGPRToVGPRCopyLegacy, DEBUG_TYPE,
+ "AMDGPU Eliminate AGPR-to-VGPR Copy", false, false)
+
+char AMDGPUEliminateAGPRToVGPRCopyLegacy::ID = 0;
+
+// Externally visible reference to the pass ID, declared in AMDGPU.h.
+char &llvm::AMDGPUEliminateAGPRToVGPRCopyLegacyID =
+ AMDGPUEliminateAGPRToVGPRCopyLegacy::ID;
+
+/// Legacy pass manager entry point. Does nothing for functions that
+/// skipFunction() rejects; otherwise fetches the required analyses and
+/// delegates to the shared implementation.
+///
+/// \returns true if the implementation changed \p MF.
+bool AMDGPUEliminateAGPRToVGPRCopyLegacy::runOnMachineFunction(
+    MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  VirtRegMap &RegMap = getAnalysis<VirtRegMapWrapperLegacy>().getVRM();
+  LiveRegMatrix &Matrix = getAnalysis<LiveRegMatrixWrapperLegacy>().getLRM();
+  LiveIntervals &Intervals = getAnalysis<LiveIntervalsWrapperPass>().getLIS();
+
+  return AMDGPUEliminateAGPRToVGPRCopyImpl(MF, RegMap, Matrix, Intervals)
+      .run(MF);
+}
+
+/// New pass manager entry point. Fetches the virtual register map, live
+/// register matrix and live intervals from \p MFAM and delegates to the
+/// shared implementation.
+///
+/// \returns PreservedAnalyses::all() when \p MF was left untouched; otherwise
+/// the default machine function pass set extended with the CFG analyses.
+PreservedAnalyses
+AMDGPUEliminateAGPRToVGPRCopyPass::run(MachineFunction &MF,
+                                       MachineFunctionAnalysisManager &MFAM) {
+  VirtRegMap &RegMap = MFAM.getResult<VirtRegMapAnalysis>(MF);
+  LiveRegMatrix &Matrix = MFAM.getResult<LiveRegMatrixAnalysis>(MF);
+  LiveIntervals &Intervals = MFAM.getResult<LiveIntervalsAnalysis>(MF);
+
+  const bool Changed =
+      AMDGPUEliminateAGPRToVGPRCopyImpl(MF, RegMap, Matrix, Intervals).run(MF);
+  if (Changed) {
+    // Instructions were erased, but the control flow graph is untouched.
+    PreservedAnalyses Preserved = getMachineFunctionPassPreservedAnalyses();
+    Preserved.preserveSet<CFGAnalyses>();
+    return Preserved;
+  }
+  return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index b6c6d927d0e89..34963826d16a5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -106,6 +106,7 @@ MACHINE_FUNCTION_ANALYSIS("amdgpu-resource-usage", AMDGPUResourceUsageAnalysis(*
#endif
MACHINE_FUNCTION_PASS("amdgpu-insert-delay-alu", AMDGPUInsertDelayAluPass())
MACHINE_FUNCTION_PASS("amdgpu-isel", AMDGPUISelDAGToDAGPass(*this))
+MACHINE_FUNCTION_PASS("amdgpu-eliminate-agpr-to-vgpr-copy", AMDGPUEliminateAGPRToVGPRCopyPass())
MACHINE_FUNCTION_PASS("amdgpu-mark-last-scratch-load", AMDGPUMarkLastScratchLoadPass())
MACHINE_FUNCTION_PASS("amdgpu-pre-ra-long-branch-reg", GCNPreRALongBranchRegPass())
MACHINE_FUNCTION_PASS("amdgpu-reserve-wwm-regs", AMDGPUReserveWWMRegsPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c1f17033d04a8..5512b15f4cb31 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -528,6 +528,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPULowerKernelArgumentsPass(*PR);
initializeAMDGPUPromoteKernelArgumentsPass(*PR);
initializeAMDGPULowerKernelAttributesPass(*PR);
+ initializeAMDGPUEliminateAGPRToVGPRCopyLegacyPass(*PR);
initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(*PR);
initializeAMDGPUPostLegalizerCombinerPass(*PR);
initializeAMDGPUPreLegalizerCombinerPass(*PR);
@@ -1594,6 +1595,7 @@ bool GCNPassConfig::addPreRewrite() {
if (EnableRegReassign)
addPass(&GCNNSAReassignID);
+ addPass(&AMDGPUEliminateAGPRToVGPRCopyLegacyID);
addPass(&AMDGPURewriteAGPRCopyMFMALegacyID);
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index c466f9cf0f359..abc8020920e7d 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -53,6 +53,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUCodeGenPrepare.cpp
AMDGPUCombinerHelper.cpp
AMDGPUCtorDtorLowering.cpp
+ AMDGPUEliminateAGPRToVGPRCopy.cpp
AMDGPUExportClustering.cpp
AMDGPUExportKernelRuntimeHandles.cpp
AMDGPUFrameLowering.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 2a5c65278f7dc..ed865dc62592b 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -377,6 +377,7 @@
; GCN-O1-NEXT: Live Register Matrix
; GCN-O1-NEXT: Greedy Register Allocator
; GCN-O1-NEXT: GCN NSA Reassign
+; GCN-O1-NEXT: AMDGPU Eliminate AGPR-to-VGPR Copy
; GCN-O1-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA
; GCN-O1-NEXT: Virtual Register Rewriter
; GCN-O1-NEXT: AMDGPU Mark Last Scratch Load
@@ -689,6 +690,7 @@
; GCN-O1-OPTS-NEXT: Live Register Matrix
; GCN-O1-OPTS-NEXT: Greedy Register Allocator
; GCN-O1-OPTS-NEXT: GCN NSA Reassign
+; GCN-O1-OPTS-NEXT: AMDGPU Eliminate AGPR-to-VGPR Copy
; GCN-O1-OPTS-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA
; GCN-O1-OPTS-NEXT: Virtual Register Rewriter
; GCN-O1-OPTS-NEXT: AMDGPU Mark Last Scratch Load
@@ -1007,6 +1009,7 @@
; GCN-O2-NEXT: Live Register Matrix
; GCN-O2-NEXT: Greedy Register Allocator
; GCN-O2-NEXT: GCN NSA Reassign
+; GCN-O2-NEXT: AMDGPU Eliminate AGPR-to-VGPR Copy
; GCN-O2-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA
; GCN-O2-NEXT: Virtual Register Rewriter
; GCN-O2-NEXT: AMDGPU Mark Last Scratch Load
@@ -1338,6 +1341,7 @@
; GCN-O3-NEXT: Live Register Matrix
; GCN-O3-NEXT: Greedy Register Allocator
; GCN-O3-NEXT: GCN NSA Reassign
+; GCN-O3-NEXT: AMDGPU Eliminate AGPR-to-VGPR Copy
; GCN-O3-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA
; GCN-O3-NEXT: Virtual Register Rewriter
; GCN-O3-NEXT: AMDGPU Mark Last Scratch Load
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index 6110b3101020a..fdf4dfd1eaefa 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -103,6 +103,9 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
; GFX90A-LABEL: test_mfma_loop_zeroinit:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
+; GFX90A-NEXT: s_mov_b32 s0, 16
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0
@@ -134,14 +137,42 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX90A-NEXT: s_mov_b32 s0, 16
-; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
; GFX90A-NEXT: .LBB0_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX90A-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX90A-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX90A-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX90A-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX90A-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX90A-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX90A-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX90A-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX90A-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX90A-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX90A-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX90A-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX90A-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX90A-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX90A-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX90A-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX90A-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX90A-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX90A-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX90A-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX90A-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX90A-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX90A-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX90A-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX90A-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX90A-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX90A-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX90A-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX90A-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX90A-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX90A-NEXT: s_add_i32 s0, s0, -1
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB0_1
; GFX90A-NEXT: ; %bb.2: ; %exit
@@ -149,7 +180,7 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
-; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: s_nop 5
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -163,6 +194,9 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
; GFX942-LABEL: test_mfma_loop_zeroinit:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
+; GFX942-NEXT: s_mov_b32 s0, 16
+; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
; GFX942-NEXT: v_accvgpr_write_b32 a29, 0
; GFX942-NEXT: v_accvgpr_write_b32 a28, 0
@@ -194,14 +228,42 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
; GFX942-NEXT: v_accvgpr_write_b32 a1, 0
; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
; GFX942-NEXT: .LBB0_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX942-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX942-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX942-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX942-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX942-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX942-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX942-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX942-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX942-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX942-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX942-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX942-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX942-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX942-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX942-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX942-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX942-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX942-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX942-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX942-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX942-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX942-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX942-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX942-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX942-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX942-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX942-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX942-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX942-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX942-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX942-NEXT: s_add_i32 s0, s0, -1
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB0_1
; GFX942-NEXT: ; %bb.2: ; %exit
@@ -209,7 +271,7 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
-; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: s_nop 4
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -338,45 +400,45 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000
; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_mov_b32 s0, 16
; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
; GFX90A-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX90A-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX90A-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX90A-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX90A-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX90A-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX90A-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX90A-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX90A-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX90A-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX90A-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX90A-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX90A-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX90A-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX90A-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX90A-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX90A-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX90A-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX90A-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX90A-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX90A-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX90A-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX90A-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX90A-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX90A-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX90A-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX90A-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX90A-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX90A-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX90A-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX90A-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX90A-NEXT: s_add_i32 s0, s0, -1
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB1_1
; GFX90A-NEXT: ; %bb.2: ; %exit
@@ -384,7 +446,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
-; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: s_nop 5
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -399,45 +461,45 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000
; GFX942-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
; GFX942-NEXT: s_mov_b32 s0, 16
; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
; GFX942-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX942-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX942-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX942-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX942-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX942-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX942-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX942-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX942-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX942-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX942-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX942-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX942-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX942-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX942-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX942-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX942-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX942-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX942-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX942-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX942-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX942-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX942-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX942-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX942-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX942-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX942-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX942-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX942-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX942-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX942-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX942-NEXT: s_add_i32 s0, s0, -1
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB1_1
; GFX942-NEXT: ; %bb.2: ; %exit
@@ -445,7 +507,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
-; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: s_nop 4
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -566,8 +628,11 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
;
; GFX90A-LABEL: test_mfma_loop_non_splat:
; GFX90A: ; %bb.0: ; %entry
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0
; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
+; GFX90A-NEXT: s_mov_b32 s0, 16
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0
; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0
@@ -598,14 +663,42 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX90A-NEXT: s_mov_b32 s0, 16
-; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
; GFX90A-NEXT: .LBB2_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX90A-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX90A-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX90A-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX90A-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX90A-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX90A-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX90A-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX90A-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX90A-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX90A-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX90A-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX90A-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX90A-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX90A-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX90A-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX90A-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX90A-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX90A-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX90A-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX90A-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX90A-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX90A-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX90A-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX90A-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX90A-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX90A-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX90A-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX90A-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX90A-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX90A-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX90A-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX90A-NEXT: s_add_i32 s0, s0, -1
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB2_1
; GFX90A-NEXT: ; %bb.2: ; %exit
@@ -613,7 +706,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
-; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: s_nop 5
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -626,8 +719,11 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
;
; GFX942-LABEL: test_mfma_loop_non_splat:
; GFX942: ; %bb.0: ; %entry
-; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0
; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
+; GFX942-NEXT: s_mov_b32 s0, 16
+; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0
; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
; GFX942-NEXT: v_accvgpr_write_b32 a29, 0
; GFX942-NEXT: v_accvgpr_write_b32 a28, 0
@@ -658,14 +754,42 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: v_accvgpr_write_b32 a3, 0
; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
; GFX942-NEXT: .LBB2_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; GFX942-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX942-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX942-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX942-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX942-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX942-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX942-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX942-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX942-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX942-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX942-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX942-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX942-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX942-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX942-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX942-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX942-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX942-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX942-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX942-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX942-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX942-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX942-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX942-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX942-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX942-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX942-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX942-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX942-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX942-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX942-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX942-NEXT: s_add_i32 s0, s0, -1
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB2_1
; GFX942-NEXT: ; %bb.2: ; %exit
@@ -673,7 +797,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
-; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: s_nop 4
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -865,75 +989,75 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x431a0000
; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43190000
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43180000
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43170000
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43160000
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43150000
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43140000
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43130000
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43120000
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43110000
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43100000
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430f0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430e0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430d0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430c0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430b0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430a0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43090000
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43080000
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43070000
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43060000
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43050000
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43040000
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43030000
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43020000
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43010000
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43000000
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fe0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fc0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fa0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f80000
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_mov_b32 s0, 16
; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
; GFX90A-NEXT: .LBB3_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX90A-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX90A-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX90A-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX90A-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX90A-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX90A-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX90A-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX90A-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX90A-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX90A-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX90A-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX90A-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX90A-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX90A-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX90A-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX90A-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX90A-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX90A-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX90A-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX90A-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX90A-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX90A-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX90A-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX90A-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX90A-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX90A-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX90A-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX90A-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX90A-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX90A-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX90A-NEXT: s_add_i32 s0, s0, -1
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB3_1
; GFX90A-NEXT: ; %bb.2: ; %exit
@@ -941,7 +1065,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
-; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: s_nop 5
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -957,75 +1081,75 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
; GFX942-NEXT: v_mov_b32_e32 v0, 0x431a0000
; GFX942-NEXT: v_accvgpr_write_b32 a31, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43190000
-; GFX942-NEXT: v_accvgpr_write_b32 a30, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43180000
-; GFX942-NEXT: v_accvgpr_write_b32 a29, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43170000
-; GFX942-NEXT: v_accvgpr_write_b32 a28, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43160000
-; GFX942-NEXT: v_accvgpr_write_b32 a27, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43150000
-; GFX942-NEXT: v_accvgpr_write_b32 a26, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43140000
-; GFX942-NEXT: v_accvgpr_write_b32 a25, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43130000
-; GFX942-NEXT: v_accvgpr_write_b32 a24, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43120000
-; GFX942-NEXT: v_accvgpr_write_b32 a23, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43110000
-; GFX942-NEXT: v_accvgpr_write_b32 a22, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43100000
-; GFX942-NEXT: v_accvgpr_write_b32 a21, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x430f0000
-; GFX942-NEXT: v_accvgpr_write_b32 a20, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x430e0000
-; GFX942-NEXT: v_accvgpr_write_b32 a19, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x430d0000
-; GFX942-NEXT: v_accvgpr_write_b32 a18, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x430c0000
-; GFX942-NEXT: v_accvgpr_write_b32 a17, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x430b0000
-; GFX942-NEXT: v_accvgpr_write_b32 a16, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x430a0000
-; GFX942-NEXT: v_accvgpr_write_b32 a15, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43090000
-; GFX942-NEXT: v_accvgpr_write_b32 a14, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43080000
-; GFX942-NEXT: v_accvgpr_write_b32 a13, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43070000
-; GFX942-NEXT: v_accvgpr_write_b32 a12, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43060000
-; GFX942-NEXT: v_accvgpr_write_b32 a11, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43050000
-; GFX942-NEXT: v_accvgpr_write_b32 a10, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43040000
-; GFX942-NEXT: v_accvgpr_write_b32 a9, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43030000
-; GFX942-NEXT: v_accvgpr_write_b32 a8, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43020000
-; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43010000
-; GFX942-NEXT: v_accvgpr_write_b32 a6, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43000000
-; GFX942-NEXT: v_accvgpr_write_b32 a5, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fe0000
-; GFX942-NEXT: v_accvgpr_write_b32 a4, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fc0000
-; GFX942-NEXT: v_accvgpr_write_b32 a3, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fa0000
-; GFX942-NEXT: v_accvgpr_write_b32 a2, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f80000
-; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000
-; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
; GFX942-NEXT: s_mov_b32 s0, 16
; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
; GFX942-NEXT: .LBB3_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX942-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX942-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX942-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX942-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX942-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX942-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX942-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX942-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX942-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX942-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX942-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX942-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX942-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX942-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX942-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX942-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX942-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX942-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX942-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX942-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX942-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX942-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX942-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX942-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX942-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX942-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX942-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX942-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX942-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX942-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX942-NEXT: s_add_i32 s0, s0, -1
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB3_1
; GFX942-NEXT: ; %bb.2: ; %exit
@@ -1033,7 +1157,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
-; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: s_nop 4
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1156,45 +1280,45 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_mov_b32 s0, 16
; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
; GFX90A-NEXT: .LBB4_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX90A-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX90A-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX90A-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX90A-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX90A-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX90A-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX90A-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX90A-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX90A-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX90A-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX90A-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX90A-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX90A-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX90A-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX90A-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX90A-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX90A-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX90A-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX90A-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX90A-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX90A-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX90A-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX90A-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX90A-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX90A-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX90A-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX90A-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX90A-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX90A-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX90A-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX90A-NEXT: s_add_i32 s0, s0, -1
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB4_1
; GFX90A-NEXT: ; %bb.2: ; %exit
@@ -1202,7 +1326,7 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
-; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: s_nop 5
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1217,45 +1341,45 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
; GFX942-NEXT: s_mov_b32 s0, 16
; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
; GFX942-NEXT: .LBB4_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX942-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX942-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX942-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX942-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX942-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX942-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX942-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX942-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX942-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX942-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX942-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX942-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX942-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX942-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX942-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX942-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX942-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX942-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX942-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX942-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX942-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX942-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX942-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX942-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX942-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX942-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX942-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX942-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX942-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX942-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX942-NEXT: s_add_i32 s0, s0, -1
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB4_1
; GFX942-NEXT: ; %bb.2: ; %exit
@@ -1263,7 +1387,7 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
-; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: s_nop 4
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1429,43 +1553,43 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s1
; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
; GFX90A-NEXT: .LBB5_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX90A-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX90A-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX90A-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX90A-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX90A-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX90A-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX90A-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX90A-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX90A-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX90A-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX90A-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX90A-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX90A-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX90A-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX90A-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX90A-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX90A-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX90A-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX90A-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX90A-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX90A-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX90A-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX90A-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX90A-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX90A-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX90A-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX90A-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX90A-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX90A-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX90A-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX90A-NEXT: s_add_i32 s0, s0, -1
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB5_1
; GFX90A-NEXT: ; %bb.2: ; %exit
@@ -1473,7 +1597,7 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
-; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: s_nop 5
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1492,43 +1616,43 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v0, s1
; GFX942-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
; GFX942-NEXT: .LBB5_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX942-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX942-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX942-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX942-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX942-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX942-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX942-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX942-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX942-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX942-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX942-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX942-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX942-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX942-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX942-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX942-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX942-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX942-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX942-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX942-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX942-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX942-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX942-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX942-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX942-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX942-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX942-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX942-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX942-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX942-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX942-NEXT: s_add_i32 s0, s0, -1
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB5_1
; GFX942-NEXT: ; %bb.2: ; %exit
@@ -1536,7 +1660,7 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
-; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: s_nop 4
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1695,11 +1819,11 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c
; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
+; GFX90A-NEXT: s_mov_b32 s0, 16
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s1
; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0
@@ -1728,18 +1852,46 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
-; GFX90A-NEXT: s_mov_b32 s0, 16
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
; GFX90A-NEXT: .LBB6_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX90A-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX90A-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX90A-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX90A-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX90A-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX90A-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX90A-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX90A-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX90A-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX90A-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX90A-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX90A-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX90A-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX90A-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX90A-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX90A-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX90A-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX90A-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX90A-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX90A-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX90A-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX90A-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX90A-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX90A-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX90A-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX90A-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX90A-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX90A-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX90A-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX90A-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX90A-NEXT: s_add_i32 s0, s0, -1
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB6_1
; GFX90A-NEXT: ; %bb.2: ; %exit
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
@@ -1759,11 +1911,11 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c
; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
+; GFX942-NEXT: s_mov_b32 s0, 16
+; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s1
; GFX942-NEXT: v_accvgpr_write_b32 a29, 0
; GFX942-NEXT: v_accvgpr_write_b32 a28, 0
; GFX942-NEXT: v_accvgpr_write_b32 a27, 0
@@ -1792,18 +1944,46 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
; GFX942-NEXT: v_accvgpr_write_b32 a4, 0
; GFX942-NEXT: v_accvgpr_write_b32 a3, 0
; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
-; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
; GFX942-NEXT: .LBB6_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX942-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX942-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX942-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX942-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX942-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX942-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX942-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX942-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX942-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX942-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX942-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX942-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX942-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX942-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX942-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX942-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX942-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX942-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX942-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX942-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX942-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX942-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX942-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX942-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX942-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX942-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX942-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX942-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX942-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX942-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX942-NEXT: s_add_i32 s0, s0, -1
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB6_1
; GFX942-NEXT: ; %bb.2: ; %exit
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
@@ -2152,42 +2332,43 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
-; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a15, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a19, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a23, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0
; GFX90A-NEXT: .LBB8_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX90A-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX90A-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX90A-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX90A-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX90A-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX90A-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX90A-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX90A-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX90A-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX90A-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX90A-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX90A-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX90A-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX90A-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX90A-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX90A-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX90A-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX90A-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX90A-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX90A-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX90A-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX90A-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX90A-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX90A-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX90A-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX90A-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX90A-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX90A-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX90A-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX90A-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX90A-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX90A-NEXT: s_add_i32 s0, s0, -1
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB8_1
; GFX90A-NEXT: ; %bb.2: ; %exit
@@ -2195,7 +2376,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
-; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: s_nop 5
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -2216,42 +2397,43 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a8, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a9, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a10, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a11, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a12, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a13, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a14, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a15, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a16, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a17, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a18, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a19, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a20, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a21, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a22, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a23, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a24, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a25, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a26, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a27, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a28, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a29, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a30, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0
; GFX942-NEXT: .LBB8_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; GFX942-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX942-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX942-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX942-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX942-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX942-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX942-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX942-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX942-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX942-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX942-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX942-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX942-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX942-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX942-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX942-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX942-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX942-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX942-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX942-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX942-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX942-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX942-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX942-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX942-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX942-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX942-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX942-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX942-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX942-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX942-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX942-NEXT: s_add_i32 s0, s0, -1
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB8_1
; GFX942-NEXT: ; %bb.2: ; %exit
@@ -2259,7 +2441,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
-; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: s_nop 4
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
index ea6449b99b516..aaca68f3d606f 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
@@ -31,6 +31,7 @@
; DEFAULT-NEXT: Live Register Matrix
; DEFAULT-NEXT: Greedy Register Allocator
; DEFAULT-NEXT: GCN NSA Reassign
+; DEFAULT-NEXT: AMDGPU Eliminate AGPR-to-VGPR Copy
; DEFAULT-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA
; DEFAULT-NEXT: Virtual Register Rewriter
; DEFAULT-NEXT: AMDGPU Mark Last Scratch Load
@@ -78,6 +79,7 @@
; BASIC-DEFAULT-NEXT: Live Register Matrix
; BASIC-DEFAULT-NEXT: Greedy Register Allocator
; BASIC-DEFAULT-NEXT: GCN NSA Reassign
+; BASIC-DEFAULT-NEXT: AMDGPU Eliminate AGPR-to-VGPR Copy
; BASIC-DEFAULT-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA
; BASIC-DEFAULT-NEXT: Virtual Register Rewriter
; BASIC-DEFAULT-NEXT: AMDGPU Mark Last Scratch Load
@@ -101,6 +103,7 @@
; DEFAULT-BASIC-NEXT: Live Register Matrix
; DEFAULT-BASIC-NEXT: Basic Register Allocator
; DEFAULT-BASIC-NEXT: GCN NSA Reassign
+; DEFAULT-BASIC-NEXT: AMDGPU Eliminate AGPR-to-VGPR Copy
; DEFAULT-BASIC-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA
; DEFAULT-BASIC-NEXT: Virtual Register Rewriter
; DEFAULT-BASIC-NEXT: AMDGPU Mark Last Scratch Load
@@ -130,6 +133,7 @@
; BASIC-BASIC-NEXT: Live Register Matrix
; BASIC-BASIC-NEXT: Basic Register Allocator
; BASIC-BASIC-NEXT: GCN NSA Reassign
+; BASIC-BASIC-NEXT: AMDGPU Eliminate AGPR-to-VGPR Copy
; BASIC-BASIC-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA
; BASIC-BASIC-NEXT: Virtual Register Rewriter
; BASIC-BASIC-NEXT: AMDGPU Mark Last Scratch Load
diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
index eb0d5465cacd9..b43c19f44614c 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
@@ -77,26 +77,18 @@ define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, ptr addrspace(1) %arg,
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_endpgm
; GFX90A-NEXT: .LBB0_2: ; %use
-; GFX90A-NEXT: s_nop 3
-; GFX90A-NEXT: v_accvgpr_read_b32 v9, a7
-; GFX90A-NEXT: v_accvgpr_read_b32 v8, a6
-; GFX90A-NEXT: v_accvgpr_read_b32 v7, a5
-; GFX90A-NEXT: v_accvgpr_read_b32 v6, a4
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: v_accvgpr_write_b32 a4, 4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a8, 5
; GFX90A-NEXT: v_accvgpr_write_b32 a9, 1
; GFX90A-NEXT: v_accvgpr_write_b32 a10, 2
; GFX90A-NEXT: v_accvgpr_write_b32 a11, 3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v6
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v7
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v8
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v9
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
@@ -155,26 +147,14 @@ define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v5, a3
-; GFX90A-NEXT: v_accvgpr_read_b32 v4, a2
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_endpgm
@@ -331,7 +311,6 @@ define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 {
; GFX90A-NEXT: v_accvgpr_write_b32 a1, 2.0
; GFX90A-NEXT: v_accvgpr_write_b32 a29, v2
; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41f80000
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0
; GFX90A-NEXT: v_accvgpr_write_b32 a0, 1.0
; GFX90A-NEXT: v_accvgpr_write_b32 a3, 4.0
; GFX90A-NEXT: v_accvgpr_write_b32 a30, v2
@@ -339,13 +318,12 @@ define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 {
; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v1, a[0:31]
-; GFX90A-NEXT: s_nop 7
-; GFX90A-NEXT: s_nop 7
-; GFX90A-NEXT: s_nop 2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: global_store_dword v0, a0, s[2:3]
; GFX90A-NEXT: s_endpgm
bb:
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
index 3d11ce566207a..d907aee996390 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
@@ -141,6 +141,7 @@ static_library("LLVMAMDGPUCodeGen") {
"AMDGPUCodeGenPrepare.cpp",
"AMDGPUCombinerHelper.cpp",
"AMDGPUCtorDtorLowering.cpp",
+ "AMDGPUEliminateAGPRToVGPRCopy.cpp",
"AMDGPUExportClustering.cpp",
"AMDGPUExportKernelRuntimeHandles.cpp",
"AMDGPUFrameLowering.cpp",
More information about the llvm-commits
mailing list