[llvm] [AMDGPU] Post-RA AGPR copy elimination pass (PR #153901)

Lucas Ramirez via llvm-commits llvm-commits at lists.llvm.org
Fri Aug 15 16:21:53 PDT 2025


https://github.com/lucas-rami updated https://github.com/llvm/llvm-project/pull/153901

>From 1df88d26b77b6968050c1dc4961b27ecb8fb6b04 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Fri, 15 Aug 2025 22:50:59 +0000
Subject: [PATCH 1/2] AGPR to VGPR copy elimination

---
 llvm/lib/Target/AMDGPU/AMDGPU.h               |  11 +
 .../AMDGPU/AMDGPUEliminateAGPRToVGPRCopy.cpp  | 246 +++++
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def |   1 +
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   2 +
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |   1 +
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      |   4 +
 llvm/test/CodeGen/AMDGPU/mfma-loop.ll         | 946 +++++++++++-------
 .../CodeGen/AMDGPU/sgpr-regalloc-flags.ll     |   4 +
 llvm/test/CodeGen/AMDGPU/spill-agpr.ll        |  38 +-
 .../secondary/llvm/lib/Target/AMDGPU/BUILD.gn |   1 +
 10 files changed, 842 insertions(+), 412 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUEliminateAGPRToVGPRCopy.cpp

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 007b481f84960..b83e62f4ea57c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -541,6 +541,17 @@ extern char &GCNRewritePartialRegUsesID;
 void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &);
 extern char &AMDGPUWaitSGPRHazardsLegacyID;
 
+class AMDGPUEliminateAGPRToVGPRCopyPass // New pass manager interface.
+    : public PassInfoMixin<AMDGPUEliminateAGPRToVGPRCopyPass> {
+public:
+  AMDGPUEliminateAGPRToVGPRCopyPass() = default;
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+};
+
+void initializeAMDGPUEliminateAGPRToVGPRCopyLegacyPass(PassRegistry &);
+extern char &AMDGPUEliminateAGPRToVGPRCopyLegacyID;
+
 class AMDGPURewriteAGPRCopyMFMAPass
     : public PassInfoMixin<AMDGPURewriteAGPRCopyMFMAPass> {
 public:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUEliminateAGPRToVGPRCopy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUEliminateAGPRToVGPRCopy.cpp
new file mode 100644
index 0000000000000..66e00e13149de
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUEliminateAGPRToVGPRCopy.cpp
@@ -0,0 +1,246 @@
+//===-- AMDGPUEliminateAGPRToVGPRCopy.cpp ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file \brief Post-RA elimination of copies from AGPRs into AGPRs/VGPRs.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-eliminate-agpr-to-vgpr-copy"
+
+STATISTIC(NumEliminated, "Number of copies eliminated");
+
+namespace {
+
+class AMDGPUEliminateAGPRToVGPRCopyImpl { // Shared by both pass wrappers.
+  const GCNSubtarget &ST;     // Subtarget of the processed function.
+  const SIInstrInfo &TII;     // Obtained from ST.
+  const SIRegisterInfo &TRI;  // Obtained from ST.
+  MachineRegisterInfo &MRI;   // Register info of the processed function.
+  VirtRegMap &VRM;
+  LiveRegMatrix &LRM;
+  LiveIntervals &LIS;
+
+public:
+  AMDGPUEliminateAGPRToVGPRCopyImpl(MachineFunction &MF, VirtRegMap &VRM,
+                                    LiveRegMatrix &LRM, LiveIntervals &LIS)
+      : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
+        TRI(*ST.getRegisterInfo()), MRI(MF.getRegInfo()), VRM(VRM), LRM(LRM),
+        LIS(LIS) {}
+  /// \returns true if every use of \p Reg is the src0 or src1 of an MFMA.
+  bool areAllUsesCompatible(Register Reg) const;
+  /// Eliminates eligible copies in \p MF. \returns true if \p MF changed.
+  bool run(MachineFunction &MF) const;
+};
+
+/// \returns true iff every use of \p Reg is the src0 or src1 of an MFMA.
+bool AMDGPUEliminateAGPRToVGPRCopyImpl::areAllUsesCompatible(
+    Register Reg) const {
+  return none_of(MRI.use_operands(Reg), [this](const MachineOperand &Use) {
+    const MachineInstr &User = *Use.getParent();
+    return !SIInstrInfo::isMFMA(User) ||
+           (&Use != TII.getNamedOperand(User, AMDGPU::OpName::src0) &&
+            &Use != TII.getNamedOperand(User, AMDGPU::OpName::src1));
+  });
+}
+
+/// Erase full copies out of AGPR-assigned registers when every use of the
+/// destination can read the source directly. \returns true on any change.
+bool AMDGPUEliminateAGPRToVGPRCopyImpl::run(MachineFunction &MF) const {
+  // This only applies on subtargets that have a configurable AGPR vs. VGPR
+  // allocation.
+  if (!ST.hasGFX90AInsts())
+    return false;
+
+  // Early exit if no AGPRs were assigned.
+  if (!LRM.isPhysRegUsed(AMDGPU::AGPR0))
+    return false;
+
+  bool MadeChange = false;
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &CopyMI : make_early_inc_range(MBB)) {
+      // Find full copies...
+      if (!CopyMI.isFullCopy())
+        continue;
+
+      // ... whose destination was mapped to a VGPR or AGPR...
+      Register DstReg = CopyMI.getOperand(0).getReg();
+      if (!DstReg.isVirtual())
+        continue;
+      Register DstPhysReg = VRM.getPhys(DstReg);
+      if (!DstPhysReg)
+        continue;
+      const TargetRegisterClass *DstRC = TRI.getPhysRegBaseClass(DstPhysReg);
+      if (!TRI.hasVectorRegisters(DstRC) || TRI.hasSGPRs(DstRC))
+        continue;
+
+      // ... and whose source was mapped to an AGPR.
+      Register SrcReg = CopyMI.getOperand(1).getReg();
+      if (!SrcReg.isVirtual() || SrcReg == DstReg)
+        continue;
+      Register SrcPhysReg = VRM.getPhys(SrcReg);
+      if (!SrcPhysReg)
+        continue;
+      const TargetRegisterClass *SrcRC = TRI.getPhysRegBaseClass(SrcPhysReg);
+      if (!TRI.isAGPRClass(SrcRC))
+        continue;
+
+      bool DstIsAGPR = TRI.hasAGPRs(DstRC);
+
+      LLVM_DEBUG({
+        dbgs() << "AGPR->AVGPR copy: " << CopyMI;
+        dbgs() << "                  "
+               << printReg(DstReg, &TRI, CopyMI.getOperand(0).getSubReg(), &MRI)
+               << " <-> " << printReg(DstPhysReg, &TRI, 0, &MRI) << "\n";
+        dbgs() << "                  "
+               << printReg(SrcReg, &TRI, CopyMI.getOperand(1).getSubReg(), &MRI)
+               << " <-> " << printReg(SrcPhysReg, &TRI, 0, &MRI) << "\n";
+      });
+
+      LiveInterval &SrcLI = LIS.getInterval(SrcReg);
+      const VNInfo *SrcVNI = SrcLI.getVNInfoAt(LIS.getInstructionIndex(CopyMI));
+      assert(SrcVNI && "VNI must exist");
+      bool AllUsesCompatible =
+          all_of(MRI.use_operands(DstReg), [&](const MachineOperand &MO) {
+            // Destination's use must be src0/src1 operands of an MFMA or
+            // another copy.
+            const MachineInstr &UseMI = *MO.getParent();
+            if (!DstIsAGPR) {
+              if (SIInstrInfo::isMFMA(UseMI)) {
+                if (&MO != TII.getNamedOperand(UseMI, AMDGPU::OpName::src0) &&
+                    &MO != TII.getNamedOperand(UseMI, AMDGPU::OpName::src1)) {
+                  LLVM_DEBUG(dbgs() << "  Incompatible MFMA operand: " << UseMI);
+                  return false;
+                }
+              } else if (!UseMI.isFullCopy()) {
+                LLVM_DEBUG(dbgs() << "  Incompatible user: " << UseMI);
+                return false;
+              }
+            } else {
+              LLVM_DEBUG(dbgs() << " Skipping user check (dst is AGPR)\n");
+            }
+
+            // Source must still hold the copied value at the use point.
+            const VNInfo *UseVNI =
+                SrcLI.getVNInfoAt(LIS.getInstructionIndex(UseMI));
+            if (SrcVNI != UseVNI) {
+              LLVM_DEBUG(dbgs() << "  AGPR no longer available at " << UseMI);
+              return false;
+            }
+            return true;
+          });
+      if (!AllUsesCompatible)
+        continue;
+
+      LLVM_DEBUG(dbgs() << "  -> Eliminated\n");
+      ++NumEliminated;
+      // Remove the copy's destination register.
+      MRI.replaceRegWith(DstReg, SrcReg);
+      LRM.unassign(LIS.getInterval(DstReg));
+      LIS.removeInterval(DstReg);
+      // Delete the copy instruction.
+      LIS.RemoveMachineInstrFromMaps(CopyMI);
+      CopyMI.eraseFromParent();
+      // Rebuild the source's interval (the copy was one of its uses), swapping
+      // it out of the matrix first so no stale interval stays assigned there.
+      LRM.unassign(SrcLI);
+      LIS.removeInterval(SrcReg);
+      LRM.assign(LIS.createAndComputeVirtRegInterval(SrcReg),
+                 SrcPhysReg.asMCReg());
+      MadeChange = true;
+    }
+  }
+
+  return MadeChange;
+}
+
+class AMDGPUEliminateAGPRToVGPRCopyLegacy : public MachineFunctionPass {
+public:
+  static char ID;
+  /// Registers the pass with the global PassRegistry on construction.
+  AMDGPUEliminateAGPRToVGPRCopyLegacy() : MachineFunctionPass(ID) {
+    initializeAMDGPUEliminateAGPRToVGPRCopyLegacyPass(
+        *PassRegistry::getPassRegistry());
+  }
+  /// Runs the elimination on \p MF; \returns true if \p MF was modified.
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "AMDGPU Eliminate AGPR-to-VGPR Copy";
+  }
+  /// Requires and preserves LiveIntervals, VirtRegMap and LiveRegMatrix.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LiveIntervalsWrapperPass>();
+    AU.addRequired<VirtRegMapWrapperLegacy>();
+    AU.addRequired<LiveRegMatrixWrapperLegacy>();
+
+    AU.addPreserved<LiveIntervalsWrapperPass>();
+    AU.addPreserved<VirtRegMapWrapperLegacy>();
+    AU.addPreserved<LiveRegMatrixWrapperLegacy>();
+    AU.setPreservesAll(); // Additionally marks every analysis as preserved.
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(AMDGPUEliminateAGPRToVGPRCopyLegacy, DEBUG_TYPE,
+                      "AMDGPU Eliminate AGPR-to-VGPR Copy", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) // Required analyses.
+INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
+INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy)
+INITIALIZE_PASS_END(AMDGPUEliminateAGPRToVGPRCopyLegacy, DEBUG_TYPE,
+                    "AMDGPU Eliminate AGPR-to-VGPR Copy", false, false)
+
+char AMDGPUEliminateAGPRToVGPRCopyLegacy::ID = 0; // Pass ID storage.
+
+char &llvm::AMDGPUEliminateAGPRToVGPRCopyLegacyID =
+    AMDGPUEliminateAGPRToVGPRCopyLegacy::ID; // Declared extern in AMDGPU.h.
+
+/// Legacy PM entry point: fetch the analyses and run the implementation.
+bool AMDGPUEliminateAGPRToVGPRCopyLegacy::runOnMachineFunction(
+    MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  VirtRegMap &Map = getAnalysis<VirtRegMapWrapperLegacy>().getVRM();
+  LiveRegMatrix &Matrix = getAnalysis<LiveRegMatrixWrapperLegacy>().getLRM();
+  LiveIntervals &Intervals = getAnalysis<LiveIntervalsWrapperPass>().getLIS();
+
+  return AMDGPUEliminateAGPRToVGPRCopyImpl(MF, Map, Matrix, Intervals).run(MF);
+}
+
+/// New PM entry point; everything is preserved if no copy was eliminated.
+PreservedAnalyses
+AMDGPUEliminateAGPRToVGPRCopyPass::run(MachineFunction &MF,
+                                       MachineFunctionAnalysisManager &MFAM) {
+  auto &Map = MFAM.getResult<VirtRegMapAnalysis>(MF);
+  auto &Matrix = MFAM.getResult<LiveRegMatrixAnalysis>(MF);
+  auto &Intervals = MFAM.getResult<LiveIntervalsAnalysis>(MF);
+  if (AMDGPUEliminateAGPRToVGPRCopyImpl(MF, Map, Matrix, Intervals).run(MF)) {
+    PreservedAnalyses Kept = getMachineFunctionPassPreservedAnalyses();
+    Kept.preserveSet<CFGAnalyses>();
+    return Kept;
+  }
+  return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index b6c6d927d0e89..34963826d16a5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -106,6 +106,7 @@ MACHINE_FUNCTION_ANALYSIS("amdgpu-resource-usage", AMDGPUResourceUsageAnalysis(*
 #endif
 MACHINE_FUNCTION_PASS("amdgpu-insert-delay-alu", AMDGPUInsertDelayAluPass())
 MACHINE_FUNCTION_PASS("amdgpu-isel", AMDGPUISelDAGToDAGPass(*this))
+MACHINE_FUNCTION_PASS("amdgpu-eliminate-agpr-to-vgpr-copy", AMDGPUEliminateAGPRToVGPRCopyPass())
 MACHINE_FUNCTION_PASS("amdgpu-mark-last-scratch-load", AMDGPUMarkLastScratchLoadPass())
 MACHINE_FUNCTION_PASS("amdgpu-pre-ra-long-branch-reg", GCNPreRALongBranchRegPass())
 MACHINE_FUNCTION_PASS("amdgpu-reserve-wwm-regs", AMDGPUReserveWWMRegsPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c1f17033d04a8..5512b15f4cb31 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -528,6 +528,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPULowerKernelArgumentsPass(*PR);
   initializeAMDGPUPromoteKernelArgumentsPass(*PR);
   initializeAMDGPULowerKernelAttributesPass(*PR);
+  initializeAMDGPUEliminateAGPRToVGPRCopyLegacyPass(*PR);
   initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(*PR);
   initializeAMDGPUPostLegalizerCombinerPass(*PR);
   initializeAMDGPUPreLegalizerCombinerPass(*PR);
@@ -1594,6 +1595,7 @@ bool GCNPassConfig::addPreRewrite() {
   if (EnableRegReassign)
     addPass(&GCNNSAReassignID);
 
+  addPass(&AMDGPUEliminateAGPRToVGPRCopyLegacyID);
   addPass(&AMDGPURewriteAGPRCopyMFMALegacyID);
   return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index c466f9cf0f359..abc8020920e7d 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -53,6 +53,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUCodeGenPrepare.cpp
   AMDGPUCombinerHelper.cpp
   AMDGPUCtorDtorLowering.cpp
+  AMDGPUEliminateAGPRToVGPRCopy.cpp
   AMDGPUExportClustering.cpp
   AMDGPUExportKernelRuntimeHandles.cpp
   AMDGPUFrameLowering.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 2a5c65278f7dc..ed865dc62592b 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -377,6 +377,7 @@
 ; GCN-O1-NEXT:        Live Register Matrix
 ; GCN-O1-NEXT:        Greedy Register Allocator
 ; GCN-O1-NEXT:        GCN NSA Reassign
+; GCN-O1-NEXT:        AMDGPU Eliminate AGPR-to-VGPR Copy
 ; GCN-O1-NEXT:        AMDGPU Rewrite AGPR-Copy-MFMA
 ; GCN-O1-NEXT:        Virtual Register Rewriter
 ; GCN-O1-NEXT:        AMDGPU Mark Last Scratch Load
@@ -689,6 +690,7 @@
 ; GCN-O1-OPTS-NEXT:        Live Register Matrix
 ; GCN-O1-OPTS-NEXT:        Greedy Register Allocator
 ; GCN-O1-OPTS-NEXT:        GCN NSA Reassign
+; GCN-O1-OPTS-NEXT:        AMDGPU Eliminate AGPR-to-VGPR Copy
 ; GCN-O1-OPTS-NEXT:        AMDGPU Rewrite AGPR-Copy-MFMA
 ; GCN-O1-OPTS-NEXT:        Virtual Register Rewriter
 ; GCN-O1-OPTS-NEXT:        AMDGPU Mark Last Scratch Load
@@ -1007,6 +1009,7 @@
 ; GCN-O2-NEXT:        Live Register Matrix
 ; GCN-O2-NEXT:        Greedy Register Allocator
 ; GCN-O2-NEXT:        GCN NSA Reassign
+; GCN-O2-NEXT:        AMDGPU Eliminate AGPR-to-VGPR Copy
 ; GCN-O2-NEXT:        AMDGPU Rewrite AGPR-Copy-MFMA
 ; GCN-O2-NEXT:        Virtual Register Rewriter
 ; GCN-O2-NEXT:        AMDGPU Mark Last Scratch Load
@@ -1338,6 +1341,7 @@
 ; GCN-O3-NEXT:        Live Register Matrix
 ; GCN-O3-NEXT:        Greedy Register Allocator
 ; GCN-O3-NEXT:        GCN NSA Reassign
+; GCN-O3-NEXT:        AMDGPU Eliminate AGPR-to-VGPR Copy
 ; GCN-O3-NEXT:        AMDGPU Rewrite AGPR-Copy-MFMA
 ; GCN-O3-NEXT:        Virtual Register Rewriter
 ; GCN-O3-NEXT:        AMDGPU Mark Last Scratch Load
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index 6110b3101020a..fdf4dfd1eaefa 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -103,6 +103,9 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
 ; GFX90A-LABEL: test_mfma_loop_zeroinit:
 ; GFX90A:       ; %bb.0: ; %entry
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a31, 0
+; GFX90A-NEXT:    s_mov_b32 s0, 16
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a30, 0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a29, 0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a28, 0
@@ -134,14 +137,42 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a2, 0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a1, 0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a0, 0
-; GFX90A-NEXT:    s_mov_b32 s0, 16
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX90A-NEXT:  .LBB0_1: ; %for.cond.preheader
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_nop 1
-; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT:    ; kill: def $agpr30 killed $agpr30
+; GFX90A-NEXT:    ; kill: def $agpr29 killed $agpr29
+; GFX90A-NEXT:    ; kill: def $agpr28 killed $agpr28
+; GFX90A-NEXT:    ; kill: def $agpr27 killed $agpr27
+; GFX90A-NEXT:    ; kill: def $agpr26 killed $agpr26
+; GFX90A-NEXT:    ; kill: def $agpr25 killed $agpr25
+; GFX90A-NEXT:    ; kill: def $agpr24 killed $agpr24
+; GFX90A-NEXT:    ; kill: def $agpr23 killed $agpr23
+; GFX90A-NEXT:    ; kill: def $agpr22 killed $agpr22
+; GFX90A-NEXT:    ; kill: def $agpr21 killed $agpr21
+; GFX90A-NEXT:    ; kill: def $agpr20 killed $agpr20
+; GFX90A-NEXT:    ; kill: def $agpr19 killed $agpr19
+; GFX90A-NEXT:    ; kill: def $agpr18 killed $agpr18
+; GFX90A-NEXT:    ; kill: def $agpr17 killed $agpr17
+; GFX90A-NEXT:    ; kill: def $agpr16 killed $agpr16
+; GFX90A-NEXT:    ; kill: def $agpr15 killed $agpr15
+; GFX90A-NEXT:    ; kill: def $agpr14 killed $agpr14
+; GFX90A-NEXT:    ; kill: def $agpr13 killed $agpr13
+; GFX90A-NEXT:    ; kill: def $agpr12 killed $agpr12
+; GFX90A-NEXT:    ; kill: def $agpr11 killed $agpr11
+; GFX90A-NEXT:    ; kill: def $agpr10 killed $agpr10
+; GFX90A-NEXT:    ; kill: def $agpr9 killed $agpr9
+; GFX90A-NEXT:    ; kill: def $agpr8 killed $agpr8
+; GFX90A-NEXT:    ; kill: def $agpr7 killed $agpr7
+; GFX90A-NEXT:    ; kill: def $agpr6 killed $agpr6
+; GFX90A-NEXT:    ; kill: def $agpr5 killed $agpr5
+; GFX90A-NEXT:    ; kill: def $agpr4 killed $agpr4
+; GFX90A-NEXT:    ; kill: def $agpr3 killed $agpr3
+; GFX90A-NEXT:    ; kill: def $agpr2 killed $agpr2
+; GFX90A-NEXT:    ; kill: def $agpr1 killed $agpr1
+; GFX90A-NEXT:    ; kill: def $agpr0 killed $agpr0
 ; GFX90A-NEXT:    s_add_i32 s0, s0, -1
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
 ; GFX90A-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX90A-NEXT:    s_cbranch_scc1 .LBB0_1
 ; GFX90A-NEXT:  ; %bb.2: ; %exit
@@ -149,7 +180,7 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 4
+; GFX90A-NEXT:    s_nop 5
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -163,6 +194,9 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
 ; GFX942-LABEL: test_mfma_loop_zeroinit:
 ; GFX942:       ; %bb.0: ; %entry
 ; GFX942-NEXT:    v_accvgpr_write_b32 a31, 0
+; GFX942-NEXT:    s_mov_b32 s0, 16
+; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a30, 0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a29, 0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a28, 0
@@ -194,14 +228,42 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
 ; GFX942-NEXT:    v_accvgpr_write_b32 a2, 0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a1, 0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a0, 0
-; GFX942-NEXT:    s_mov_b32 s0, 16
-; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
-; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX942-NEXT:  .LBB0_1: ; %for.cond.preheader
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT:    ; kill: def $agpr30 killed $agpr30
+; GFX942-NEXT:    ; kill: def $agpr29 killed $agpr29
+; GFX942-NEXT:    ; kill: def $agpr28 killed $agpr28
+; GFX942-NEXT:    ; kill: def $agpr27 killed $agpr27
+; GFX942-NEXT:    ; kill: def $agpr26 killed $agpr26
+; GFX942-NEXT:    ; kill: def $agpr25 killed $agpr25
+; GFX942-NEXT:    ; kill: def $agpr24 killed $agpr24
+; GFX942-NEXT:    ; kill: def $agpr23 killed $agpr23
+; GFX942-NEXT:    ; kill: def $agpr22 killed $agpr22
+; GFX942-NEXT:    ; kill: def $agpr21 killed $agpr21
+; GFX942-NEXT:    ; kill: def $agpr20 killed $agpr20
+; GFX942-NEXT:    ; kill: def $agpr19 killed $agpr19
+; GFX942-NEXT:    ; kill: def $agpr18 killed $agpr18
+; GFX942-NEXT:    ; kill: def $agpr17 killed $agpr17
+; GFX942-NEXT:    ; kill: def $agpr16 killed $agpr16
+; GFX942-NEXT:    ; kill: def $agpr15 killed $agpr15
+; GFX942-NEXT:    ; kill: def $agpr14 killed $agpr14
+; GFX942-NEXT:    ; kill: def $agpr13 killed $agpr13
+; GFX942-NEXT:    ; kill: def $agpr12 killed $agpr12
+; GFX942-NEXT:    ; kill: def $agpr11 killed $agpr11
+; GFX942-NEXT:    ; kill: def $agpr10 killed $agpr10
+; GFX942-NEXT:    ; kill: def $agpr9 killed $agpr9
+; GFX942-NEXT:    ; kill: def $agpr8 killed $agpr8
+; GFX942-NEXT:    ; kill: def $agpr7 killed $agpr7
+; GFX942-NEXT:    ; kill: def $agpr6 killed $agpr6
+; GFX942-NEXT:    ; kill: def $agpr5 killed $agpr5
+; GFX942-NEXT:    ; kill: def $agpr4 killed $agpr4
+; GFX942-NEXT:    ; kill: def $agpr3 killed $agpr3
+; GFX942-NEXT:    ; kill: def $agpr2 killed $agpr2
+; GFX942-NEXT:    ; kill: def $agpr1 killed $agpr1
+; GFX942-NEXT:    ; kill: def $agpr0 killed $agpr0
 ; GFX942-NEXT:    s_add_i32 s0, s0, -1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
 ; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX942-NEXT:    s_cbranch_scc1 .LBB0_1
 ; GFX942-NEXT:  ; %bb.2: ; %exit
@@ -209,7 +271,7 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 3
+; GFX942-NEXT:    s_nop 4
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -338,45 +400,45 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
 ; GFX90A:       ; %bb.0: ; %entry
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x42f60000
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
 ; GFX90A-NEXT:    s_mov_b32 s0, 16
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX90A-NEXT:  .LBB1_1: ; %for.cond.preheader
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_nop 1
-; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT:    ; kill: def $agpr30 killed $agpr30
+; GFX90A-NEXT:    ; kill: def $agpr29 killed $agpr29
+; GFX90A-NEXT:    ; kill: def $agpr28 killed $agpr28
+; GFX90A-NEXT:    ; kill: def $agpr27 killed $agpr27
+; GFX90A-NEXT:    ; kill: def $agpr26 killed $agpr26
+; GFX90A-NEXT:    ; kill: def $agpr25 killed $agpr25
+; GFX90A-NEXT:    ; kill: def $agpr24 killed $agpr24
+; GFX90A-NEXT:    ; kill: def $agpr23 killed $agpr23
+; GFX90A-NEXT:    ; kill: def $agpr22 killed $agpr22
+; GFX90A-NEXT:    ; kill: def $agpr21 killed $agpr21
+; GFX90A-NEXT:    ; kill: def $agpr20 killed $agpr20
+; GFX90A-NEXT:    ; kill: def $agpr19 killed $agpr19
+; GFX90A-NEXT:    ; kill: def $agpr18 killed $agpr18
+; GFX90A-NEXT:    ; kill: def $agpr17 killed $agpr17
+; GFX90A-NEXT:    ; kill: def $agpr16 killed $agpr16
+; GFX90A-NEXT:    ; kill: def $agpr15 killed $agpr15
+; GFX90A-NEXT:    ; kill: def $agpr14 killed $agpr14
+; GFX90A-NEXT:    ; kill: def $agpr13 killed $agpr13
+; GFX90A-NEXT:    ; kill: def $agpr12 killed $agpr12
+; GFX90A-NEXT:    ; kill: def $agpr11 killed $agpr11
+; GFX90A-NEXT:    ; kill: def $agpr10 killed $agpr10
+; GFX90A-NEXT:    ; kill: def $agpr9 killed $agpr9
+; GFX90A-NEXT:    ; kill: def $agpr8 killed $agpr8
+; GFX90A-NEXT:    ; kill: def $agpr7 killed $agpr7
+; GFX90A-NEXT:    ; kill: def $agpr6 killed $agpr6
+; GFX90A-NEXT:    ; kill: def $agpr5 killed $agpr5
+; GFX90A-NEXT:    ; kill: def $agpr4 killed $agpr4
+; GFX90A-NEXT:    ; kill: def $agpr3 killed $agpr3
+; GFX90A-NEXT:    ; kill: def $agpr2 killed $agpr2
+; GFX90A-NEXT:    ; kill: def $agpr1 killed $agpr1
+; GFX90A-NEXT:    ; kill: def $agpr0 killed $agpr0
 ; GFX90A-NEXT:    s_add_i32 s0, s0, -1
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
 ; GFX90A-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX90A-NEXT:    s_cbranch_scc1 .LBB1_1
 ; GFX90A-NEXT:  ; %bb.2: ; %exit
@@ -384,7 +446,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 4
+; GFX90A-NEXT:    s_nop 5
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -399,45 +461,45 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
 ; GFX942:       ; %bb.0: ; %entry
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x42f60000
 ; GFX942-NEXT:    v_accvgpr_write_b32 a31, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a30, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a29, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a28, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a27, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a26, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a25, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a24, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a23, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a22, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a21, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a20, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a19, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a18, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a17, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a16, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a15, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a14, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a13, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a12, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a11, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a10, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a9, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a8, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a7, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a6, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a5, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a4, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a3, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a2, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a1, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
 ; GFX942-NEXT:    s_mov_b32 s0, 16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX942-NEXT:  .LBB1_1: ; %for.cond.preheader
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT:    ; kill: def $agpr30 killed $agpr30
+; GFX942-NEXT:    ; kill: def $agpr29 killed $agpr29
+; GFX942-NEXT:    ; kill: def $agpr28 killed $agpr28
+; GFX942-NEXT:    ; kill: def $agpr27 killed $agpr27
+; GFX942-NEXT:    ; kill: def $agpr26 killed $agpr26
+; GFX942-NEXT:    ; kill: def $agpr25 killed $agpr25
+; GFX942-NEXT:    ; kill: def $agpr24 killed $agpr24
+; GFX942-NEXT:    ; kill: def $agpr23 killed $agpr23
+; GFX942-NEXT:    ; kill: def $agpr22 killed $agpr22
+; GFX942-NEXT:    ; kill: def $agpr21 killed $agpr21
+; GFX942-NEXT:    ; kill: def $agpr20 killed $agpr20
+; GFX942-NEXT:    ; kill: def $agpr19 killed $agpr19
+; GFX942-NEXT:    ; kill: def $agpr18 killed $agpr18
+; GFX942-NEXT:    ; kill: def $agpr17 killed $agpr17
+; GFX942-NEXT:    ; kill: def $agpr16 killed $agpr16
+; GFX942-NEXT:    ; kill: def $agpr15 killed $agpr15
+; GFX942-NEXT:    ; kill: def $agpr14 killed $agpr14
+; GFX942-NEXT:    ; kill: def $agpr13 killed $agpr13
+; GFX942-NEXT:    ; kill: def $agpr12 killed $agpr12
+; GFX942-NEXT:    ; kill: def $agpr11 killed $agpr11
+; GFX942-NEXT:    ; kill: def $agpr10 killed $agpr10
+; GFX942-NEXT:    ; kill: def $agpr9 killed $agpr9
+; GFX942-NEXT:    ; kill: def $agpr8 killed $agpr8
+; GFX942-NEXT:    ; kill: def $agpr7 killed $agpr7
+; GFX942-NEXT:    ; kill: def $agpr6 killed $agpr6
+; GFX942-NEXT:    ; kill: def $agpr5 killed $agpr5
+; GFX942-NEXT:    ; kill: def $agpr4 killed $agpr4
+; GFX942-NEXT:    ; kill: def $agpr3 killed $agpr3
+; GFX942-NEXT:    ; kill: def $agpr2 killed $agpr2
+; GFX942-NEXT:    ; kill: def $agpr1 killed $agpr1
+; GFX942-NEXT:    ; kill: def $agpr0 killed $agpr0
 ; GFX942-NEXT:    s_add_i32 s0, s0, -1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
 ; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX942-NEXT:    s_cbranch_scc1 .LBB1_1
 ; GFX942-NEXT:  ; %bb.2: ; %exit
@@ -445,7 +507,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 3
+; GFX942-NEXT:    s_nop 4
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -566,8 +628,11 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
 ;
 ; GFX90A-LABEL: test_mfma_loop_non_splat:
 ; GFX90A:       ; %bb.0: ; %entry
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, 1.0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a31, 0
+; GFX90A-NEXT:    s_mov_b32 s0, 16
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 1.0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 2.0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a1, 1.0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a30, 0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a29, 0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a28, 0
@@ -598,14 +663,42 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a3, 0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a2, 0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a0, 0
-; GFX90A-NEXT:    s_mov_b32 s0, 16
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 1.0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX90A-NEXT:  .LBB2_1: ; %for.cond.preheader
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_nop 1
-; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX90A-NEXT:    ; kill: def $agpr30 killed $agpr30
+; GFX90A-NEXT:    ; kill: def $agpr29 killed $agpr29
+; GFX90A-NEXT:    ; kill: def $agpr28 killed $agpr28
+; GFX90A-NEXT:    ; kill: def $agpr27 killed $agpr27
+; GFX90A-NEXT:    ; kill: def $agpr26 killed $agpr26
+; GFX90A-NEXT:    ; kill: def $agpr25 killed $agpr25
+; GFX90A-NEXT:    ; kill: def $agpr24 killed $agpr24
+; GFX90A-NEXT:    ; kill: def $agpr23 killed $agpr23
+; GFX90A-NEXT:    ; kill: def $agpr22 killed $agpr22
+; GFX90A-NEXT:    ; kill: def $agpr21 killed $agpr21
+; GFX90A-NEXT:    ; kill: def $agpr20 killed $agpr20
+; GFX90A-NEXT:    ; kill: def $agpr19 killed $agpr19
+; GFX90A-NEXT:    ; kill: def $agpr18 killed $agpr18
+; GFX90A-NEXT:    ; kill: def $agpr17 killed $agpr17
+; GFX90A-NEXT:    ; kill: def $agpr16 killed $agpr16
+; GFX90A-NEXT:    ; kill: def $agpr15 killed $agpr15
+; GFX90A-NEXT:    ; kill: def $agpr14 killed $agpr14
+; GFX90A-NEXT:    ; kill: def $agpr13 killed $agpr13
+; GFX90A-NEXT:    ; kill: def $agpr12 killed $agpr12
+; GFX90A-NEXT:    ; kill: def $agpr11 killed $agpr11
+; GFX90A-NEXT:    ; kill: def $agpr10 killed $agpr10
+; GFX90A-NEXT:    ; kill: def $agpr9 killed $agpr9
+; GFX90A-NEXT:    ; kill: def $agpr8 killed $agpr8
+; GFX90A-NEXT:    ; kill: def $agpr7 killed $agpr7
+; GFX90A-NEXT:    ; kill: def $agpr6 killed $agpr6
+; GFX90A-NEXT:    ; kill: def $agpr5 killed $agpr5
+; GFX90A-NEXT:    ; kill: def $agpr4 killed $agpr4
+; GFX90A-NEXT:    ; kill: def $agpr3 killed $agpr3
+; GFX90A-NEXT:    ; kill: def $agpr2 killed $agpr2
+; GFX90A-NEXT:    ; kill: def $agpr1 killed $agpr1
+; GFX90A-NEXT:    ; kill: def $agpr0 killed $agpr0
 ; GFX90A-NEXT:    s_add_i32 s0, s0, -1
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
 ; GFX90A-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX90A-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX90A-NEXT:  ; %bb.2: ; %exit
@@ -613,7 +706,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 4
+; GFX90A-NEXT:    s_nop 5
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -626,8 +719,11 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
 ;
 ; GFX942-LABEL: test_mfma_loop_non_splat:
 ; GFX942:       ; %bb.0: ; %entry
-; GFX942-NEXT:    v_accvgpr_write_b32 a1, 1.0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a31, 0
+; GFX942-NEXT:    s_mov_b32 s0, 16
+; GFX942-NEXT:    v_mov_b32_e32 v0, 1.0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 2.0
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, 1.0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a30, 0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a29, 0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a28, 0
@@ -658,14 +754,42 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
 ; GFX942-NEXT:    v_accvgpr_write_b32 a3, 0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a2, 0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a0, 0
-; GFX942-NEXT:    s_mov_b32 s0, 16
-; GFX942-NEXT:    v_mov_b32_e32 v0, 1.0
-; GFX942-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX942-NEXT:  .LBB2_1: ; %for.cond.preheader
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; GFX942-NEXT:    ; kill: def $agpr30 killed $agpr30
+; GFX942-NEXT:    ; kill: def $agpr29 killed $agpr29
+; GFX942-NEXT:    ; kill: def $agpr28 killed $agpr28
+; GFX942-NEXT:    ; kill: def $agpr27 killed $agpr27
+; GFX942-NEXT:    ; kill: def $agpr26 killed $agpr26
+; GFX942-NEXT:    ; kill: def $agpr25 killed $agpr25
+; GFX942-NEXT:    ; kill: def $agpr24 killed $agpr24
+; GFX942-NEXT:    ; kill: def $agpr23 killed $agpr23
+; GFX942-NEXT:    ; kill: def $agpr22 killed $agpr22
+; GFX942-NEXT:    ; kill: def $agpr21 killed $agpr21
+; GFX942-NEXT:    ; kill: def $agpr20 killed $agpr20
+; GFX942-NEXT:    ; kill: def $agpr19 killed $agpr19
+; GFX942-NEXT:    ; kill: def $agpr18 killed $agpr18
+; GFX942-NEXT:    ; kill: def $agpr17 killed $agpr17
+; GFX942-NEXT:    ; kill: def $agpr16 killed $agpr16
+; GFX942-NEXT:    ; kill: def $agpr15 killed $agpr15
+; GFX942-NEXT:    ; kill: def $agpr14 killed $agpr14
+; GFX942-NEXT:    ; kill: def $agpr13 killed $agpr13
+; GFX942-NEXT:    ; kill: def $agpr12 killed $agpr12
+; GFX942-NEXT:    ; kill: def $agpr11 killed $agpr11
+; GFX942-NEXT:    ; kill: def $agpr10 killed $agpr10
+; GFX942-NEXT:    ; kill: def $agpr9 killed $agpr9
+; GFX942-NEXT:    ; kill: def $agpr8 killed $agpr8
+; GFX942-NEXT:    ; kill: def $agpr7 killed $agpr7
+; GFX942-NEXT:    ; kill: def $agpr6 killed $agpr6
+; GFX942-NEXT:    ; kill: def $agpr5 killed $agpr5
+; GFX942-NEXT:    ; kill: def $agpr4 killed $agpr4
+; GFX942-NEXT:    ; kill: def $agpr3 killed $agpr3
+; GFX942-NEXT:    ; kill: def $agpr2 killed $agpr2
+; GFX942-NEXT:    ; kill: def $agpr1 killed $agpr1
+; GFX942-NEXT:    ; kill: def $agpr0 killed $agpr0
 ; GFX942-NEXT:    s_add_i32 s0, s0, -1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
 ; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX942-NEXT:    s_cbranch_scc1 .LBB2_1
 ; GFX942-NEXT:  ; %bb.2: ; %exit
@@ -673,7 +797,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 3
+; GFX942-NEXT:    s_nop 4
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -865,75 +989,75 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x431a0000
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43190000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43180000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43170000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43160000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43150000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43140000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43130000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43120000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43110000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43100000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x430f0000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x430e0000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x430d0000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x430c0000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x430b0000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x430a0000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43090000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43080000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43070000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43060000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43050000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43040000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43030000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43020000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43010000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x43000000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x42fe0000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x42fc0000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x42fa0000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x42f80000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x42f60000
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
 ; GFX90A-NEXT:    s_mov_b32 s0, 16
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX90A-NEXT:  .LBB3_1: ; %for.cond.preheader
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_nop 1
-; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT:    ; kill: def $agpr30 killed $agpr30
+; GFX90A-NEXT:    ; kill: def $agpr29 killed $agpr29
+; GFX90A-NEXT:    ; kill: def $agpr28 killed $agpr28
+; GFX90A-NEXT:    ; kill: def $agpr27 killed $agpr27
+; GFX90A-NEXT:    ; kill: def $agpr26 killed $agpr26
+; GFX90A-NEXT:    ; kill: def $agpr25 killed $agpr25
+; GFX90A-NEXT:    ; kill: def $agpr24 killed $agpr24
+; GFX90A-NEXT:    ; kill: def $agpr23 killed $agpr23
+; GFX90A-NEXT:    ; kill: def $agpr22 killed $agpr22
+; GFX90A-NEXT:    ; kill: def $agpr21 killed $agpr21
+; GFX90A-NEXT:    ; kill: def $agpr20 killed $agpr20
+; GFX90A-NEXT:    ; kill: def $agpr19 killed $agpr19
+; GFX90A-NEXT:    ; kill: def $agpr18 killed $agpr18
+; GFX90A-NEXT:    ; kill: def $agpr17 killed $agpr17
+; GFX90A-NEXT:    ; kill: def $agpr16 killed $agpr16
+; GFX90A-NEXT:    ; kill: def $agpr15 killed $agpr15
+; GFX90A-NEXT:    ; kill: def $agpr14 killed $agpr14
+; GFX90A-NEXT:    ; kill: def $agpr13 killed $agpr13
+; GFX90A-NEXT:    ; kill: def $agpr12 killed $agpr12
+; GFX90A-NEXT:    ; kill: def $agpr11 killed $agpr11
+; GFX90A-NEXT:    ; kill: def $agpr10 killed $agpr10
+; GFX90A-NEXT:    ; kill: def $agpr9 killed $agpr9
+; GFX90A-NEXT:    ; kill: def $agpr8 killed $agpr8
+; GFX90A-NEXT:    ; kill: def $agpr7 killed $agpr7
+; GFX90A-NEXT:    ; kill: def $agpr6 killed $agpr6
+; GFX90A-NEXT:    ; kill: def $agpr5 killed $agpr5
+; GFX90A-NEXT:    ; kill: def $agpr4 killed $agpr4
+; GFX90A-NEXT:    ; kill: def $agpr3 killed $agpr3
+; GFX90A-NEXT:    ; kill: def $agpr2 killed $agpr2
+; GFX90A-NEXT:    ; kill: def $agpr1 killed $agpr1
+; GFX90A-NEXT:    ; kill: def $agpr0 killed $agpr0
 ; GFX90A-NEXT:    s_add_i32 s0, s0, -1
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
 ; GFX90A-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX90A-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX90A-NEXT:  ; %bb.2: ; %exit
@@ -941,7 +1065,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 4
+; GFX90A-NEXT:    s_nop 5
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -957,75 +1081,75 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x431a0000
 ; GFX942-NEXT:    v_accvgpr_write_b32 a31, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43190000
-; GFX942-NEXT:    v_accvgpr_write_b32 a30, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43180000
-; GFX942-NEXT:    v_accvgpr_write_b32 a29, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43170000
-; GFX942-NEXT:    v_accvgpr_write_b32 a28, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43160000
-; GFX942-NEXT:    v_accvgpr_write_b32 a27, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43150000
-; GFX942-NEXT:    v_accvgpr_write_b32 a26, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43140000
-; GFX942-NEXT:    v_accvgpr_write_b32 a25, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43130000
-; GFX942-NEXT:    v_accvgpr_write_b32 a24, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43120000
-; GFX942-NEXT:    v_accvgpr_write_b32 a23, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43110000
-; GFX942-NEXT:    v_accvgpr_write_b32 a22, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43100000
-; GFX942-NEXT:    v_accvgpr_write_b32 a21, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x430f0000
-; GFX942-NEXT:    v_accvgpr_write_b32 a20, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x430e0000
-; GFX942-NEXT:    v_accvgpr_write_b32 a19, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x430d0000
-; GFX942-NEXT:    v_accvgpr_write_b32 a18, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x430c0000
-; GFX942-NEXT:    v_accvgpr_write_b32 a17, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x430b0000
-; GFX942-NEXT:    v_accvgpr_write_b32 a16, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x430a0000
-; GFX942-NEXT:    v_accvgpr_write_b32 a15, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43090000
-; GFX942-NEXT:    v_accvgpr_write_b32 a14, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43080000
-; GFX942-NEXT:    v_accvgpr_write_b32 a13, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43070000
-; GFX942-NEXT:    v_accvgpr_write_b32 a12, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43060000
-; GFX942-NEXT:    v_accvgpr_write_b32 a11, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43050000
-; GFX942-NEXT:    v_accvgpr_write_b32 a10, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43040000
-; GFX942-NEXT:    v_accvgpr_write_b32 a9, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43030000
-; GFX942-NEXT:    v_accvgpr_write_b32 a8, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43020000
-; GFX942-NEXT:    v_accvgpr_write_b32 a7, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43010000
-; GFX942-NEXT:    v_accvgpr_write_b32 a6, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43000000
-; GFX942-NEXT:    v_accvgpr_write_b32 a5, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x42fe0000
-; GFX942-NEXT:    v_accvgpr_write_b32 a4, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x42fc0000
-; GFX942-NEXT:    v_accvgpr_write_b32 a3, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x42fa0000
-; GFX942-NEXT:    v_accvgpr_write_b32 a2, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x42f80000
-; GFX942-NEXT:    v_accvgpr_write_b32 a1, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0x42f60000
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
 ; GFX942-NEXT:    s_mov_b32 s0, 16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX942-NEXT:  .LBB3_1: ; %for.cond.preheader
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT:    ; kill: def $agpr30 killed $agpr30
+; GFX942-NEXT:    ; kill: def $agpr29 killed $agpr29
+; GFX942-NEXT:    ; kill: def $agpr28 killed $agpr28
+; GFX942-NEXT:    ; kill: def $agpr27 killed $agpr27
+; GFX942-NEXT:    ; kill: def $agpr26 killed $agpr26
+; GFX942-NEXT:    ; kill: def $agpr25 killed $agpr25
+; GFX942-NEXT:    ; kill: def $agpr24 killed $agpr24
+; GFX942-NEXT:    ; kill: def $agpr23 killed $agpr23
+; GFX942-NEXT:    ; kill: def $agpr22 killed $agpr22
+; GFX942-NEXT:    ; kill: def $agpr21 killed $agpr21
+; GFX942-NEXT:    ; kill: def $agpr20 killed $agpr20
+; GFX942-NEXT:    ; kill: def $agpr19 killed $agpr19
+; GFX942-NEXT:    ; kill: def $agpr18 killed $agpr18
+; GFX942-NEXT:    ; kill: def $agpr17 killed $agpr17
+; GFX942-NEXT:    ; kill: def $agpr16 killed $agpr16
+; GFX942-NEXT:    ; kill: def $agpr15 killed $agpr15
+; GFX942-NEXT:    ; kill: def $agpr14 killed $agpr14
+; GFX942-NEXT:    ; kill: def $agpr13 killed $agpr13
+; GFX942-NEXT:    ; kill: def $agpr12 killed $agpr12
+; GFX942-NEXT:    ; kill: def $agpr11 killed $agpr11
+; GFX942-NEXT:    ; kill: def $agpr10 killed $agpr10
+; GFX942-NEXT:    ; kill: def $agpr9 killed $agpr9
+; GFX942-NEXT:    ; kill: def $agpr8 killed $agpr8
+; GFX942-NEXT:    ; kill: def $agpr7 killed $agpr7
+; GFX942-NEXT:    ; kill: def $agpr6 killed $agpr6
+; GFX942-NEXT:    ; kill: def $agpr5 killed $agpr5
+; GFX942-NEXT:    ; kill: def $agpr4 killed $agpr4
+; GFX942-NEXT:    ; kill: def $agpr3 killed $agpr3
+; GFX942-NEXT:    ; kill: def $agpr2 killed $agpr2
+; GFX942-NEXT:    ; kill: def $agpr1 killed $agpr1
+; GFX942-NEXT:    ; kill: def $agpr0 killed $agpr0
 ; GFX942-NEXT:    s_add_i32 s0, s0, -1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
 ; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX942-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX942-NEXT:  ; %bb.2: ; %exit
@@ -1033,7 +1157,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 3
+; GFX942-NEXT:    s_nop 4
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1156,45 +1280,45 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
 ; GFX90A:       ; %bb.0: ; %entry
 ; GFX90A-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
 ; GFX90A-NEXT:    s_mov_b32 s0, 16
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX90A-NEXT:  .LBB4_1: ; %for.cond.preheader
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_nop 1
-; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT:    ; kill: def $agpr30 killed $agpr30
+; GFX90A-NEXT:    ; kill: def $agpr29 killed $agpr29
+; GFX90A-NEXT:    ; kill: def $agpr28 killed $agpr28
+; GFX90A-NEXT:    ; kill: def $agpr27 killed $agpr27
+; GFX90A-NEXT:    ; kill: def $agpr26 killed $agpr26
+; GFX90A-NEXT:    ; kill: def $agpr25 killed $agpr25
+; GFX90A-NEXT:    ; kill: def $agpr24 killed $agpr24
+; GFX90A-NEXT:    ; kill: def $agpr23 killed $agpr23
+; GFX90A-NEXT:    ; kill: def $agpr22 killed $agpr22
+; GFX90A-NEXT:    ; kill: def $agpr21 killed $agpr21
+; GFX90A-NEXT:    ; kill: def $agpr20 killed $agpr20
+; GFX90A-NEXT:    ; kill: def $agpr19 killed $agpr19
+; GFX90A-NEXT:    ; kill: def $agpr18 killed $agpr18
+; GFX90A-NEXT:    ; kill: def $agpr17 killed $agpr17
+; GFX90A-NEXT:    ; kill: def $agpr16 killed $agpr16
+; GFX90A-NEXT:    ; kill: def $agpr15 killed $agpr15
+; GFX90A-NEXT:    ; kill: def $agpr14 killed $agpr14
+; GFX90A-NEXT:    ; kill: def $agpr13 killed $agpr13
+; GFX90A-NEXT:    ; kill: def $agpr12 killed $agpr12
+; GFX90A-NEXT:    ; kill: def $agpr11 killed $agpr11
+; GFX90A-NEXT:    ; kill: def $agpr10 killed $agpr10
+; GFX90A-NEXT:    ; kill: def $agpr9 killed $agpr9
+; GFX90A-NEXT:    ; kill: def $agpr8 killed $agpr8
+; GFX90A-NEXT:    ; kill: def $agpr7 killed $agpr7
+; GFX90A-NEXT:    ; kill: def $agpr6 killed $agpr6
+; GFX90A-NEXT:    ; kill: def $agpr5 killed $agpr5
+; GFX90A-NEXT:    ; kill: def $agpr4 killed $agpr4
+; GFX90A-NEXT:    ; kill: def $agpr3 killed $agpr3
+; GFX90A-NEXT:    ; kill: def $agpr2 killed $agpr2
+; GFX90A-NEXT:    ; kill: def $agpr1 killed $agpr1
+; GFX90A-NEXT:    ; kill: def $agpr0 killed $agpr0
 ; GFX90A-NEXT:    s_add_i32 s0, s0, -1
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
 ; GFX90A-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX90A-NEXT:    s_cbranch_scc1 .LBB4_1
 ; GFX90A-NEXT:  ; %bb.2: ; %exit
@@ -1202,7 +1326,7 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 4
+; GFX90A-NEXT:    s_nop 5
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1217,45 +1341,45 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
 ; GFX942:       ; %bb.0: ; %entry
 ; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a31, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a30, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a29, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a28, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a27, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a26, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a25, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a24, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a23, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a22, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a21, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a20, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a19, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a18, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a17, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a16, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a15, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a14, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a13, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a12, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a11, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a10, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a9, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a8, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a7, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a6, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a5, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a4, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a3, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a2, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a1, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
 ; GFX942-NEXT:    s_mov_b32 s0, 16
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX942-NEXT:  .LBB4_1: ; %for.cond.preheader
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT:    ; kill: def $agpr30 killed $agpr30
+; GFX942-NEXT:    ; kill: def $agpr29 killed $agpr29
+; GFX942-NEXT:    ; kill: def $agpr28 killed $agpr28
+; GFX942-NEXT:    ; kill: def $agpr27 killed $agpr27
+; GFX942-NEXT:    ; kill: def $agpr26 killed $agpr26
+; GFX942-NEXT:    ; kill: def $agpr25 killed $agpr25
+; GFX942-NEXT:    ; kill: def $agpr24 killed $agpr24
+; GFX942-NEXT:    ; kill: def $agpr23 killed $agpr23
+; GFX942-NEXT:    ; kill: def $agpr22 killed $agpr22
+; GFX942-NEXT:    ; kill: def $agpr21 killed $agpr21
+; GFX942-NEXT:    ; kill: def $agpr20 killed $agpr20
+; GFX942-NEXT:    ; kill: def $agpr19 killed $agpr19
+; GFX942-NEXT:    ; kill: def $agpr18 killed $agpr18
+; GFX942-NEXT:    ; kill: def $agpr17 killed $agpr17
+; GFX942-NEXT:    ; kill: def $agpr16 killed $agpr16
+; GFX942-NEXT:    ; kill: def $agpr15 killed $agpr15
+; GFX942-NEXT:    ; kill: def $agpr14 killed $agpr14
+; GFX942-NEXT:    ; kill: def $agpr13 killed $agpr13
+; GFX942-NEXT:    ; kill: def $agpr12 killed $agpr12
+; GFX942-NEXT:    ; kill: def $agpr11 killed $agpr11
+; GFX942-NEXT:    ; kill: def $agpr10 killed $agpr10
+; GFX942-NEXT:    ; kill: def $agpr9 killed $agpr9
+; GFX942-NEXT:    ; kill: def $agpr8 killed $agpr8
+; GFX942-NEXT:    ; kill: def $agpr7 killed $agpr7
+; GFX942-NEXT:    ; kill: def $agpr6 killed $agpr6
+; GFX942-NEXT:    ; kill: def $agpr5 killed $agpr5
+; GFX942-NEXT:    ; kill: def $agpr4 killed $agpr4
+; GFX942-NEXT:    ; kill: def $agpr3 killed $agpr3
+; GFX942-NEXT:    ; kill: def $agpr2 killed $agpr2
+; GFX942-NEXT:    ; kill: def $agpr1 killed $agpr1
+; GFX942-NEXT:    ; kill: def $agpr0 killed $agpr0
 ; GFX942-NEXT:    s_add_i32 s0, s0, -1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
 ; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX942-NEXT:    s_cbranch_scc1 .LBB4_1
 ; GFX942-NEXT:  ; %bb.2: ; %exit
@@ -1263,7 +1387,7 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 3
+; GFX942-NEXT:    s_nop 4
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1429,43 +1553,43 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX90A-NEXT:  .LBB5_1: ; %for.cond.preheader
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_nop 1
-; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT:    ; kill: def $agpr30 killed $agpr30
+; GFX90A-NEXT:    ; kill: def $agpr29 killed $agpr29
+; GFX90A-NEXT:    ; kill: def $agpr28 killed $agpr28
+; GFX90A-NEXT:    ; kill: def $agpr27 killed $agpr27
+; GFX90A-NEXT:    ; kill: def $agpr26 killed $agpr26
+; GFX90A-NEXT:    ; kill: def $agpr25 killed $agpr25
+; GFX90A-NEXT:    ; kill: def $agpr24 killed $agpr24
+; GFX90A-NEXT:    ; kill: def $agpr23 killed $agpr23
+; GFX90A-NEXT:    ; kill: def $agpr22 killed $agpr22
+; GFX90A-NEXT:    ; kill: def $agpr21 killed $agpr21
+; GFX90A-NEXT:    ; kill: def $agpr20 killed $agpr20
+; GFX90A-NEXT:    ; kill: def $agpr19 killed $agpr19
+; GFX90A-NEXT:    ; kill: def $agpr18 killed $agpr18
+; GFX90A-NEXT:    ; kill: def $agpr17 killed $agpr17
+; GFX90A-NEXT:    ; kill: def $agpr16 killed $agpr16
+; GFX90A-NEXT:    ; kill: def $agpr15 killed $agpr15
+; GFX90A-NEXT:    ; kill: def $agpr14 killed $agpr14
+; GFX90A-NEXT:    ; kill: def $agpr13 killed $agpr13
+; GFX90A-NEXT:    ; kill: def $agpr12 killed $agpr12
+; GFX90A-NEXT:    ; kill: def $agpr11 killed $agpr11
+; GFX90A-NEXT:    ; kill: def $agpr10 killed $agpr10
+; GFX90A-NEXT:    ; kill: def $agpr9 killed $agpr9
+; GFX90A-NEXT:    ; kill: def $agpr8 killed $agpr8
+; GFX90A-NEXT:    ; kill: def $agpr7 killed $agpr7
+; GFX90A-NEXT:    ; kill: def $agpr6 killed $agpr6
+; GFX90A-NEXT:    ; kill: def $agpr5 killed $agpr5
+; GFX90A-NEXT:    ; kill: def $agpr4 killed $agpr4
+; GFX90A-NEXT:    ; kill: def $agpr3 killed $agpr3
+; GFX90A-NEXT:    ; kill: def $agpr2 killed $agpr2
+; GFX90A-NEXT:    ; kill: def $agpr1 killed $agpr1
+; GFX90A-NEXT:    ; kill: def $agpr0 killed $agpr0
 ; GFX90A-NEXT:    s_add_i32 s0, s0, -1
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
 ; GFX90A-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX90A-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX90A-NEXT:  ; %bb.2: ; %exit
@@ -1473,7 +1597,7 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 4
+; GFX90A-NEXT:    s_nop 5
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1492,43 +1616,43 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX942-NEXT:    v_accvgpr_write_b32 a31, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a30, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a29, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a28, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a27, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a26, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a25, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a24, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a23, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a22, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a21, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a20, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a19, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a18, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a17, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a16, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a15, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a14, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a13, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a12, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a11, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a10, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a9, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a8, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a7, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a6, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a5, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a4, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a3, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a2, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a1, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
 ; GFX942-NEXT:  .LBB5_1: ; %for.cond.preheader
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT:    ; kill: def $agpr30 killed $agpr30
+; GFX942-NEXT:    ; kill: def $agpr29 killed $agpr29
+; GFX942-NEXT:    ; kill: def $agpr28 killed $agpr28
+; GFX942-NEXT:    ; kill: def $agpr27 killed $agpr27
+; GFX942-NEXT:    ; kill: def $agpr26 killed $agpr26
+; GFX942-NEXT:    ; kill: def $agpr25 killed $agpr25
+; GFX942-NEXT:    ; kill: def $agpr24 killed $agpr24
+; GFX942-NEXT:    ; kill: def $agpr23 killed $agpr23
+; GFX942-NEXT:    ; kill: def $agpr22 killed $agpr22
+; GFX942-NEXT:    ; kill: def $agpr21 killed $agpr21
+; GFX942-NEXT:    ; kill: def $agpr20 killed $agpr20
+; GFX942-NEXT:    ; kill: def $agpr19 killed $agpr19
+; GFX942-NEXT:    ; kill: def $agpr18 killed $agpr18
+; GFX942-NEXT:    ; kill: def $agpr17 killed $agpr17
+; GFX942-NEXT:    ; kill: def $agpr16 killed $agpr16
+; GFX942-NEXT:    ; kill: def $agpr15 killed $agpr15
+; GFX942-NEXT:    ; kill: def $agpr14 killed $agpr14
+; GFX942-NEXT:    ; kill: def $agpr13 killed $agpr13
+; GFX942-NEXT:    ; kill: def $agpr12 killed $agpr12
+; GFX942-NEXT:    ; kill: def $agpr11 killed $agpr11
+; GFX942-NEXT:    ; kill: def $agpr10 killed $agpr10
+; GFX942-NEXT:    ; kill: def $agpr9 killed $agpr9
+; GFX942-NEXT:    ; kill: def $agpr8 killed $agpr8
+; GFX942-NEXT:    ; kill: def $agpr7 killed $agpr7
+; GFX942-NEXT:    ; kill: def $agpr6 killed $agpr6
+; GFX942-NEXT:    ; kill: def $agpr5 killed $agpr5
+; GFX942-NEXT:    ; kill: def $agpr4 killed $agpr4
+; GFX942-NEXT:    ; kill: def $agpr3 killed $agpr3
+; GFX942-NEXT:    ; kill: def $agpr2 killed $agpr2
+; GFX942-NEXT:    ; kill: def $agpr1 killed $agpr1
+; GFX942-NEXT:    ; kill: def $agpr0 killed $agpr0
 ; GFX942-NEXT:    s_add_i32 s0, s0, -1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
 ; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX942-NEXT:    s_cbranch_scc1 .LBB5_1
 ; GFX942-NEXT:  ; %bb.2: ; %exit
@@ -1536,7 +1660,7 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 3
+; GFX942-NEXT:    s_nop 4
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1695,11 +1819,11 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
 ; GFX90A:       ; %bb.0: ; %entry
 ; GFX90A-NEXT:    s_load_dword s1, s[4:5], 0x2c
 ; GFX90A-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a31, 0
+; GFX90A-NEXT:    s_mov_b32 s0, 16
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a30, 0
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a29, 0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a28, 0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a27, 0
@@ -1728,18 +1852,46 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a4, 0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a3, 0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a2, 0
-; GFX90A-NEXT:    s_mov_b32 s0, 16
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX90A-NEXT:  .LBB6_1: ; %for.cond.preheader
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_nop 1
-; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT:    ; kill: def $agpr30 killed $agpr30
+; GFX90A-NEXT:    ; kill: def $agpr29 killed $agpr29
+; GFX90A-NEXT:    ; kill: def $agpr28 killed $agpr28
+; GFX90A-NEXT:    ; kill: def $agpr27 killed $agpr27
+; GFX90A-NEXT:    ; kill: def $agpr26 killed $agpr26
+; GFX90A-NEXT:    ; kill: def $agpr25 killed $agpr25
+; GFX90A-NEXT:    ; kill: def $agpr24 killed $agpr24
+; GFX90A-NEXT:    ; kill: def $agpr23 killed $agpr23
+; GFX90A-NEXT:    ; kill: def $agpr22 killed $agpr22
+; GFX90A-NEXT:    ; kill: def $agpr21 killed $agpr21
+; GFX90A-NEXT:    ; kill: def $agpr20 killed $agpr20
+; GFX90A-NEXT:    ; kill: def $agpr19 killed $agpr19
+; GFX90A-NEXT:    ; kill: def $agpr18 killed $agpr18
+; GFX90A-NEXT:    ; kill: def $agpr17 killed $agpr17
+; GFX90A-NEXT:    ; kill: def $agpr16 killed $agpr16
+; GFX90A-NEXT:    ; kill: def $agpr15 killed $agpr15
+; GFX90A-NEXT:    ; kill: def $agpr14 killed $agpr14
+; GFX90A-NEXT:    ; kill: def $agpr13 killed $agpr13
+; GFX90A-NEXT:    ; kill: def $agpr12 killed $agpr12
+; GFX90A-NEXT:    ; kill: def $agpr11 killed $agpr11
+; GFX90A-NEXT:    ; kill: def $agpr10 killed $agpr10
+; GFX90A-NEXT:    ; kill: def $agpr9 killed $agpr9
+; GFX90A-NEXT:    ; kill: def $agpr8 killed $agpr8
+; GFX90A-NEXT:    ; kill: def $agpr7 killed $agpr7
+; GFX90A-NEXT:    ; kill: def $agpr6 killed $agpr6
+; GFX90A-NEXT:    ; kill: def $agpr5 killed $agpr5
+; GFX90A-NEXT:    ; kill: def $agpr4 killed $agpr4
+; GFX90A-NEXT:    ; kill: def $agpr3 killed $agpr3
+; GFX90A-NEXT:    ; kill: def $agpr2 killed $agpr2
+; GFX90A-NEXT:    ; kill: def $agpr1 killed $agpr1
+; GFX90A-NEXT:    ; kill: def $agpr0 killed $agpr0
 ; GFX90A-NEXT:    s_add_i32 s0, s0, -1
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
 ; GFX90A-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX90A-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX90A-NEXT:  ; %bb.2: ; %exit
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1759,11 +1911,11 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
 ; GFX942:       ; %bb.0: ; %entry
 ; GFX942-NEXT:    s_load_dword s1, s[4:5], 0x2c
 ; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a31, 0
+; GFX942-NEXT:    s_mov_b32 s0, 16
+; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a30, 0
-; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX942-NEXT:    v_accvgpr_write_b32 a29, 0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a28, 0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a27, 0
@@ -1792,18 +1944,46 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
 ; GFX942-NEXT:    v_accvgpr_write_b32 a4, 0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a3, 0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a2, 0
-; GFX942-NEXT:    s_mov_b32 s0, 16
-; GFX942-NEXT:    v_accvgpr_write_b32 a1, v0
-; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
-; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GFX942-NEXT:  .LBB6_1: ; %for.cond.preheader
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT:    ; kill: def $agpr30 killed $agpr30
+; GFX942-NEXT:    ; kill: def $agpr29 killed $agpr29
+; GFX942-NEXT:    ; kill: def $agpr28 killed $agpr28
+; GFX942-NEXT:    ; kill: def $agpr27 killed $agpr27
+; GFX942-NEXT:    ; kill: def $agpr26 killed $agpr26
+; GFX942-NEXT:    ; kill: def $agpr25 killed $agpr25
+; GFX942-NEXT:    ; kill: def $agpr24 killed $agpr24
+; GFX942-NEXT:    ; kill: def $agpr23 killed $agpr23
+; GFX942-NEXT:    ; kill: def $agpr22 killed $agpr22
+; GFX942-NEXT:    ; kill: def $agpr21 killed $agpr21
+; GFX942-NEXT:    ; kill: def $agpr20 killed $agpr20
+; GFX942-NEXT:    ; kill: def $agpr19 killed $agpr19
+; GFX942-NEXT:    ; kill: def $agpr18 killed $agpr18
+; GFX942-NEXT:    ; kill: def $agpr17 killed $agpr17
+; GFX942-NEXT:    ; kill: def $agpr16 killed $agpr16
+; GFX942-NEXT:    ; kill: def $agpr15 killed $agpr15
+; GFX942-NEXT:    ; kill: def $agpr14 killed $agpr14
+; GFX942-NEXT:    ; kill: def $agpr13 killed $agpr13
+; GFX942-NEXT:    ; kill: def $agpr12 killed $agpr12
+; GFX942-NEXT:    ; kill: def $agpr11 killed $agpr11
+; GFX942-NEXT:    ; kill: def $agpr10 killed $agpr10
+; GFX942-NEXT:    ; kill: def $agpr9 killed $agpr9
+; GFX942-NEXT:    ; kill: def $agpr8 killed $agpr8
+; GFX942-NEXT:    ; kill: def $agpr7 killed $agpr7
+; GFX942-NEXT:    ; kill: def $agpr6 killed $agpr6
+; GFX942-NEXT:    ; kill: def $agpr5 killed $agpr5
+; GFX942-NEXT:    ; kill: def $agpr4 killed $agpr4
+; GFX942-NEXT:    ; kill: def $agpr3 killed $agpr3
+; GFX942-NEXT:    ; kill: def $agpr2 killed $agpr2
+; GFX942-NEXT:    ; kill: def $agpr1 killed $agpr1
+; GFX942-NEXT:    ; kill: def $agpr0 killed $agpr0
 ; GFX942-NEXT:    s_add_i32 s0, s0, -1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
 ; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX942-NEXT:    s_cbranch_scc1 .LBB6_1
 ; GFX942-NEXT:  ; %bb.2: ; %exit
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2152,42 +2332,43 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
 ; GFX90A-NEXT:    s_nop 7
 ; GFX90A-NEXT:    s_nop 7
 ; GFX90A-NEXT:    s_nop 2
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a1, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a2, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a3, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a4, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a5, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a6, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a7, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a8, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a9, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a10, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a11, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a12, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a13, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a14, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a15, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a16, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a17, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a18, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a19, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a20, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a21, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a22, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a23, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a24, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a25, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a26, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a27, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a28, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a29, a0
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a30, a0
 ; GFX90A-NEXT:    v_accvgpr_mov_b32 a31, a0
 ; GFX90A-NEXT:  .LBB8_1: ; %for.cond.preheader
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT:    s_nop 1
-; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX90A-NEXT:    ; kill: def $agpr30 killed $agpr30
+; GFX90A-NEXT:    ; kill: def $agpr29 killed $agpr29
+; GFX90A-NEXT:    ; kill: def $agpr28 killed $agpr28
+; GFX90A-NEXT:    ; kill: def $agpr27 killed $agpr27
+; GFX90A-NEXT:    ; kill: def $agpr26 killed $agpr26
+; GFX90A-NEXT:    ; kill: def $agpr25 killed $agpr25
+; GFX90A-NEXT:    ; kill: def $agpr24 killed $agpr24
+; GFX90A-NEXT:    ; kill: def $agpr23 killed $agpr23
+; GFX90A-NEXT:    ; kill: def $agpr22 killed $agpr22
+; GFX90A-NEXT:    ; kill: def $agpr21 killed $agpr21
+; GFX90A-NEXT:    ; kill: def $agpr20 killed $agpr20
+; GFX90A-NEXT:    ; kill: def $agpr19 killed $agpr19
+; GFX90A-NEXT:    ; kill: def $agpr18 killed $agpr18
+; GFX90A-NEXT:    ; kill: def $agpr17 killed $agpr17
+; GFX90A-NEXT:    ; kill: def $agpr16 killed $agpr16
+; GFX90A-NEXT:    ; kill: def $agpr15 killed $agpr15
+; GFX90A-NEXT:    ; kill: def $agpr14 killed $agpr14
+; GFX90A-NEXT:    ; kill: def $agpr13 killed $agpr13
+; GFX90A-NEXT:    ; kill: def $agpr12 killed $agpr12
+; GFX90A-NEXT:    ; kill: def $agpr11 killed $agpr11
+; GFX90A-NEXT:    ; kill: def $agpr10 killed $agpr10
+; GFX90A-NEXT:    ; kill: def $agpr9 killed $agpr9
+; GFX90A-NEXT:    ; kill: def $agpr8 killed $agpr8
+; GFX90A-NEXT:    ; kill: def $agpr7 killed $agpr7
+; GFX90A-NEXT:    ; kill: def $agpr6 killed $agpr6
+; GFX90A-NEXT:    ; kill: def $agpr5 killed $agpr5
+; GFX90A-NEXT:    ; kill: def $agpr4 killed $agpr4
+; GFX90A-NEXT:    ; kill: def $agpr3 killed $agpr3
+; GFX90A-NEXT:    ; kill: def $agpr2 killed $agpr2
+; GFX90A-NEXT:    ; kill: def $agpr1 killed $agpr1
+; GFX90A-NEXT:    ; kill: def $agpr0 killed $agpr0
 ; GFX90A-NEXT:    s_add_i32 s0, s0, -1
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
 ; GFX90A-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX90A-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX90A-NEXT:  ; %bb.2: ; %exit
@@ -2195,7 +2376,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 4
+; GFX90A-NEXT:    s_nop 5
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -2216,42 +2397,43 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
 ; GFX942-NEXT:    s_nop 7
 ; GFX942-NEXT:    s_nop 7
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_accvgpr_mov_b32 a1, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a2, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a3, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a4, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a5, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a6, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a7, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a8, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a9, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a10, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a11, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a12, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a13, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a14, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a15, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a16, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a17, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a18, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a19, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a20, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a21, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a22, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a23, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a24, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a25, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a26, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a27, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a28, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a29, a0
-; GFX942-NEXT:    v_accvgpr_mov_b32 a30, a0
 ; GFX942-NEXT:    v_accvgpr_mov_b32 a31, a0
 ; GFX942-NEXT:  .LBB8_1: ; %for.cond.preheader
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; GFX942-NEXT:    ; kill: def $agpr30 killed $agpr30
+; GFX942-NEXT:    ; kill: def $agpr29 killed $agpr29
+; GFX942-NEXT:    ; kill: def $agpr28 killed $agpr28
+; GFX942-NEXT:    ; kill: def $agpr27 killed $agpr27
+; GFX942-NEXT:    ; kill: def $agpr26 killed $agpr26
+; GFX942-NEXT:    ; kill: def $agpr25 killed $agpr25
+; GFX942-NEXT:    ; kill: def $agpr24 killed $agpr24
+; GFX942-NEXT:    ; kill: def $agpr23 killed $agpr23
+; GFX942-NEXT:    ; kill: def $agpr22 killed $agpr22
+; GFX942-NEXT:    ; kill: def $agpr21 killed $agpr21
+; GFX942-NEXT:    ; kill: def $agpr20 killed $agpr20
+; GFX942-NEXT:    ; kill: def $agpr19 killed $agpr19
+; GFX942-NEXT:    ; kill: def $agpr18 killed $agpr18
+; GFX942-NEXT:    ; kill: def $agpr17 killed $agpr17
+; GFX942-NEXT:    ; kill: def $agpr16 killed $agpr16
+; GFX942-NEXT:    ; kill: def $agpr15 killed $agpr15
+; GFX942-NEXT:    ; kill: def $agpr14 killed $agpr14
+; GFX942-NEXT:    ; kill: def $agpr13 killed $agpr13
+; GFX942-NEXT:    ; kill: def $agpr12 killed $agpr12
+; GFX942-NEXT:    ; kill: def $agpr11 killed $agpr11
+; GFX942-NEXT:    ; kill: def $agpr10 killed $agpr10
+; GFX942-NEXT:    ; kill: def $agpr9 killed $agpr9
+; GFX942-NEXT:    ; kill: def $agpr8 killed $agpr8
+; GFX942-NEXT:    ; kill: def $agpr7 killed $agpr7
+; GFX942-NEXT:    ; kill: def $agpr6 killed $agpr6
+; GFX942-NEXT:    ; kill: def $agpr5 killed $agpr5
+; GFX942-NEXT:    ; kill: def $agpr4 killed $agpr4
+; GFX942-NEXT:    ; kill: def $agpr3 killed $agpr3
+; GFX942-NEXT:    ; kill: def $agpr2 killed $agpr2
+; GFX942-NEXT:    ; kill: def $agpr1 killed $agpr1
+; GFX942-NEXT:    ; kill: def $agpr0 killed $agpr0
 ; GFX942-NEXT:    s_add_i32 s0, s0, -1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
 ; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX942-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX942-NEXT:  ; %bb.2: ; %exit
@@ -2259,7 +2441,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    s_nop 7
-; GFX942-NEXT:    s_nop 3
+; GFX942-NEXT:    s_nop 4
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
 ; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
index ea6449b99b516..aaca68f3d606f 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
@@ -31,6 +31,7 @@
 ; DEFAULT-NEXT: Live Register Matrix
 ; DEFAULT-NEXT: Greedy Register Allocator
 ; DEFAULT-NEXT: GCN NSA Reassign
+; DEFAULT-NEXT: AMDGPU Eliminate AGPR-to-VGPR Copy
 ; DEFAULT-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA
 ; DEFAULT-NEXT: Virtual Register Rewriter
 ; DEFAULT-NEXT: AMDGPU Mark Last Scratch Load
@@ -78,6 +79,7 @@
 ; BASIC-DEFAULT-NEXT: Live Register Matrix
 ; BASIC-DEFAULT-NEXT: Greedy Register Allocator
 ; BASIC-DEFAULT-NEXT: GCN NSA Reassign
+; BASIC-DEFAULT-NEXT: AMDGPU Eliminate AGPR-to-VGPR Copy
 ; BASIC-DEFAULT-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA
 ; BASIC-DEFAULT-NEXT: Virtual Register Rewriter
 ; BASIC-DEFAULT-NEXT: AMDGPU Mark Last Scratch Load
@@ -101,6 +103,7 @@
 ; DEFAULT-BASIC-NEXT: Live Register Matrix
 ; DEFAULT-BASIC-NEXT: Basic Register Allocator
 ; DEFAULT-BASIC-NEXT: GCN NSA Reassign
+; DEFAULT-BASIC-NEXT: AMDGPU Eliminate AGPR-to-VGPR Copy
 ; DEFAULT-BASIC-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA
 ; DEFAULT-BASIC-NEXT: Virtual Register Rewriter
 ; DEFAULT-BASIC-NEXT: AMDGPU Mark Last Scratch Load
@@ -130,6 +133,7 @@
 ; BASIC-BASIC-NEXT: Live Register Matrix
 ; BASIC-BASIC-NEXT: Basic Register Allocator
 ; BASIC-BASIC-NEXT: GCN NSA Reassign
+; BASIC-BASIC-NEXT: AMDGPU Eliminate AGPR-to-VGPR Copy
 ; BASIC-BASIC-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA
 ; BASIC-BASIC-NEXT: Virtual Register Rewriter
 ; BASIC-BASIC-NEXT: AMDGPU Mark Last Scratch Load
diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
index eb0d5465cacd9..b43c19f44614c 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
@@ -77,26 +77,18 @@ define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, ptr addrspace(1) %arg,
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_endpgm
 ; GFX90A-NEXT:  .LBB0_2: ; %use
-; GFX90A-NEXT:    s_nop 3
-; GFX90A-NEXT:    v_accvgpr_read_b32 v9, a7
-; GFX90A-NEXT:    v_accvgpr_read_b32 v8, a6
-; GFX90A-NEXT:    v_accvgpr_read_b32 v7, a5
-; GFX90A-NEXT:    v_accvgpr_read_b32 v6, a4
+; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90A-NEXT:    s_nop 2
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a4, 4
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v2, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a8, 5
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a9, 1
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a10, 2
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a11, 3
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v6
-; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
-; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v7
-; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v8
-; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v9
-; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v2, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    ;;#ASMSTART
@@ -155,26 +147,14 @@ define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_accvgpr_read_b32 v0, a0
-; GFX90A-NEXT:    v_accvgpr_read_b32 v1, a1
-; GFX90A-NEXT:    v_accvgpr_read_b32 v2, a2
-; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a3
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_accvgpr_read_b32 v5, a3
-; GFX90A-NEXT:    v_accvgpr_read_b32 v4, a2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v0
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v4
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v5
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_endpgm
@@ -331,7 +311,6 @@ define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 {
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a1, 2.0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v2
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0x41f80000
-; GFX90A-NEXT:    v_accvgpr_read_b32 v3, a0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a0, 1.0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a3, 4.0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v2
@@ -339,13 +318,12 @@ define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 {
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v1, a[0:31]
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 7
-; GFX90A-NEXT:    s_nop 2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v3
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    global_store_dword v0, a0, s[2:3]
 ; GFX90A-NEXT:    s_endpgm
 bb:
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
index 3d11ce566207a..d907aee996390 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
@@ -141,6 +141,7 @@ static_library("LLVMAMDGPUCodeGen") {
     "AMDGPUCodeGenPrepare.cpp",
     "AMDGPUCombinerHelper.cpp",
     "AMDGPUCtorDtorLowering.cpp",
+    "AMDGPUEliminateAGPRToVGPRCopy.cpp",
     "AMDGPUExportClustering.cpp",
     "AMDGPUExportKernelRuntimeHandles.cpp",
     "AMDGPUFrameLowering.cpp",

>From 11e81228f033fe171d08064277ca40259c268d77 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Fri, 15 Aug 2025 23:19:03 +0000
Subject: [PATCH 2/2] Format

---
 llvm/lib/Target/AMDGPU/AMDGPUEliminateAGPRToVGPRCopy.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUEliminateAGPRToVGPRCopy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUEliminateAGPRToVGPRCopy.cpp
index 66e00e13149de..5dc1430eaf771 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUEliminateAGPRToVGPRCopy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUEliminateAGPRToVGPRCopy.cpp
@@ -129,10 +129,11 @@ bool AMDGPUEliminateAGPRToVGPRCopyImpl::run(MachineFunction &MF) const {
               if (SIInstrInfo::isMFMA(UseMI)) {
                 if (&MO != TII.getNamedOperand(UseMI, AMDGPU::OpName::src0) &&
                     &MO != TII.getNamedOperand(UseMI, AMDGPU::OpName::src1)) {
-                  LLVM_DEBUG(dbgs() << "  Incompatible MFMA operand: " << UseMI);
+                  LLVM_DEBUG(dbgs()
+                             << "  Incompatible MFMA operand: " << UseMI);
                   return false;
                 }
-              } else if (!UseMI.isFullCopy()){
+              } else if (!UseMI.isFullCopy()) {
                 LLVM_DEBUG(dbgs() << "  Incompatible user: " << UseMI);
                 return false;
               }



More information about the llvm-commits mailing list