[llvm] [AMDGPU] Post-RA AGPR copy elimination pass (PR #153901)
Lucas Ramirez via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 15 16:21:53 PDT 2025
https://github.com/lucas-rami updated https://github.com/llvm/llvm-project/pull/153901
>From 1df88d26b77b6968050c1dc4961b27ecb8fb6b04 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Fri, 15 Aug 2025 22:50:59 +0000
Subject: [PATCH 1/2] AGPR to VGPR copy elimination
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 11 +
.../AMDGPU/AMDGPUEliminateAGPRToVGPRCopy.cpp | 246 +++++
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 +
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 4 +
llvm/test/CodeGen/AMDGPU/mfma-loop.ll | 946 +++++++++++-------
.../CodeGen/AMDGPU/sgpr-regalloc-flags.ll | 4 +
llvm/test/CodeGen/AMDGPU/spill-agpr.ll | 38 +-
.../secondary/llvm/lib/Target/AMDGPU/BUILD.gn | 1 +
10 files changed, 842 insertions(+), 412 deletions(-)
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUEliminateAGPRToVGPRCopy.cpp
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 007b481f84960..b83e62f4ea57c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -541,6 +541,17 @@ extern char &GCNRewritePartialRegUsesID;
void initializeAMDGPUWaitSGPRHazardsLegacyPass(PassRegistry &);
extern char &AMDGPUWaitSGPRHazardsLegacyID;
+/// New pass manager wrapper for the AGPR-to-VGPR copy elimination pass. run()
+/// is only declared here; its definition lives in
+/// AMDGPUEliminateAGPRToVGPRCopy.cpp.
+class AMDGPUEliminateAGPRToVGPRCopyPass
+ : public PassInfoMixin<AMDGPUEliminateAGPRToVGPRCopyPass> {
+public:
+ AMDGPUEliminateAGPRToVGPRCopyPass() = default;
+ /// Runs the pass on \p MF, fetching the analyses it needs from \p MFAM.
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+};
+
+void initializeAMDGPUEliminateAGPRToVGPRCopyLegacyPass(PassRegistry &);
+extern char &AMDGPUEliminateAGPRToVGPRCopyLegacyID;
+
class AMDGPURewriteAGPRCopyMFMAPass
: public PassInfoMixin<AMDGPURewriteAGPRCopyMFMAPass> {
public:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUEliminateAGPRToVGPRCopy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUEliminateAGPRToVGPRCopy.cpp
new file mode 100644
index 0000000000000..66e00e13149de
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUEliminateAGPRToVGPRCopy.cpp
@@ -0,0 +1,246 @@
+//===-- AMDGPUEliminateAGPRToVGPRCopy.cpp ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
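+/// \file
+/// \brief Eliminates full copies whose source virtual register was assigned
+/// an AGPR and whose destination virtual register was assigned a VGPR or an
+/// AGPR. The pass runs after register assignment, while the virtual register
+/// map is still available, and removes such a copy by making the users of the
+/// destination read the source register directly when those users are
+/// compatible (src0/src1 operands of MFMAs, or other full copies).
+///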
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-eliminate-agpr-to-vgpr-copy"
+
+STATISTIC(NumEliminated, "Number of copies eliminated");
+
+namespace {
+
+/// Implementation shared by the legacy and new pass manager wrappers. It keeps
+/// references to the subtarget information and to the register allocation
+/// analyses that run() reads and updates.
+class AMDGPUEliminateAGPRToVGPRCopyImpl {
+ // TII and TRI are initialized from ST in the constructor, so ST has to stay
+ // declared before them.
+ const GCNSubtarget &ST;
+ const SIInstrInfo &TII;
+ const SIRegisterInfo &TRI;
+ MachineRegisterInfo &MRI;
+ VirtRegMap &VRM;
+ LiveRegMatrix &LRM;
+ LiveIntervals &LIS;
+
+public:
+ /// Binds the implementation to \p MF's subtarget and register info and to the
+ /// provided analyses.
+ AMDGPUEliminateAGPRToVGPRCopyImpl(MachineFunction &MF, VirtRegMap &VRM,
+ LiveRegMatrix &LRM, LiveIntervals &LIS)
+ : ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
+ TRI(*ST.getRegisterInfo()), MRI(MF.getRegInfo()), VRM(VRM), LRM(LRM),
+ LIS(LIS) {}
+
+ /// Returns true if every use operand of \p Reg is the src0 or src1 operand of
+ /// an MFMA instruction.
+ bool areAllUsesCompatible(Register Reg) const;
+
+ /// Scans \p MF for eliminable copies; returns true if any copy was removed.
+ bool run(MachineFunction &MF) const;
+};
+
+/// Checks how \p Reg is consumed.
+///
+/// \returns true when each use operand of \p Reg belongs to an MFMA
+/// instruction and is that instruction's src0 or src1 operand; false as soon
+/// as any other kind of use is found. A register without uses yields true.
+bool AMDGPUEliminateAGPRToVGPRCopyImpl::areAllUsesCompatible(
+    Register Reg) const {
+  for (const MachineOperand &UseMO : MRI.use_operands(Reg)) {
+    const MachineInstr &UserMI = *UseMO.getParent();
+    if (!SIInstrInfo::isMFMA(UserMI))
+      return false;
+
+    // Only the src0 and src1 operands of the MFMA qualify.
+    if (&UseMO == TII.getNamedOperand(UserMI, AMDGPU::OpName::src0))
+      continue;
+    if (&UseMO != TII.getNamedOperand(UserMI, AMDGPU::OpName::src1))
+      return false;
+  }
+  return true;
+}
+
+/// Looks for full COPYs between virtual registers where the source was
+/// assigned an AGPR and the destination a VGPR or an AGPR, and removes the
+/// copy by making the destination's users read the source directly.
+///
+/// A copy is eliminated only if, for every use of the destination:
+///  - when the destination is not an AGPR, the user is either another full
+///    copy or an MFMA that reads the register through src0/src1; and
+///  - the source still holds, at the user, the value it had at the copy.
+///
+/// \param MF function to process.
+/// \returns true if at least one copy was removed.
+bool AMDGPUEliminateAGPRToVGPRCopyImpl::run(MachineFunction &MF) const {
+  // This only applies on subtargets that have a configurable AGPR vs. VGPR
+  // allocation.
+  if (!ST.hasGFX90AInsts())
+    return false;
+
+  // Early exit if no AGPRs were assigned.
+  if (!LRM.isPhysRegUsed(AMDGPU::AGPR0))
+    return false;
+
+  bool MadeChange = false;
+
+  for (MachineBasicBlock &MBB : MF) {
+    // Copies are erased while iterating, hence the early-increment range.
+    for (MachineInstr &CopyMI : make_early_inc_range(MBB)) {
+      // Find full copies...
+      if (!CopyMI.isFullCopy())
+        continue;
+
+      // ... whose destination was mapped to a VGPR or AGPR...
+      Register DstReg = CopyMI.getOperand(0).getReg();
+      if (!DstReg.isVirtual())
+        continue;
+      Register DstPhysReg = VRM.getPhys(DstReg);
+      if (!DstPhysReg)
+        continue;
+      const TargetRegisterClass *DstRC = TRI.getPhysRegBaseClass(DstPhysReg);
+      if (!TRI.hasVectorRegisters(DstRC) || TRI.hasSGPRs(DstRC))
+        continue;
+
+      // ... and whose source was mapped to an AGPR.
+      Register SrcReg = CopyMI.getOperand(1).getReg();
+      if (!SrcReg.isVirtual() || SrcReg == DstReg)
+        continue;
+      Register SrcPhysReg = VRM.getPhys(SrcReg);
+      if (!SrcPhysReg)
+        continue;
+      const TargetRegisterClass *SrcRC = TRI.getPhysRegBaseClass(SrcPhysReg);
+      if (!TRI.isAGPRClass(SrcRC))
+        continue;
+
+      bool DstIsAGPR = TRI.hasAGPRs(DstRC);
+
+      LLVM_DEBUG({
+        dbgs() << "AGPR->AVGPR copy: " << CopyMI;
+        dbgs() << " "
+               << printReg(DstReg, &TRI, CopyMI.getOperand(0).getSubReg(), &MRI)
+               << " <-> " << printReg(DstPhysReg, &TRI, 0, &MRI) << "\n";
+        dbgs() << " "
+               << printReg(SrcReg, &TRI, CopyMI.getOperand(1).getSubReg(), &MRI)
+               << " <-> " << printReg(SrcPhysReg, &TRI, 0, &MRI) << "\n";
+      });
+
+      // Value of the source register that the copy reads.
+      LiveInterval &SrcLI = LIS.getInterval(SrcReg);
+      const VNInfo *SrcVNI = SrcLI.getVNInfoAt(LIS.getInstructionIndex(CopyMI));
+      assert(SrcVNI && "VNI must exist");
+
+      bool AllUsesCompatible =
+          all_of(MRI.use_operands(DstReg), [&](const MachineOperand &MO) {
+            // Destination's use must be src0/src1 operands of an MFMA or
+            // another copy.
+            const MachineInstr &UseMI = *MO.getParent();
+            if (!DstIsAGPR) {
+              if (SIInstrInfo::isMFMA(UseMI)) {
+                if (&MO != TII.getNamedOperand(UseMI, AMDGPU::OpName::src0) &&
+                    &MO != TII.getNamedOperand(UseMI, AMDGPU::OpName::src1)) {
+                  LLVM_DEBUG(dbgs() << " Incompatible MFMA operand: " << UseMI);
+                  return false;
+                }
+              } else if (!UseMI.isFullCopy()) {
+                LLVM_DEBUG(dbgs() << " Incompatible user: " << UseMI);
+                return false;
+              }
+            } else {
+              LLVM_DEBUG(dbgs() << " Skipping user check (dst is AGPR)\n");
+            }
+
+            // Source must be available at use point: if the source holds a
+            // different value (or none) there, forwarding it would make the
+            // user read the wrong data, so the copy has to stay.
+            const VNInfo *UseVNI =
+                SrcLI.getVNInfoAt(LIS.getInstructionIndex(UseMI));
+            if (SrcVNI != UseVNI) {
+              LLVM_DEBUG(dbgs() << " AGPR no longer available at " << UseMI);
+              return false;
+            }
+            return true;
+          });
+      if (!AllUsesCompatible)
+        continue;
+
+      LLVM_DEBUG(dbgs() << " -> Eliminated\n");
+      ++NumEliminated;
+
+      // Remove the copy's destination register.
+      MRI.replaceRegWith(DstReg, SrcReg);
+      LRM.unassign(LIS.getInterval(DstReg));
+      LIS.removeInterval(DstReg);
+
+      // Delete the copy instruction.
+      LIS.RemoveMachineInstrFromMaps(CopyMI);
+      CopyMI.eraseFromParent();
+
+      // Recompute the source register's interval. The source is taken out of
+      // the live register matrix while its old interval object still exists
+      // and is re-inserted with the recomputed one, so that the matrix never
+      // refers to an interval that has been destroyed.
+      LRM.unassign(SrcLI);
+      LIS.removeInterval(SrcReg);
+      LiveInterval &NewSrcLI = LIS.createAndComputeVirtRegInterval(SrcReg);
+      LRM.assign(NewSrcLI, SrcPhysReg.asMCReg());
+      MadeChange = true;
+    }
+  }
+
+  return MadeChange;
+}
+
+/// Legacy pass manager wrapper around AMDGPUEliminateAGPRToVGPRCopyImpl.
+class AMDGPUEliminateAGPRToVGPRCopyLegacy : public MachineFunctionPass {
+public:
+ static char ID;
+
+ /// Constructs the pass and initializes it in the global pass registry.
+ AMDGPUEliminateAGPRToVGPRCopyLegacy() : MachineFunctionPass(ID) {
+ initializeAMDGPUEliminateAGPRToVGPRCopyLegacyPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "AMDGPU Eliminate AGPR-to-VGPR Copy";
+ }
+
+ /// Requires live intervals, the virtual register map and the live register
+ /// matrix, and declares those (and everything else) as preserved.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LiveIntervalsWrapperPass>();
+ AU.addRequired<VirtRegMapWrapperLegacy>();
+ AU.addRequired<LiveRegMatrixWrapperLegacy>();
+
+ AU.addPreserved<LiveIntervalsWrapperPass>();
+ AU.addPreserved<VirtRegMapWrapperLegacy>();
+ AU.addPreserved<LiveRegMatrixWrapperLegacy>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+// Legacy pass initialization, listing the three analysis passes that
+// getAnalysisUsage() requires as dependencies.
+INITIALIZE_PASS_BEGIN(AMDGPUEliminateAGPRToVGPRCopyLegacy, DEBUG_TYPE,
+ "AMDGPU Eliminate AGPR-to-VGPR Copy", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
+INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy)
+INITIALIZE_PASS_END(AMDGPUEliminateAGPRToVGPRCopyLegacy, DEBUG_TYPE,
+ "AMDGPU Eliminate AGPR-to-VGPR Copy", false, false)
+
+char AMDGPUEliminateAGPRToVGPRCopyLegacy::ID = 0;
+
+// Externally visible reference to the pass ID, declared in AMDGPU.h.
+char &llvm::AMDGPUEliminateAGPRToVGPRCopyLegacyID =
+ AMDGPUEliminateAGPRToVGPRCopyLegacy::ID;
+
+/// Legacy pass manager entry point: does nothing when the function is to be
+/// skipped, otherwise gathers the required analyses and hands them to the
+/// shared implementation.
+///
+/// \returns true if the implementation changed \p MF.
+bool AMDGPUEliminateAGPRToVGPRCopyLegacy::runOnMachineFunction(
+    MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  VirtRegMap &RegMap = getAnalysis<VirtRegMapWrapperLegacy>().getVRM();
+  LiveRegMatrix &Matrix = getAnalysis<LiveRegMatrixWrapperLegacy>().getLRM();
+  LiveIntervals &Intervals = getAnalysis<LiveIntervalsWrapperPass>().getLIS();
+
+  return AMDGPUEliminateAGPRToVGPRCopyImpl(MF, RegMap, Matrix, Intervals)
+      .run(MF);
+}
+
+/// New pass manager entry point: fetches the virtual register map, live
+/// register matrix and live intervals for \p MF from \p MFAM and runs the
+/// shared implementation.
+///
+/// \returns all analyses preserved when nothing changed; otherwise the default
+/// machine function pass set with the CFG analyses additionally preserved.
+PreservedAnalyses
+AMDGPUEliminateAGPRToVGPRCopyPass::run(MachineFunction &MF,
+                                       MachineFunctionAnalysisManager &MFAM) {
+  auto &RegMap = MFAM.getResult<VirtRegMapAnalysis>(MF);
+  auto &Matrix = MFAM.getResult<LiveRegMatrixAnalysis>(MF);
+  auto &Intervals = MFAM.getResult<LiveIntervalsAnalysis>(MF);
+
+  const bool Changed =
+      AMDGPUEliminateAGPRToVGPRCopyImpl(MF, RegMap, Matrix, Intervals).run(MF);
+  if (Changed) {
+    // Instructions were removed but the block structure is untouched.
+    PreservedAnalyses Preserved = getMachineFunctionPassPreservedAnalyses();
+    Preserved.preserveSet<CFGAnalyses>();
+    return Preserved;
+  }
+  return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index b6c6d927d0e89..34963826d16a5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -106,6 +106,7 @@ MACHINE_FUNCTION_ANALYSIS("amdgpu-resource-usage", AMDGPUResourceUsageAnalysis(*
#endif
MACHINE_FUNCTION_PASS("amdgpu-insert-delay-alu", AMDGPUInsertDelayAluPass())
MACHINE_FUNCTION_PASS("amdgpu-isel", AMDGPUISelDAGToDAGPass(*this))
+MACHINE_FUNCTION_PASS("amdgpu-eliminate-agpr-to-vgpr-copy", AMDGPUEliminateAGPRToVGPRCopyPass())
MACHINE_FUNCTION_PASS("amdgpu-mark-last-scratch-load", AMDGPUMarkLastScratchLoadPass())
MACHINE_FUNCTION_PASS("amdgpu-pre-ra-long-branch-reg", GCNPreRALongBranchRegPass())
MACHINE_FUNCTION_PASS("amdgpu-reserve-wwm-regs", AMDGPUReserveWWMRegsPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c1f17033d04a8..5512b15f4cb31 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -528,6 +528,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPULowerKernelArgumentsPass(*PR);
initializeAMDGPUPromoteKernelArgumentsPass(*PR);
initializeAMDGPULowerKernelAttributesPass(*PR);
+ initializeAMDGPUEliminateAGPRToVGPRCopyLegacyPass(*PR);
initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(*PR);
initializeAMDGPUPostLegalizerCombinerPass(*PR);
initializeAMDGPUPreLegalizerCombinerPass(*PR);
@@ -1594,6 +1595,7 @@ bool GCNPassConfig::addPreRewrite() {
if (EnableRegReassign)
addPass(&GCNNSAReassignID);
+ addPass(&AMDGPUEliminateAGPRToVGPRCopyLegacyID);
addPass(&AMDGPURewriteAGPRCopyMFMALegacyID);
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index c466f9cf0f359..abc8020920e7d 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -53,6 +53,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUCodeGenPrepare.cpp
AMDGPUCombinerHelper.cpp
AMDGPUCtorDtorLowering.cpp
+ AMDGPUEliminateAGPRToVGPRCopy.cpp
AMDGPUExportClustering.cpp
AMDGPUExportKernelRuntimeHandles.cpp
AMDGPUFrameLowering.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 2a5c65278f7dc..ed865dc62592b 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -377,6 +377,7 @@
; GCN-O1-NEXT: Live Register Matrix
; GCN-O1-NEXT: Greedy Register Allocator
; GCN-O1-NEXT: GCN NSA Reassign
+; GCN-O1-NEXT: AMDGPU Eliminate AGPR-to-VGPR Copy
; GCN-O1-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA
; GCN-O1-NEXT: Virtual Register Rewriter
; GCN-O1-NEXT: AMDGPU Mark Last Scratch Load
@@ -689,6 +690,7 @@
; GCN-O1-OPTS-NEXT: Live Register Matrix
; GCN-O1-OPTS-NEXT: Greedy Register Allocator
; GCN-O1-OPTS-NEXT: GCN NSA Reassign
+; GCN-O1-OPTS-NEXT: AMDGPU Eliminate AGPR-to-VGPR Copy
; GCN-O1-OPTS-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA
; GCN-O1-OPTS-NEXT: Virtual Register Rewriter
; GCN-O1-OPTS-NEXT: AMDGPU Mark Last Scratch Load
@@ -1007,6 +1009,7 @@
; GCN-O2-NEXT: Live Register Matrix
; GCN-O2-NEXT: Greedy Register Allocator
; GCN-O2-NEXT: GCN NSA Reassign
+; GCN-O2-NEXT: AMDGPU Eliminate AGPR-to-VGPR Copy
; GCN-O2-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA
; GCN-O2-NEXT: Virtual Register Rewriter
; GCN-O2-NEXT: AMDGPU Mark Last Scratch Load
@@ -1338,6 +1341,7 @@
; GCN-O3-NEXT: Live Register Matrix
; GCN-O3-NEXT: Greedy Register Allocator
; GCN-O3-NEXT: GCN NSA Reassign
+; GCN-O3-NEXT: AMDGPU Eliminate AGPR-to-VGPR Copy
; GCN-O3-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA
; GCN-O3-NEXT: Virtual Register Rewriter
; GCN-O3-NEXT: AMDGPU Mark Last Scratch Load
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index 6110b3101020a..fdf4dfd1eaefa 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -103,6 +103,9 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
; GFX90A-LABEL: test_mfma_loop_zeroinit:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
+; GFX90A-NEXT: s_mov_b32 s0, 16
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0
@@ -134,14 +137,42 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX90A-NEXT: s_mov_b32 s0, 16
-; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
; GFX90A-NEXT: .LBB0_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX90A-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX90A-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX90A-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX90A-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX90A-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX90A-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX90A-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX90A-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX90A-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX90A-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX90A-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX90A-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX90A-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX90A-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX90A-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX90A-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX90A-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX90A-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX90A-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX90A-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX90A-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX90A-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX90A-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX90A-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX90A-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX90A-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX90A-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX90A-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX90A-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX90A-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX90A-NEXT: s_add_i32 s0, s0, -1
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB0_1
; GFX90A-NEXT: ; %bb.2: ; %exit
@@ -149,7 +180,7 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
-; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: s_nop 5
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -163,6 +194,9 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
; GFX942-LABEL: test_mfma_loop_zeroinit:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
+; GFX942-NEXT: s_mov_b32 s0, 16
+; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
; GFX942-NEXT: v_accvgpr_write_b32 a29, 0
; GFX942-NEXT: v_accvgpr_write_b32 a28, 0
@@ -194,14 +228,42 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
; GFX942-NEXT: v_accvgpr_write_b32 a1, 0
; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
; GFX942-NEXT: .LBB0_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX942-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX942-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX942-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX942-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX942-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX942-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX942-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX942-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX942-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX942-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX942-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX942-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX942-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX942-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX942-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX942-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX942-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX942-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX942-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX942-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX942-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX942-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX942-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX942-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX942-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX942-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX942-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX942-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX942-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX942-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX942-NEXT: s_add_i32 s0, s0, -1
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB0_1
; GFX942-NEXT: ; %bb.2: ; %exit
@@ -209,7 +271,7 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
-; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: s_nop 4
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -338,45 +400,45 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000
; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_mov_b32 s0, 16
; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
; GFX90A-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX90A-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX90A-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX90A-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX90A-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX90A-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX90A-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX90A-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX90A-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX90A-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX90A-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX90A-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX90A-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX90A-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX90A-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX90A-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX90A-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX90A-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX90A-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX90A-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX90A-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX90A-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX90A-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX90A-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX90A-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX90A-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX90A-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX90A-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX90A-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX90A-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX90A-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX90A-NEXT: s_add_i32 s0, s0, -1
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB1_1
; GFX90A-NEXT: ; %bb.2: ; %exit
@@ -384,7 +446,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
-; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: s_nop 5
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -399,45 +461,45 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000
; GFX942-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
; GFX942-NEXT: s_mov_b32 s0, 16
; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
; GFX942-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX942-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX942-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX942-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX942-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX942-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX942-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX942-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX942-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX942-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX942-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX942-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX942-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX942-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX942-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX942-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX942-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX942-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX942-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX942-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX942-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX942-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX942-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX942-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX942-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX942-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX942-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX942-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX942-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX942-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX942-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX942-NEXT: s_add_i32 s0, s0, -1
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB1_1
; GFX942-NEXT: ; %bb.2: ; %exit
@@ -445,7 +507,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
-; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: s_nop 4
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -566,8 +628,11 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
;
; GFX90A-LABEL: test_mfma_loop_non_splat:
; GFX90A: ; %bb.0: ; %entry
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0
; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
+; GFX90A-NEXT: s_mov_b32 s0, 16
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0
; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0
@@ -598,14 +663,42 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX90A-NEXT: s_mov_b32 s0, 16
-; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
; GFX90A-NEXT: .LBB2_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX90A-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX90A-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX90A-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX90A-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX90A-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX90A-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX90A-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX90A-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX90A-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX90A-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX90A-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX90A-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX90A-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX90A-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX90A-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX90A-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX90A-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX90A-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX90A-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX90A-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX90A-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX90A-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX90A-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX90A-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX90A-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX90A-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX90A-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX90A-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX90A-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX90A-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX90A-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX90A-NEXT: s_add_i32 s0, s0, -1
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB2_1
; GFX90A-NEXT: ; %bb.2: ; %exit
@@ -613,7 +706,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
-; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: s_nop 5
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -626,8 +719,11 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
;
; GFX942-LABEL: test_mfma_loop_non_splat:
; GFX942: ; %bb.0: ; %entry
-; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0
; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
+; GFX942-NEXT: s_mov_b32 s0, 16
+; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0
; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
; GFX942-NEXT: v_accvgpr_write_b32 a29, 0
; GFX942-NEXT: v_accvgpr_write_b32 a28, 0
@@ -658,14 +754,42 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: v_accvgpr_write_b32 a3, 0
; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
; GFX942-NEXT: .LBB2_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; GFX942-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX942-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX942-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX942-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX942-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX942-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX942-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX942-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX942-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX942-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX942-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX942-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX942-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX942-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX942-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX942-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX942-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX942-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX942-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX942-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX942-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX942-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX942-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX942-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX942-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX942-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX942-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX942-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX942-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX942-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX942-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX942-NEXT: s_add_i32 s0, s0, -1
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB2_1
; GFX942-NEXT: ; %bb.2: ; %exit
@@ -673,7 +797,7 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
-; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: s_nop 4
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -865,75 +989,75 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x431a0000
; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43190000
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43180000
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43170000
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43160000
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43150000
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43140000
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43130000
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43120000
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43110000
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43100000
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430f0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430e0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430d0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430c0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430b0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430a0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43090000
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43080000
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43070000
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43060000
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43050000
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43040000
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43030000
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43020000
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43010000
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43000000
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fe0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fc0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fa0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f80000
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_mov_b32 s0, 16
; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
; GFX90A-NEXT: .LBB3_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX90A-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX90A-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX90A-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX90A-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX90A-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX90A-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX90A-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX90A-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX90A-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX90A-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX90A-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX90A-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX90A-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX90A-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX90A-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX90A-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX90A-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX90A-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX90A-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX90A-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX90A-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX90A-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX90A-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX90A-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX90A-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX90A-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX90A-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX90A-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX90A-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX90A-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX90A-NEXT: s_add_i32 s0, s0, -1
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB3_1
; GFX90A-NEXT: ; %bb.2: ; %exit
@@ -941,7 +1065,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
-; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: s_nop 5
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -957,75 +1081,75 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
; GFX942-NEXT: v_mov_b32_e32 v0, 0x431a0000
; GFX942-NEXT: v_accvgpr_write_b32 a31, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43190000
-; GFX942-NEXT: v_accvgpr_write_b32 a30, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43180000
-; GFX942-NEXT: v_accvgpr_write_b32 a29, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43170000
-; GFX942-NEXT: v_accvgpr_write_b32 a28, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43160000
-; GFX942-NEXT: v_accvgpr_write_b32 a27, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43150000
-; GFX942-NEXT: v_accvgpr_write_b32 a26, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43140000
-; GFX942-NEXT: v_accvgpr_write_b32 a25, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43130000
-; GFX942-NEXT: v_accvgpr_write_b32 a24, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43120000
-; GFX942-NEXT: v_accvgpr_write_b32 a23, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43110000
-; GFX942-NEXT: v_accvgpr_write_b32 a22, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43100000
-; GFX942-NEXT: v_accvgpr_write_b32 a21, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x430f0000
-; GFX942-NEXT: v_accvgpr_write_b32 a20, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x430e0000
-; GFX942-NEXT: v_accvgpr_write_b32 a19, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x430d0000
-; GFX942-NEXT: v_accvgpr_write_b32 a18, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x430c0000
-; GFX942-NEXT: v_accvgpr_write_b32 a17, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x430b0000
-; GFX942-NEXT: v_accvgpr_write_b32 a16, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x430a0000
-; GFX942-NEXT: v_accvgpr_write_b32 a15, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43090000
-; GFX942-NEXT: v_accvgpr_write_b32 a14, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43080000
-; GFX942-NEXT: v_accvgpr_write_b32 a13, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43070000
-; GFX942-NEXT: v_accvgpr_write_b32 a12, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43060000
-; GFX942-NEXT: v_accvgpr_write_b32 a11, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43050000
-; GFX942-NEXT: v_accvgpr_write_b32 a10, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43040000
-; GFX942-NEXT: v_accvgpr_write_b32 a9, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43030000
-; GFX942-NEXT: v_accvgpr_write_b32 a8, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43020000
-; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43010000
-; GFX942-NEXT: v_accvgpr_write_b32 a6, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x43000000
-; GFX942-NEXT: v_accvgpr_write_b32 a5, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fe0000
-; GFX942-NEXT: v_accvgpr_write_b32 a4, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fc0000
-; GFX942-NEXT: v_accvgpr_write_b32 a3, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fa0000
-; GFX942-NEXT: v_accvgpr_write_b32 a2, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f80000
-; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000
-; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
; GFX942-NEXT: s_mov_b32 s0, 16
; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
; GFX942-NEXT: .LBB3_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX942-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX942-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX942-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX942-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX942-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX942-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX942-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX942-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX942-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX942-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX942-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX942-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX942-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX942-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX942-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX942-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX942-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX942-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX942-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX942-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX942-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX942-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX942-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX942-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX942-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX942-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX942-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX942-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX942-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX942-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX942-NEXT: s_add_i32 s0, s0, -1
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB3_1
; GFX942-NEXT: ; %bb.2: ; %exit
@@ -1033,7 +1157,7 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
-; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: s_nop 4
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1156,45 +1280,45 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_mov_b32 s0, 16
; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
; GFX90A-NEXT: .LBB4_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX90A-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX90A-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX90A-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX90A-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX90A-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX90A-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX90A-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX90A-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX90A-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX90A-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX90A-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX90A-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX90A-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX90A-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX90A-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX90A-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX90A-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX90A-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX90A-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX90A-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX90A-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX90A-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX90A-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX90A-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX90A-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX90A-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX90A-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX90A-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX90A-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX90A-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX90A-NEXT: s_add_i32 s0, s0, -1
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB4_1
; GFX90A-NEXT: ; %bb.2: ; %exit
@@ -1202,7 +1326,7 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
-; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: s_nop 5
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1217,45 +1341,45 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
; GFX942-NEXT: s_mov_b32 s0, 16
; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
; GFX942-NEXT: .LBB4_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX942-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX942-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX942-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX942-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX942-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX942-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX942-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX942-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX942-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX942-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX942-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX942-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX942-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX942-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX942-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX942-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX942-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX942-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX942-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX942-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX942-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX942-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX942-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX942-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX942-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX942-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX942-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX942-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX942-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX942-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX942-NEXT: s_add_i32 s0, s0, -1
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB4_1
; GFX942-NEXT: ; %bb.2: ; %exit
@@ -1263,7 +1387,7 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
-; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: s_nop 4
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1429,43 +1553,43 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s1
; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
; GFX90A-NEXT: .LBB5_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX90A-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX90A-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX90A-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX90A-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX90A-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX90A-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX90A-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX90A-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX90A-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX90A-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX90A-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX90A-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX90A-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX90A-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX90A-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX90A-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX90A-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX90A-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX90A-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX90A-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX90A-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX90A-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX90A-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX90A-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX90A-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX90A-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX90A-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX90A-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX90A-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX90A-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX90A-NEXT: s_add_i32 s0, s0, -1
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB5_1
; GFX90A-NEXT: ; %bb.2: ; %exit
@@ -1473,7 +1597,7 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
-; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: s_nop 5
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1492,43 +1616,43 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v0, s1
; GFX942-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
; GFX942-NEXT: .LBB5_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX942-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX942-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX942-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX942-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX942-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX942-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX942-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX942-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX942-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX942-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX942-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX942-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX942-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX942-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX942-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX942-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX942-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX942-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX942-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX942-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX942-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX942-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX942-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX942-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX942-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX942-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX942-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX942-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX942-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX942-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX942-NEXT: s_add_i32 s0, s0, -1
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB5_1
; GFX942-NEXT: ; %bb.2: ; %exit
@@ -1536,7 +1660,7 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
-; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: s_nop 4
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1695,11 +1819,11 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c
; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
+; GFX90A-NEXT: s_mov_b32 s0, 16
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s1
; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0
@@ -1728,18 +1852,46 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0
; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
-; GFX90A-NEXT: s_mov_b32 s0, 16
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
; GFX90A-NEXT: .LBB6_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX90A-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX90A-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX90A-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX90A-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX90A-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX90A-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX90A-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX90A-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX90A-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX90A-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX90A-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX90A-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX90A-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX90A-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX90A-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX90A-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX90A-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX90A-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX90A-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX90A-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX90A-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX90A-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX90A-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX90A-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX90A-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX90A-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX90A-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX90A-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX90A-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX90A-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX90A-NEXT: s_add_i32 s0, s0, -1
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB6_1
; GFX90A-NEXT: ; %bb.2: ; %exit
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
@@ -1759,11 +1911,11 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c
; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
+; GFX942-NEXT: s_mov_b32 s0, 16
+; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
-; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s1
; GFX942-NEXT: v_accvgpr_write_b32 a29, 0
; GFX942-NEXT: v_accvgpr_write_b32 a28, 0
; GFX942-NEXT: v_accvgpr_write_b32 a27, 0
@@ -1792,18 +1944,46 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
; GFX942-NEXT: v_accvgpr_write_b32 a4, 0
; GFX942-NEXT: v_accvgpr_write_b32 a3, 0
; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
-; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
; GFX942-NEXT: .LBB6_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX942-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX942-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX942-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX942-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX942-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX942-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX942-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX942-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX942-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX942-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX942-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX942-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX942-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX942-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX942-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX942-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX942-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX942-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX942-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX942-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX942-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX942-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX942-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX942-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX942-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX942-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX942-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX942-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX942-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX942-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX942-NEXT: s_add_i32 s0, s0, -1
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB6_1
; GFX942-NEXT: ; %bb.2: ; %exit
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
@@ -2152,42 +2332,43 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
-; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a15, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a19, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a23, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0
; GFX90A-NEXT: .LBB8_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX90A-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX90A-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX90A-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX90A-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX90A-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX90A-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX90A-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX90A-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX90A-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX90A-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX90A-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX90A-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX90A-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX90A-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX90A-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX90A-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX90A-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX90A-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX90A-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX90A-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX90A-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX90A-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX90A-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX90A-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX90A-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX90A-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX90A-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX90A-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX90A-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX90A-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX90A-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX90A-NEXT: s_add_i32 s0, s0, -1
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB8_1
; GFX90A-NEXT: ; %bb.2: ; %exit
@@ -2195,7 +2376,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
-; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: s_nop 5
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -2216,42 +2397,43 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a8, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a9, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a10, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a11, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a12, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a13, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a14, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a15, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a16, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a17, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a18, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a19, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a20, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a21, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a22, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a23, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a24, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a25, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a26, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a27, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a28, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a29, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a30, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0
; GFX942-NEXT: .LBB8_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; GFX942-NEXT: ; kill: def $agpr30 killed $agpr30
+; GFX942-NEXT: ; kill: def $agpr29 killed $agpr29
+; GFX942-NEXT: ; kill: def $agpr28 killed $agpr28
+; GFX942-NEXT: ; kill: def $agpr27 killed $agpr27
+; GFX942-NEXT: ; kill: def $agpr26 killed $agpr26
+; GFX942-NEXT: ; kill: def $agpr25 killed $agpr25
+; GFX942-NEXT: ; kill: def $agpr24 killed $agpr24
+; GFX942-NEXT: ; kill: def $agpr23 killed $agpr23
+; GFX942-NEXT: ; kill: def $agpr22 killed $agpr22
+; GFX942-NEXT: ; kill: def $agpr21 killed $agpr21
+; GFX942-NEXT: ; kill: def $agpr20 killed $agpr20
+; GFX942-NEXT: ; kill: def $agpr19 killed $agpr19
+; GFX942-NEXT: ; kill: def $agpr18 killed $agpr18
+; GFX942-NEXT: ; kill: def $agpr17 killed $agpr17
+; GFX942-NEXT: ; kill: def $agpr16 killed $agpr16
+; GFX942-NEXT: ; kill: def $agpr15 killed $agpr15
+; GFX942-NEXT: ; kill: def $agpr14 killed $agpr14
+; GFX942-NEXT: ; kill: def $agpr13 killed $agpr13
+; GFX942-NEXT: ; kill: def $agpr12 killed $agpr12
+; GFX942-NEXT: ; kill: def $agpr11 killed $agpr11
+; GFX942-NEXT: ; kill: def $agpr10 killed $agpr10
+; GFX942-NEXT: ; kill: def $agpr9 killed $agpr9
+; GFX942-NEXT: ; kill: def $agpr8 killed $agpr8
+; GFX942-NEXT: ; kill: def $agpr7 killed $agpr7
+; GFX942-NEXT: ; kill: def $agpr6 killed $agpr6
+; GFX942-NEXT: ; kill: def $agpr5 killed $agpr5
+; GFX942-NEXT: ; kill: def $agpr4 killed $agpr4
+; GFX942-NEXT: ; kill: def $agpr3 killed $agpr3
+; GFX942-NEXT: ; kill: def $agpr2 killed $agpr2
+; GFX942-NEXT: ; kill: def $agpr1 killed $agpr1
+; GFX942-NEXT: ; kill: def $agpr0 killed $agpr0
; GFX942-NEXT: s_add_i32 s0, s0, -1
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB8_1
; GFX942-NEXT: ; %bb.2: ; %exit
@@ -2259,7 +2441,7 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
-; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: s_nop 4
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
index ea6449b99b516..aaca68f3d606f 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
@@ -31,6 +31,7 @@
; DEFAULT-NEXT: Live Register Matrix
; DEFAULT-NEXT: Greedy Register Allocator
; DEFAULT-NEXT: GCN NSA Reassign
+; DEFAULT-NEXT: AMDGPU Eliminate AGPR-to-VGPR Copy
; DEFAULT-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA
; DEFAULT-NEXT: Virtual Register Rewriter
; DEFAULT-NEXT: AMDGPU Mark Last Scratch Load
@@ -78,6 +79,7 @@
; BASIC-DEFAULT-NEXT: Live Register Matrix
; BASIC-DEFAULT-NEXT: Greedy Register Allocator
; BASIC-DEFAULT-NEXT: GCN NSA Reassign
+; BASIC-DEFAULT-NEXT: AMDGPU Eliminate AGPR-to-VGPR Copy
; BASIC-DEFAULT-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA
; BASIC-DEFAULT-NEXT: Virtual Register Rewriter
; BASIC-DEFAULT-NEXT: AMDGPU Mark Last Scratch Load
@@ -101,6 +103,7 @@
; DEFAULT-BASIC-NEXT: Live Register Matrix
; DEFAULT-BASIC-NEXT: Basic Register Allocator
; DEFAULT-BASIC-NEXT: GCN NSA Reassign
+; DEFAULT-BASIC-NEXT: AMDGPU Eliminate AGPR-to-VGPR Copy
; DEFAULT-BASIC-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA
; DEFAULT-BASIC-NEXT: Virtual Register Rewriter
; DEFAULT-BASIC-NEXT: AMDGPU Mark Last Scratch Load
@@ -130,6 +133,7 @@
; BASIC-BASIC-NEXT: Live Register Matrix
; BASIC-BASIC-NEXT: Basic Register Allocator
; BASIC-BASIC-NEXT: GCN NSA Reassign
+; BASIC-BASIC-NEXT: AMDGPU Eliminate AGPR-to-VGPR Copy
; BASIC-BASIC-NEXT: AMDGPU Rewrite AGPR-Copy-MFMA
; BASIC-BASIC-NEXT: Virtual Register Rewriter
; BASIC-BASIC-NEXT: AMDGPU Mark Last Scratch Load
diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
index eb0d5465cacd9..b43c19f44614c 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
@@ -77,26 +77,18 @@ define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, ptr addrspace(1) %arg,
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_endpgm
; GFX90A-NEXT: .LBB0_2: ; %use
-; GFX90A-NEXT: s_nop 3
-; GFX90A-NEXT: v_accvgpr_read_b32 v9, a7
-; GFX90A-NEXT: v_accvgpr_read_b32 v8, a6
-; GFX90A-NEXT: v_accvgpr_read_b32 v7, a5
-; GFX90A-NEXT: v_accvgpr_read_b32 v6, a4
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: v_accvgpr_write_b32 a4, 4
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_accvgpr_write_b32 a8, 5
; GFX90A-NEXT: v_accvgpr_write_b32 a9, 1
; GFX90A-NEXT: v_accvgpr_write_b32 a10, 2
; GFX90A-NEXT: v_accvgpr_write_b32 a11, 3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v6
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v7
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v8
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v9
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v2, v0
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
@@ -155,26 +147,14 @@ define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v5, a3
-; GFX90A-NEXT: v_accvgpr_read_b32 v4, a2
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v3
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v5
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_endpgm
@@ -331,7 +311,6 @@ define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 {
; GFX90A-NEXT: v_accvgpr_write_b32 a1, 2.0
; GFX90A-NEXT: v_accvgpr_write_b32 a29, v2
; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41f80000
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0
; GFX90A-NEXT: v_accvgpr_write_b32 a0, 1.0
; GFX90A-NEXT: v_accvgpr_write_b32 a3, 4.0
; GFX90A-NEXT: v_accvgpr_write_b32 a30, v2
@@ -339,13 +318,12 @@ define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 {
; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v1, a[0:31]
-; GFX90A-NEXT: s_nop 7
-; GFX90A-NEXT: s_nop 7
-; GFX90A-NEXT: s_nop 2
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: global_store_dword v0, a0, s[2:3]
; GFX90A-NEXT: s_endpgm
bb:
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
index 3d11ce566207a..d907aee996390 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
@@ -141,6 +141,7 @@ static_library("LLVMAMDGPUCodeGen") {
"AMDGPUCodeGenPrepare.cpp",
"AMDGPUCombinerHelper.cpp",
"AMDGPUCtorDtorLowering.cpp",
+ "AMDGPUEliminateAGPRToVGPRCopy.cpp",
"AMDGPUExportClustering.cpp",
"AMDGPUExportKernelRuntimeHandles.cpp",
"AMDGPUFrameLowering.cpp",
>From 11e81228f033fe171d08064277ca40259c268d77 Mon Sep 17 00:00:00 2001
From: Lucas Ramirez <lucas.rami at proton.me>
Date: Fri, 15 Aug 2025 23:19:03 +0000
Subject: [PATCH 2/2] Format
---
llvm/lib/Target/AMDGPU/AMDGPUEliminateAGPRToVGPRCopy.cpp | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUEliminateAGPRToVGPRCopy.cpp b/llvm/lib/Target/AMDGPU/AMDGPUEliminateAGPRToVGPRCopy.cpp
index 66e00e13149de..5dc1430eaf771 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUEliminateAGPRToVGPRCopy.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUEliminateAGPRToVGPRCopy.cpp
@@ -129,10 +129,11 @@ bool AMDGPUEliminateAGPRToVGPRCopyImpl::run(MachineFunction &MF) const {
if (SIInstrInfo::isMFMA(UseMI)) {
if (&MO != TII.getNamedOperand(UseMI, AMDGPU::OpName::src0) &&
&MO != TII.getNamedOperand(UseMI, AMDGPU::OpName::src1)) {
- LLVM_DEBUG(dbgs() << " Incompatible MFMA operand: " << UseMI);
+ LLVM_DEBUG(dbgs()
+ << " Incompatible MFMA operand: " << UseMI);
return false;
}
- } else if (!UseMI.isFullCopy()){
+ } else if (!UseMI.isFullCopy()) {
LLVM_DEBUG(dbgs() << " Incompatible user: " << UseMI);
return false;
}
More information about the llvm-commits
mailing list