[llvm] [AMDGPU] Register allocation anti-hints to reduce MFMA hazard NOPs (PR #156943)

Syadus Sefat via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 19 11:19:57 PST 2025


https://github.com/mssefat updated https://github.com/llvm/llvm-project/pull/156943

>From c77788ff271dd58f3f50e2f097c478c96ab3ef0d Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Thu, 4 Sep 2025 12:31:49 -0400
Subject: [PATCH 01/20] [AMDGPU] Improve register allocation to reduce MFMA
 hazard NOPs

rebased
---
 .../Target/AMDGPU/GCNPreRAOptimizations.cpp   |   94 ++
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h |   14 +
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp     |   32 +
 llvm/lib/Target/AMDGPU/SIRegisterInfo.h       |    4 +-
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir |   40 +
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir |  542 +++----
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll |  112 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll |  270 ++--
 ...amdgcn.mfma.hint.hazard.barrier.gfx942.mir | 1292 +++++++++++++++++
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll  |  146 +-
 ...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll |  159 +-
 .../AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll    |   12 +-
 .../AMDGPU/rewrite-vgpr-mfma-to-agpr.ll       |   19 +-
 .../unspill-vgpr-after-rewrite-vgpr-mfma.ll   |   33 +-
 14 files changed, 2129 insertions(+), 640 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir

diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 4deb2a9485e4d..6d2b10bdb5804 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -34,6 +34,7 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -43,6 +44,12 @@ using namespace llvm;
 
 #define DEBUG_TYPE "amdgpu-pre-ra-optimizations"
 
+static cl::opt<bool> EnableRegisterAvoidListForMFMARegs(
+    "amdgpu-avoid-hazard-hint-for-mfma", cl::Hidden,
+    cl::desc("Enable Register Avoidance for "
+             "MFMA in GCNPreRAOptimizations stage."),
+    cl::init(true));
+
 namespace {
 
 class GCNPreRAOptimizationsImpl {
@@ -248,6 +255,93 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
 
   bool Changed = false;
 
+  // Collect MFMA register-avoidance hints in one forward pass per block.
+  if (EnableRegisterAvoidListForMFMARegs && ST.hasMAIInsts()) {
+    // Max number of recent MFMAs (per block) kept for RAW or WAW hazards
+    constexpr unsigned MaxLookbackWindow = 19;
+    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+    for (const MachineBasicBlock &MBB : MF) {
+
+      SmallVector<std::pair<SlotIndex, SmallVector<Register, 4>>, 16>
+          RecentMFMAs;
+      for (const MachineInstr &MI : MBB) {
+        if (MI.isDebugInstr())
+          continue;
+        const SlotIndex CurrentSlot = LIS->getInstructionIndex(MI).getRegSlot();
+        // Handle MFMA instructions
+        if (SIInstrInfo::isMFMA(MI)) {
+          SmallVector<Register, 4> MFMARegisters;
+          auto collectMFMARegister = [&](unsigned OpIdx) {
+            if (OpIdx >= MI.getNumOperands())
+              return;
+
+            const MachineOperand &MO = MI.getOperand(OpIdx);
+            if (MO.isReg() && MO.getReg().isVirtual())
+              MFMARegisters.push_back(MO.getReg());
+          };
+          // Only collect Matrix C (operand 3) and destination (operand 0)
+          // registers
+          collectMFMARegister(0);
+          collectMFMARegister(3);
+
+          if (!MFMARegisters.empty()) {
+            RecentMFMAs.emplace_back(CurrentSlot, std::move(MFMARegisters));
+            // Maintain window
+            if (RecentMFMAs.size() > MaxLookbackWindow)
+              RecentMFMAs.erase(RecentMFMAs.begin());
+          }
+          continue;
+        }
+        bool ShouldCheckReuse = MI.mayLoad() || MI.mayStore() || MI.isCopy() ||
+                                SIInstrInfo::isVALU(MI);
+        // Skip non-relevant instructions, or skip until at least one MFMA is
+        // encountered
+        if (!ShouldCheckReuse || RecentMFMAs.empty())
+          continue;
+
+        // Process operands that might reuse MFMA registers
+        for (const MachineOperand &MO : MI.operands()) {
+          if (!MO.isReg() || !MO.getReg().isVirtual())
+            continue;
+
+          const Register CandidateReg = MO.getReg();
+          const TargetRegisterClass *CandidateRC =
+              MRI->getRegClass(CandidateReg);
+
+          // Only process VGPR registers
+          if (!TRI->isVGPRClass(CandidateRC))
+            continue;
+
+          for (auto It = RecentMFMAs.rbegin(); It != RecentMFMAs.rend(); ++It) {
+            const SmallVector<Register, 4> &MFMARegs = It->second;
+            for (Register MFMAReg : MFMARegs) {
+              // Skip MFMA registers whose register class has no VGPRs
+              const TargetRegisterClass *MFMARC = MRI->getRegClass(MFMAReg);
+              if (!TRI->hasVGPRs(MFMARC))
+                continue;
+
+              // Check if MFMA register is dead at current instruction
+              const LiveInterval &MFMAInterval = LIS->getInterval(MFMAReg);
+              if (!MFMAInterval.liveAt(CurrentSlot)) {
+
+                // Add bidirectional avoidance hint
+                MFI->addRegisterToAvoid(CandidateReg, MFMAReg);
+                MFI->addRegisterToAvoid(MFMAReg, CandidateReg);
+
+                // Tag both registers with the avoidance-list hint type
+                MRI->setRegAllocationHint(
+                    MFMAReg, AMDGPURI::HasRegisterAvoidanceList, Register());
+                MRI->setRegAllocationHint(CandidateReg,
+                                          AMDGPURI::HasRegisterAvoidanceList,
+                                          Register());
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
   for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
     Register Reg = Register::index2VirtReg(I);
     if (!LIS->hasInterval(Reg))
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 2c1a13c345aac..46844d5c3fb87 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -1220,6 +1220,20 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   unsigned getMaxNumWorkGroupsZ() const { return MaxNumWorkGroups[2]; }
 
   AMDGPU::ClusterDimsAttr getClusterDims() const { return ClusterDims; }
+
+  // Map of registers to avoid for a given register
+  DenseMap<Register, SmallVector<Register, 8>> RegisterAvoidanceMap;
+
+  void addRegisterToAvoid(Register VirtReg, Register AvoidReg) {
+    RegisterAvoidanceMap[VirtReg].push_back(AvoidReg);
+  }
+
+  ArrayRef<Register> getRegistersToAvoid(Register VirtReg) const {
+    auto It = RegisterAvoidanceMap.find(VirtReg);
+    if (It != RegisterAvoidanceMap.end())
+      return It->second;
+    return ArrayRef<Register>();
+  }
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index ebd2e7ecf249e..819a6c24ecade 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3831,6 +3831,38 @@ bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
     }
     return false;
   }
+  case AMDGPURI::HasRegisterAvoidanceList: {
+    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+    ArrayRef<Register> AvoidRegs = MFI->getRegistersToAvoid(VirtReg);
+
+    if (AvoidRegs.empty())
+      return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
+                                                       MF, VRM);
+    // Collect physical registers to avoid
+    SmallSet<MCPhysReg, 32> AvoidPhysRegs;
+    for (Register AvoidReg : AvoidRegs) {
+      if (VRM && VRM->hasPhys(AvoidReg)) {
+        // Virtual register already mapped - try to avoid its physical register
+        MCPhysReg AvoidPhys = VRM->getPhys(AvoidReg);
+        for (MCRegAliasIterator AI(AvoidPhys, this, true); AI.isValid(); ++AI)
+          AvoidPhysRegs.insert(*AI);
+      }
+    }
+
+    if (AvoidPhysRegs.empty()) {
+      // No physical registers added yet - use default order
+      return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
+                                                       MF, VRM);
+    }
+
+    // Prioritize registers that don't conflict with avoided registers
+    for (MCPhysReg PhysReg : Order) {
+      if (!AvoidPhysRegs.count(PhysReg) && !MRI.isReserved(PhysReg))
+        Hints.push_back(PhysReg);
+    }
+
+    return false;
+  }
   default:
     return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
                                                      VRM);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 7b91ba7bc581f..ed0c580abc952 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -31,9 +31,11 @@ class RegisterBank;
 struct SGPRSpillBuilder;
 
 /// Register allocation hint types. Helps eliminate unneeded COPY with True16
+/// HasRegisterAvoidanceList helps minimize the use of conflicting physical
+/// registers.
 namespace AMDGPURI {
 
-enum { Size16 = 1, Size32 = 2 };
+enum { Size16 = 1, Size32 = 2, HasRegisterAvoidanceList = 3 };
 
 } // end namespace AMDGPURI
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
index 689d1472d6010..8fbfe2e591dfe 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
@@ -6,6 +6,23 @@
   define amdgpu_kernel void @largeInterleave() #0 { ret void }
   ; GCN-LABEL: largeInterleave:
   ; GCN:       ; %bb.0:
+  ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+  ; GCN-NEXT:    ; implicit-def: $vgpr0
+  ; GCN-NEXT:    ; implicit-def: $vgpr2
+  ; GCN-NEXT:    ; implicit-def: $vgpr1
+  ; GCN-NEXT:    ; implicit-def: $vgpr8
+  ; GCN-NEXT:    ; implicit-def: $vgpr94
+  ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+  ; GCN-NEXT:    ; implicit-def: $vgpr106
+  ; GCN-NEXT:    ; implicit-def: $vgpr132
+  ; GCN-NEXT:    ; implicit-def: $vgpr112
+  ; GCN-NEXT:    ; implicit-def: $vgpr113
+  ; GCN-NEXT:    ; implicit-def: $vgpr114
+  ; GCN-NEXT:    ; implicit-def: $vgpr115
+  ; GCN-NEXT:    ; implicit-def: $vgpr133
+  ; GCN-NEXT:    ; implicit-def: $vgpr139
+  ; GCN-NEXT:    ; iglp_opt mask(0x00000002)
+  ; GCN-NEXT:    ; implicit-def: $sgpr0
   ; GCN-NEXT:    ; implicit-def: $vgpr16
   ; GCN-NEXT:    ; implicit-def: $vgpr25
   ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -202,30 +219,45 @@
   ; GCN-NEXT:    ds_write_b128 v230, v[152:155]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_write_b128 v95, v[68:71] offset:1024
   ; GCN-NEXT:    ds_write_b128 v230, v[160:163] offset:1024
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:192 sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
+  ; GCN-NEXT:    v_add_u32_e32 v72, 0xc0, v93
+  ; GCN-NEXT:    v_add_u32_e32 v73, v132, v112
+  ; GCN-NEXT:    buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[184:185], v[156:157], v[64:79]
   ; GCN-NEXT:    buffer_load_dwordx4 v[226:229], v227, s[8:11], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ; kill: killed $vgpr72
+  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v113
+  ; GCN-NEXT:    buffer_load_dwordx2 v[98:99], v73, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    buffer_load_dwordx2 v[160:161], v231, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    buffer_load_dwordx2 v[162:163], v232, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v114
+  ; GCN-NEXT:    buffer_load_dwordx2 v[100:101], v72, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    buffer_load_dwordx2 v[172:173], v233, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v115
+  ; GCN-NEXT:    buffer_load_dwordx2 v[104:105], v72, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    buffer_load_dwordx2 v[174:175], v234, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
+  ; GCN-NEXT:    ; kill: killed $vgpr73
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v94
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[186:187], v[158:159], v[64:79]
   ; GCN-NEXT:    v_perm_b32 v238, v162, v160, s5
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[218:219], v[140:141], v[112:127]
@@ -235,6 +267,14 @@
   ; GCN-NEXT:    ds_read_b128 v[160:163], v213
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
+  ; GCN-NEXT:    ; implicit-def: $sgpr8
+  ; GCN-NEXT:    ; implicit-def: $vgpr112
+  ; GCN-NEXT:    ; implicit-def: $vgpr113
+  ; GCN-NEXT:    ; implicit-def: $vgpr114
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v94 offset:512
   ; GCN-NEXT:    v_perm_b32 v239, v174, v172, s5
   ; GCN-NEXT:    v_perm_b32 v241, v174, v172, s7
   ; GCN-NEXT:    v_perm_b32 v243, v175, v173, s5
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
index 0887fdf0844b0..be97a1e82fcf2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
@@ -10,25 +10,24 @@
   ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
   ; GCN-NEXT:    v_readfirstlane_b32 s20, v2
   ; GCN-NEXT:    ; implicit-def: $sgpr4
-  ; GCN-NEXT:    ; implicit-def: $vgpr3
+  ; GCN-NEXT:    ; implicit-def: $vgpr64
   ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
   ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:    ; implicit-def: $vgpr50
+  ; GCN-NEXT:    ; implicit-def: $vgpr76
   ; GCN-NEXT:    ; implicit-def: $sgpr16_sgpr17_sgpr18_sgpr19
   ; GCN-NEXT:    ; implicit-def: $vgpr49
   ; GCN-NEXT:    ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43
-  ; GCN-NEXT:    ; implicit-def: $vgpr51
-  ; GCN-NEXT:    ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65
-  ; GCN-NEXT:    ; implicit-def: $vgpr76
+  ; GCN-NEXT:    ; implicit-def: $vgpr50
   ; GCN-NEXT:    ; implicit-def: $vgpr77
   ; GCN-NEXT:    ; implicit-def: $vgpr78
   ; GCN-NEXT:    ; implicit-def: $vgpr79
   ; GCN-NEXT:    ; implicit-def: $vgpr80
-  ; GCN-NEXT:    ; implicit-def: $vgpr91
+  ; GCN-NEXT:    ; implicit-def: $vgpr81
+  ; GCN-NEXT:    ; implicit-def: $vgpr103
   ; GCN-NEXT:    ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19
   ; GCN-NEXT:    ; iglp_opt mask(0x00000002)
   ; GCN-NEXT:    s_nop 1
-  ; GCN-NEXT:    v_lshl_add_u32 v2, s20, 4, v3
+  ; GCN-NEXT:    v_lshl_add_u32 v2, s20, 4, v64
   ; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s4, v2, v[0:1]
   ; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -36,8 +35,9 @@
   ; GCN-NEXT:    s_lshl_b32 s4, s20, 7
   ; GCN-NEXT:    ; implicit-def: $vgpr5
   ; GCN-NEXT:    v_add_lshl_u32 v48, v5, s4, 1
-  ; GCN-NEXT:    v_add_u32_e32 v76, s20, v76
-  ; GCN-NEXT:    v_and_b32_e32 v76, 0x1fffffff, v76
+  ; GCN-NEXT:    ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65
+  ; GCN-NEXT:    v_add_u32_e32 v77, s20, v77
+  ; GCN-NEXT:    v_and_b32_e32 v77, 0x1fffffff, v77
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    ds_write_b128 v48, v[0:3]
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -48,8 +48,8 @@
   ; GCN-NEXT:    ; implicit-def: $vgpr1
   ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
   ; GCN-NEXT:    ; implicit-def: $sgpr6
-  ; GCN-NEXT:    v_add_u32_e32 v0, v0, v50
-  ; GCN-NEXT:    v_add_u32_e32 v1, v1, v50
+  ; GCN-NEXT:    v_add_u32_e32 v0, v0, v76
+  ; GCN-NEXT:    v_add_u32_e32 v1, v1, v76
   ; GCN-NEXT:    buffer_load_dwordx2 v[72:73], v0, s[16:19], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
@@ -68,22 +68,22 @@
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], 0
   ; GCN-NEXT:    ; kill: killed $vgpr1
   ; GCN-NEXT:    ; kill: killed $vgpr0
-  ; GCN-NEXT:    v_mul_lo_u32 v76, v76, s6
-  ; GCN-NEXT:    v_add_lshl_u32 v76, v77, v76, 1
-  ; GCN-NEXT:    v_lshl_add_u32 v77, v78, 1, v76
-  ; GCN-NEXT:    ; implicit-def: $sgpr5
+  ; GCN-NEXT:    v_mul_lo_u32 v77, v77, s6
+  ; GCN-NEXT:    v_add_lshl_u32 v77, v78, v77, 1
   ; GCN-NEXT:    v_lshl_add_u32 v78, v79, 1, v77
+  ; GCN-NEXT:    ; implicit-def: $sgpr5
+  ; GCN-NEXT:    v_lshl_add_u32 v79, v80, 1, v78
   ; GCN-NEXT:    ; implicit-def: $sgpr2
   ; GCN-NEXT:    ; implicit-def: $sgpr3
-  ; GCN-NEXT:    v_lshl_add_u32 v79, v80, 1, v78
+  ; GCN-NEXT:    v_lshl_add_u32 v80, v81, 1, v79
   ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], 0
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31]
-  ; GCN-NEXT:    ds_read_b128 v[36:39], v51
+  ; GCN-NEXT:    ds_read_b128 v[36:39], v50
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15]
-  ; GCN-NEXT:    ds_read_b128 v[44:47], v51 offset:512
+  ; GCN-NEXT:    ds_read_b128 v[44:47], v50 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43
@@ -107,20 +107,20 @@
   ; GCN-NEXT:    ds_read_b128 v[40:43], v49 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[68:71], v51
+  ; GCN-NEXT:    ds_read_b128 v[68:71], v50
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[36:37], v[16:31]
   ; GCN-NEXT:    ; implicit-def: $vgpr32
   ; GCN-NEXT:    ; implicit-def: $vgpr33
-  ; GCN-NEXT:    v_add_u32_e32 v82, v32, v50
-  ; GCN-NEXT:    v_add_u32_e32 v83, v33, v50
-  ; GCN-NEXT:    ; kill: killed $vgpr82
+  ; GCN-NEXT:    v_add_u32_e32 v83, v32, v76
+  ; GCN-NEXT:    v_add_u32_e32 v76, v33, v76
   ; GCN-NEXT:    ; kill: killed $vgpr83
+  ; GCN-NEXT:    ; kill: killed $vgpr76
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[38:39], v[16:31]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[40:41], v[36:37], v[0:15]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[68:69], v[62:63], v[16:31]
-  ; GCN-NEXT:    ds_read_b128 v[66:69], v51 offset:512
+  ; GCN-NEXT:    ds_read_b128 v[66:69], v50 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
@@ -131,20 +131,20 @@
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[62:63], v[0:15]
   ; GCN-NEXT:    ; implicit-def: $vgpr66
   ; GCN-NEXT:    ; implicit-def: $vgpr67
-  ; GCN-NEXT:    v_max_f32_e32 v81, v67, v67
+  ; GCN-NEXT:    v_max_f32_e32 v82, v67, v67
   ; GCN-NEXT:    ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[70:71], v[64:65], v[16:31]
   ; GCN-NEXT:    v_perm_b32 v70, v74, v72, s2
   ; GCN-NEXT:    v_perm_b32 v71, v74, v72, s3
   ; GCN-NEXT:    v_perm_b32 v72, v75, v73, s2
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
-  ; GCN-NEXT:    ds_write_b32 v76, v70
+  ; GCN-NEXT:    ds_write_b32 v77, v70
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v77, v71
+  ; GCN-NEXT:    ds_write_b32 v78, v71
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v78, v72
+  ; GCN-NEXT:    ds_write_b32 v79, v72
   ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v20
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15]
   ; GCN-NEXT:    v_mul_f32_e32 v64, s4, v16
@@ -152,11 +152,11 @@
   ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v18
   ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v19
   ; GCN-NEXT:    v_max3_f32 v64, v64, s5, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v21
+  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v21
   ; GCN-NEXT:    v_max3_f32 v64, v64, v68, v69
   ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v22
   ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v23
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v80
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v81
   ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v24
   ; GCN-NEXT:    v_mul_f32_e32 v87, s4, v25
   ; GCN-NEXT:    v_max3_f32 v64, v64, v84, v85
@@ -166,12 +166,12 @@
   ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v28
   ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v29
   ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v68
-  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v30
+  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v30
   ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v31
   ; GCN-NEXT:    v_max3_f32 v64, v64, v69, v74
   ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v0
   ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v1
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v80, v84
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v81, v84
   ; GCN-NEXT:    v_mul_f32_e32 v87, s4, v2
   ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v3
   ; GCN-NEXT:    v_max3_f32 v64, v64, v85, v86
@@ -179,315 +179,315 @@
   ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v5
   ; GCN-NEXT:    v_max3_f32 v64, v64, v87, v65
   ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v6
-  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v7
+  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v7
   ; GCN-NEXT:    v_max3_f32 v64, v64, v68, v69
   ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v8
   ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v9
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v80
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v81
   ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v10
   ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v11
   ; GCN-NEXT:    v_max3_f32 v64, v64, v84, v85
   ; GCN-NEXT:    v_mul_f32_e32 v87, s4, v12
   ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v13
   ; GCN-NEXT:    v_max3_f32 v64, v64, v86, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v14
-  ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v15
   ; GCN-NEXT:    v_max3_f32 v64, v64, v87, v68
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v69, v74
-  ; GCN-NEXT:    ds_bpermute_b32 v65, v66, v64
   ; GCN-NEXT:    v_perm_b32 v68, v75, v73, s3
+  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v14
+  ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v15
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v79, v68
-  ; GCN-NEXT:    ; implicit-def: $vgpr84
-  ; GCN-NEXT:    v_max_f32_e32 v65, v65, v65
-  ; GCN-NEXT:    v_max_f32_e32 v70, v64, v65
+  ; GCN-NEXT:    ds_write_b32 v80, v68
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v69, v74
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_load_dwordx2 v[64:65], v82, s[16:19], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[70:71], v76, s[16:19], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_bpermute_b32 v71, v66, v70
+  ; GCN-NEXT:    ds_bpermute_b32 v65, v66, v64
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    ; implicit-def: $vgpr87
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_cndmask_b32_e64 v70, v71, v70, s[0:1]
-  ; GCN-NEXT:    v_max_f32_e32 v70, v70, v70
-  ; GCN-NEXT:    v_max_f32_e32 v72, v81, v70
-  ; GCN-NEXT:    v_fma_f32 v16, s4, v16, -v72
-  ; GCN-NEXT:    v_fma_f32 v18, s4, v18, -v72
-  ; GCN-NEXT:    v_fma_f32 v19, s4, v19, -v72
+  ; GCN-NEXT:    v_max_f32_e32 v65, v65, v65
+  ; GCN-NEXT:    v_max_f32_e32 v64, v64, v65
+  ; GCN-NEXT:    ds_bpermute_b32 v65, v66, v64
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    v_cndmask_b32_e64 v64, v65, v64, s[0:1]
+  ; GCN-NEXT:    v_max_f32_e32 v64, v64, v64
+  ; GCN-NEXT:    v_max_f32_e32 v65, v82, v64
+  ; GCN-NEXT:    v_fma_f32 v16, s4, v16, -v65
+  ; GCN-NEXT:    v_fma_f32 v17, s4, v17, -v65
+  ; GCN-NEXT:    v_fma_f32 v18, s4, v18, -v65
+  ; GCN-NEXT:    v_fma_f32 v19, s4, v19, -v65
   ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v17
   ; GCN-NEXT:    v_mul_f32_e32 v18, 0x3fb8aa3b, v18
   ; GCN-NEXT:    v_mul_f32_e32 v19, 0x3fb8aa3b, v19
-  ; GCN-NEXT:    v_fma_f32 v17, s4, v17, -v72
-  ; GCN-NEXT:    v_fma_f32 v20, s4, v20, -v72
-  ; GCN-NEXT:    v_fma_f32 v21, s4, v21, -v72
-  ; GCN-NEXT:    v_fma_f32 v22, s4, v22, -v72
-  ; GCN-NEXT:    v_fma_f32 v23, s4, v23, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v73, v16
-  ; GCN-NEXT:    v_exp_f32_e32 v74, v18
-  ; GCN-NEXT:    v_exp_f32_e32 v75, v19
+  ; GCN-NEXT:    v_fma_f32 v20, s4, v20, -v65
+  ; GCN-NEXT:    v_fma_f32 v21, s4, v21, -v65
+  ; GCN-NEXT:    v_fma_f32 v22, s4, v22, -v65
+  ; GCN-NEXT:    v_fma_f32 v23, s4, v23, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v72, v16
+  ; GCN-NEXT:    v_exp_f32_e32 v73, v17
+  ; GCN-NEXT:    v_exp_f32_e32 v81, v18
+  ; GCN-NEXT:    v_exp_f32_e32 v82, v19
   ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v20
   ; GCN-NEXT:    v_mul_f32_e32 v21, 0x3fb8aa3b, v21
   ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v22
-  ; GCN-NEXT:    v_exp_f32_e32 v80, v20
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v73
-  ; GCN-NEXT:    v_fma_f32 v18, s4, v24, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v81, v21
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v74
-  ; GCN-NEXT:    v_fma_f32 v20, s4, v25, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v82, v22
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v75
-  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v17
-  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
-  ; GCN-NEXT:    v_fma_f32 v26, s4, v26, -v72
-  ; GCN-NEXT:    v_pack_b32_f16 v71, v21, v22
-  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v18
-  ; GCN-NEXT:    v_sub_f32_e32 v24, v67, v72
-  ; GCN-NEXT:    v_exp_f32_e32 v83, v23
-  ; GCN-NEXT:    v_fma_f32 v67, s4, v27, -v72
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v72
+  ; GCN-NEXT:    v_fma_f32 v17, s4, v24, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v83, v20
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v73
+  ; GCN-NEXT:    v_fma_f32 v19, s4, v25, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v84, v21
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v81
+  ; GCN-NEXT:    v_fma_f32 v26, s4, v26, -v65
   ; GCN-NEXT:    v_exp_f32_e32 v85, v22
-  ; GCN-NEXT:    v_exp_f32_e32 v17, v17
-  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v24
-  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v20
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v17
-  ; GCN-NEXT:    v_fma_f32 v87, s4, v29, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v88, v23
-  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v72
-  ; GCN-NEXT:    v_pack_b32_f16 v70, v16, v19
-  ; GCN-NEXT:    ds_read_b128 v[18:21], v84
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v82
+  ; GCN-NEXT:    v_pack_b32_f16 v24, v16, v18
+  ; GCN-NEXT:    v_sub_f32_e32 v22, v67, v65
+  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
+  ; GCN-NEXT:    v_pack_b32_f16 v25, v20, v21
+  ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v17
+  ; GCN-NEXT:    v_mul_f32_e32 v21, 0x3fb8aa3b, v19
+  ; GCN-NEXT:    ds_read_b128 v[16:19], v87
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_exp_f32_e32 v16, v24
-  ; GCN-NEXT:    ds_read_b128 v[22:25], v84 offset:576
+  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v22
+  ; GCN-NEXT:    v_fma_f32 v67, s4, v27, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v86, v23
+  ; GCN-NEXT:    v_exp_f32_e32 v64, v22
+  ; GCN-NEXT:    v_mul_f32_e32 v67, 0x3fb8aa3b, v67
+  ; GCN-NEXT:    v_pk_mul_f32 v[48:49], v[48:49], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[50:51], v[50:51], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[52:53], v[52:53], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[54:55], v[54:55], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[56:57], v[56:57], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[58:59], v[58:59], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[60:61], v[60:61], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[62:63], v[62:63], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[32:33], v[32:33], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[34:35], v[34:35], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[16:17], v[24:25], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v16, 0, v72
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v76, v83
+  ; GCN-NEXT:    v_fma_f32 v88, s4, v28, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v89, v20
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v90, v84
+  ; GCN-NEXT:    v_fma_f32 v91, s4, v29, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v92, v21
+  ; GCN-NEXT:    ds_read_b128 v[20:23], v87 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_pk_mul_f32 v[48:49], v[48:49], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[50:51], v[50:51], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[52:53], v[52:53], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[54:55], v[54:55], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[56:57], v[56:57], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[58:59], v[58:59], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[60:61], v[60:61], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[62:63], v[62:63], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[32:33], v[32:33], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[34:35], v[34:35], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[36:37], v[36:37], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[38:39], v[38:39], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[40:41], v[40:41], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[42:43], v[42:43], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[44:45], v[44:45], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[46:47], v[46:47], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v18, 0, v73
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v89, v83
-  ; GCN-NEXT:    v_fma_f32 v73, s4, v28, -v72
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v80
-  ; GCN-NEXT:    v_fma_f32 v1, s4, v1, -v72
-  ; GCN-NEXT:    v_perm_b32 v90, v69, v65, s2
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v17, v18
-  ; GCN-NEXT:    v_mul_f32_e32 v18, 0x3fb8aa3b, v26
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v86, v81
-  ; GCN-NEXT:    v_fma_f32 v23, s4, v30, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v30, v18
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v82
-  ; GCN-NEXT:    v_fma_f32 v18, s4, v31, -v72
-  ; GCN-NEXT:    v_perm_b32 v31, v68, v64, s2
-  ; GCN-NEXT:    v_perm_b32 v64, v68, v64, s3
-  ; GCN-NEXT:    v_perm_b32 v65, v69, v65, s3
-  ; GCN-NEXT:    ds_read_b128 v[26:29], v91
+  ; GCN-NEXT:    v_pk_mul_f32 v[36:37], v[36:37], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[38:39], v[38:39], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[40:41], v[40:41], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[42:43], v[42:43], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[44:45], v[44:45], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[46:47], v[46:47], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_perm_b32 v99, v70, v68, s2
+  ; GCN-NEXT:    v_perm_b32 v100, v70, v68, s3
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[24:25], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v93, v73, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v26
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v94, v85
+  ; GCN-NEXT:    v_fma_f32 v95, s4, v30, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v96, v16
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v97, v86
+  ; GCN-NEXT:    v_fma_f32 v98, s4, v31, -v65
+  ; GCN-NEXT:    v_perm_b32 v101, v71, v69, s2
+  ; GCN-NEXT:    v_perm_b32 v102, v71, v69, s3
+  ; GCN-NEXT:    ds_read_b128 v[68:71], v103
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[68:71], v91 offset:576
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v103 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
-  ; GCN-NEXT:    ds_write_b32 v76, v31
-  ; GCN-NEXT:    v_mul_f32_e32 v31, 0x3fb8aa3b, v67
-  ; GCN-NEXT:    v_exp_f32_e32 v31, v31
-  ; GCN-NEXT:    v_mul_f32_e32 v67, 0x3fb8aa3b, v18
-  ; GCN-NEXT:    v_pack_b32_f16 v18, v19, v86
-  ; GCN-NEXT:    v_pack_b32_f16 v19, v22, v89
+  ; GCN-NEXT:    ds_write_b32 v77, v99
+  ; GCN-NEXT:    v_exp_f32_e32 v67, v67
+  ; GCN-NEXT:    v_pack_b32_f16 v76, v76, v90
+  ; GCN-NEXT:    v_pack_b32_f16 v77, v94, v97
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v77, v64
+  ; GCN-NEXT:    ds_write_b32 v78, v100
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v78, v90
+  ; GCN-NEXT:    ds_write_b32 v79, v101
+  ; GCN-NEXT:    v_mul_f32_e32 v78, 0x3fb8aa3b, v88
+  ; GCN-NEXT:    v_mul_f32_e32 v79, 0x3fb8aa3b, v91
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[76:77], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v81, v81, v93
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v90, v89
+  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v91, v78
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v78, v92
+  ; GCN-NEXT:    v_fma_f32 v1, s4, v1, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v93, v79
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[76:77], v[32:47]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v79, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v64, 0x3fb8aa3b, v73
-  ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v87
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v74, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v85
-  ; GCN-NEXT:    v_fma_f32 v2, s4, v2, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v22, v64
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v88
-  ; GCN-NEXT:    v_exp_f32_e32 v64, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v75, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v30
-  ; GCN-NEXT:    v_fma_f32 v24, s4, v3, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v23, v23
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v31
+  ; GCN-NEXT:    ds_write_b32 v80, v102
+  ; GCN-NEXT:    v_mul_f32_e32 v80, 0x3fb8aa3b, v95
+  ; GCN-NEXT:    v_add_f32_e32 v76, v82, v81
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v77, v96
+  ; GCN-NEXT:    v_fma_f32 v2, s4, v2, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v80, v80
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v79, v67
+  ; GCN-NEXT:    v_mul_f32_e32 v88, 0x3fb8aa3b, v98
+  ; GCN-NEXT:    v_fma_f32 v81, s4, v3, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v82, v88
   ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v0
-  ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v1
-  ; GCN-NEXT:    v_pack_b32_f16 v0, v20, v21
-  ; GCN-NEXT:    v_pack_b32_f16 v1, v18, v19
-  ; GCN-NEXT:    v_fma_f32 v6, s4, v6, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v25, v67
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[26:27], v[0:1], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v80, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v22
-  ; GCN-NEXT:    v_fma_f32 v26, s4, v4, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v27, v3
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v64
-  ; GCN-NEXT:    v_fma_f32 v67, s4, v5, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v65, v65
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[0:1], v[32:47]
+  ; GCN-NEXT:    v_mul_f32_e32 v88, 0x3fb8aa3b, v1
+  ; GCN-NEXT:    v_pack_b32_f16 v0, v90, v78
+  ; GCN-NEXT:    v_pack_b32_f16 v1, v77, v79
   ; GCN-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-  ; GCN-NEXT:    v_add_f32_e32 v17, v81, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v23
-  ; GCN-NEXT:    v_fma_f32 v7, s4, v7, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v68, v2
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v25
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v24
+  ; GCN-NEXT:    ; implicit-def: $sgpr2
+  ; GCN-NEXT:    s_nop 0
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[68:69], v[0:1], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v68, v83, v76
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v91
+  ; GCN-NEXT:    v_fma_f32 v83, s4, v4, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v90, v3
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v93
+  ; GCN-NEXT:    v_fma_f32 v94, s4, v5, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v88, v88
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[0:1], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v68, v84, v68
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v80
+  ; GCN-NEXT:    v_fma_f32 v6, s4, v6, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v72, v2
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v73, v82
+  ; GCN-NEXT:    v_pack_b32_f16 v4, v69, v4
+  ; GCN-NEXT:    v_mul_f32_e32 v69, 0x3fb8aa3b, v81
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v84
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v87
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_pack_b32_f16 v4, v18, v4
-  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v19
-  ; GCN-NEXT:    v_exp_f32_e32 v24, v24
-  ; GCN-NEXT:    ds_read_b128 v[18:21], v84 offset:576
+  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v73
+  ; GCN-NEXT:    v_fma_f32 v7, s4, v7, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v73, v69
+  ; GCN-NEXT:    ds_read_b128 v[76:79], v87 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mul_f32_e32 v26, 0x3fb8aa3b, v26
-  ; GCN-NEXT:    v_mul_f32_e32 v67, 0x3fb8aa3b, v67
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[4:5], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v82, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v27
-  ; GCN-NEXT:    v_exp_f32_e32 v26, v26
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v29, v65
-  ; GCN-NEXT:    v_fma_f32 v10, s4, v10, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v67, v67
+  ; GCN-NEXT:    v_mul_f32_e32 v69, 0x3fb8aa3b, v83
+  ; GCN-NEXT:    v_mul_f32_e32 v81, 0x3fb8aa3b, v94
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[70:71], v[4:5], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v68, v85, v68
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v70, v90
+  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v71, v69
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v88
+  ; GCN-NEXT:    v_fma_f32 v9, s4, v9, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v81, v81
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[4:5], v[32:47]
   ; GCN-NEXT:    v_mul_f32_e32 v6, 0x3fb8aa3b, v6
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[4:5], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v83, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v68
-  ; GCN-NEXT:    v_exp_f32_e32 v6, v6
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v24
+  ; GCN-NEXT:    v_add_f32_e32 v68, v86, v68
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v72
+  ; GCN-NEXT:    v_fma_f32 v10, s4, v10, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v74, v6
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v73
   ; GCN-NEXT:    v_mul_f32_e32 v7, 0x3fb8aa3b, v7
-  ; GCN-NEXT:    v_exp_f32_e32 v7, v7
-  ; GCN-NEXT:    v_pack_b32_f16 v4, v28, v29
-  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v69
-  ; GCN-NEXT:    ; implicit-def: $sgpr2
-  ; GCN-NEXT:    s_nop 1
+  ; GCN-NEXT:    v_fma_f32 v75, s4, v11, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v83, v7
+  ; GCN-NEXT:    v_pack_b32_f16 v4, v70, v69
+  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v6
+  ; GCN-NEXT:    v_mul_f32_e32 v7, 0x3fb8aa3b, v8
+  ; GCN-NEXT:    v_mul_f32_e32 v8, 0x3fb8aa3b, v9
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[4:5], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v0, v85, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v26
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v67
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[18:19], v[4:5], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v4, v88, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v89, v68
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v68, v71
+  ; GCN-NEXT:    v_fma_f32 v70, s4, v12, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v84, v7
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v85, v81
+  ; GCN-NEXT:    v_fma_f32 v86, s4, v13, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v87, v8
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[76:77], v[4:5], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v76, v92, v0
   ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v10
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v6
-  ; GCN-NEXT:    v_exp_f32_e32 v10, v0
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v7
-  ; GCN-NEXT:    v_pack_b32_f16 v1, v1, v0
-  ; GCN-NEXT:    v_pack_b32_f16 v0, v17, v28
-  ; GCN-NEXT:    s_nop 1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v2, v30, v4
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[0:1], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v0, v31, v2
-  ; GCN-NEXT:    v_add_f32_e32 v0, v22, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v64, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v23, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v25, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v27, v0
-  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v72
-  ; GCN-NEXT:    v_add_f32_e32 v0, v65, v0
-  ; GCN-NEXT:    v_fma_f32 v9, s4, v9, -v72
-  ; GCN-NEXT:    v_mul_f32_e32 v8, 0x3fb8aa3b, v8
-  ; GCN-NEXT:    v_add_f32_e32 v0, v68, v0
-  ; GCN-NEXT:    v_fma_f32 v11, s4, v11, -v72
-  ; GCN-NEXT:    v_mul_f32_e32 v9, 0x3fb8aa3b, v9
-  ; GCN-NEXT:    v_fma_f32 v12, s4, v12, -v72
-  ; GCN-NEXT:    v_fma_f32 v13, s4, v13, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v8, v8
-  ; GCN-NEXT:    v_add_f32_e32 v0, v24, v0
-  ; GCN-NEXT:    v_fma_f32 v5, s4, v14, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v9, v9
-  ; GCN-NEXT:    v_add_f32_e32 v0, v26, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v67, v0
-  ; GCN-NEXT:    v_fma_f32 v14, s4, v15, -v72
-  ; GCN-NEXT:    v_mul_f32_e32 v11, 0x3fb8aa3b, v11
-  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v12
-  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v5
-  ; GCN-NEXT:    v_add_f32_e32 v0, v6, v0
-  ; GCN-NEXT:    v_exp_f32_e32 v11, v11
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v8
-  ; GCN-NEXT:    v_exp_f32_e32 v12, v3
-  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v13
-  ; GCN-NEXT:    v_exp_f32_e32 v17, v1
-  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v14
-  ; GCN-NEXT:    v_add_f32_e32 v0, v7, v0
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v9
-  ; GCN-NEXT:    v_exp_f32_e32 v15, v3
-  ; GCN-NEXT:    v_exp_f32_e32 v18, v1
-  ; GCN-NEXT:    v_add_f32_e32 v6, v8, v0
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v91
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v74
+  ; GCN-NEXT:    v_fma_f32 v77, s4, v14, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v89, v0
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v92, v83
+  ; GCN-NEXT:    v_pack_b32_f16 v68, v68, v85
+  ; GCN-NEXT:    v_mul_f32_e32 v75, 0x3fb8aa3b, v75
+  ; GCN-NEXT:    v_mul_f32_e32 v70, 0x3fb8aa3b, v70
+  ; GCN-NEXT:    v_pack_b32_f16 v69, v69, v92
+  ; GCN-NEXT:    v_fma_f32 v65, s4, v15, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v75, v75
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[68:69], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v76, v96, v76
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v85, v84
+  ; GCN-NEXT:    v_exp_f32_e32 v92, v70
+  ; GCN-NEXT:    v_mul_f32_e32 v70, 0x3fb8aa3b, v86
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v86, v87
+  ; GCN-NEXT:    v_exp_f32_e32 v94, v70
+  ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v65
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[78:79], v[68:69], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v67, v67, v76
+  ; GCN-NEXT:    v_add_f32_e32 v67, v91, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v93, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v80, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v82, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v90, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v88, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v72, v67
+  ; GCN-NEXT:    v_mul_f32_e32 v68, 0x3fb8aa3b, v77
+  ; GCN-NEXT:    v_add_f32_e32 v67, v73, v67
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v76, v89
+  ; GCN-NEXT:    v_exp_f32_e32 v78, v68
+  ; GCN-NEXT:    v_add_f32_e32 v67, v71, v67
+  ; GCN-NEXT:    ds_read_b128 v[68:71], v103
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v10
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v11
-  ; GCN-NEXT:    v_add_f32_e32 v6, v9, v6
-  ; GCN-NEXT:    v_pack_b32_f16 v8, v4, v13
-  ; GCN-NEXT:    v_add_f32_e32 v6, v10, v6
-  ; GCN-NEXT:    v_pack_b32_f16 v9, v5, v14
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v18
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v15
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v12
-  ; GCN-NEXT:    v_add_f32_e32 v6, v11, v6
-  ; GCN-NEXT:    v_add_f32_e32 v6, v12, v6
-  ; GCN-NEXT:    v_add_f32_e32 v1, v15, v6
-  ; GCN-NEXT:    v_add_f32_e32 v11, v17, v1
-  ; GCN-NEXT:    v_pack_b32_f16 v1, v0, v7
-  ; GCN-NEXT:    v_pack_b32_f16 v0, v4, v10
-  ; GCN-NEXT:    ds_read_b128 v[4:7], v91 offset:576
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v77, v75
+  ; GCN-NEXT:    v_exp_f32_e32 v65, v65
+  ; GCN-NEXT:    v_add_f32_e32 v67, v81, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v74, v67
+  ; GCN-NEXT:    v_pack_b32_f16 v77, v76, v77
+  ; GCN-NEXT:    v_pack_b32_f16 v76, v85, v86
+  ; GCN-NEXT:    v_add_f32_e32 v67, v83, v67
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v72, v65
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v73, v94
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[68:69], v[76:77], v[48:63]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v68, v78
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v74, v92
+  ; GCN-NEXT:    v_add_f32_e32 v67, v84, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v87, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v89, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v75, v67
+  ; GCN-NEXT:    v_pack_b32_f16 v69, v68, v72
+  ; GCN-NEXT:    v_pack_b32_f16 v68, v74, v73
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v103 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v67, v92, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v94, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v78, v67
+  ; GCN-NEXT:    v_add_f32_e32 v65, v65, v67
+  ; GCN-NEXT:    ds_bpermute_b32 v67, v66, v65
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    v_add_f32_e32 v65, v65, v67
+  ; GCN-NEXT:    ds_bpermute_b32 v66, v66, v65
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_mov_b32_e32 v4, 0
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v2, v18, v11
-  ; GCN-NEXT:    ds_bpermute_b32 v3, v66, v2
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_add_f32_e32 v2, v2, v3
-  ; GCN-NEXT:    ds_bpermute_b32 v3, v66, v2
+  ; GCN-NEXT:    v_mov_b32_e32 v67, 0
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[0:1]
-  ; GCN-NEXT:    v_fmac_f32_e32 v2, v4, v16
+  ; GCN-NEXT:    v_cndmask_b32_e64 v65, v66, v65, s[0:1]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[70:71], v[68:69], v[48:63]
+  ; GCN-NEXT:    v_fmac_f32_e32 v65, v67, v64
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[68:69], v[32:47]
   ; GCN-NEXT:    s_endpgm
   attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index 22bc62acce15d..5bef205b3698e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -427,37 +427,37 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0
 ; GFX90A-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k:
 ; GFX90A-VGPR:       ; %bb.0: ; %bb
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, 0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, 1
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v5
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 2
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 1
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v9
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v8, 2
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    v_mfma_f32_4x4x4bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT:    v_mfma_f32_4x4x4bf16_1k v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX90A-VGPR-NEXT:    s_nop 4
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v5
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 2
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_bf16 v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 4
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -647,10 +647,10 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
 ; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k:
 ; GFX90A-VGPR:       ; %bb.0: ; %bb
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, 0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, 1
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v5
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 2
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 1
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v9
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v8, 2
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
@@ -665,19 +665,19 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
 ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v5
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 2
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_bf16 v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 6
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -1298,26 +1298,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v1, 64
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[16:17], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v1
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v1
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v1
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[18:19], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[12:13], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3
 ; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits:
@@ -1326,26 +1326,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 64
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[16:17], s[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v1
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], s[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[18:19], s[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0]
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (i64 274877906944 to double)), i32 0, i32 0, i32 0)
@@ -1627,7 +1627,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, 0x3ff00000
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v10, s2
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v11, s3
@@ -1645,8 +1645,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
 ; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_imm:
@@ -1655,7 +1655,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, 0x3ff00000
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v10, s2
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v11, s3
@@ -1673,8 +1673,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0)
@@ -1741,7 +1741,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v1, 0x405ec000
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v10, s2
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v11, s3
@@ -1759,8 +1759,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
 ; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
@@ -1769,7 +1769,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 0x405ec000
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v10, s2
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v11, s3
@@ -1787,8 +1787,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 123.0, double 123.0, double 123.0, double 123.0>, i32 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index ab0000f6831b6..b35314b142ede 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -3187,13 +3187,9 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], 16
-; VGPRRC-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[4:7], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], 0
-; VGPRRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s16
@@ -3214,14 +3210,14 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s9
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s10
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s12
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s13
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s14
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_endpgm
 ; AGPR-LABEL: test_mfma_i32_32x32x32_i8:
@@ -3599,13 +3595,9 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], 16
-; VGPRRC-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[4:7], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], 0
-; VGPRRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s16
@@ -3626,14 +3618,14 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s9
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s10
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s12
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s13
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s14
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_endpgm
 ; AGPR-LABEL: test_mfma_i32_32x32x32_i8__flags:
@@ -4146,33 +4138,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
-; SDAG-NEXT:    s_nop 6
-; SDAG-NEXT:    v_mov_b32_e32 v16, s20
-; SDAG-NEXT:    v_mov_b32_e32 v17, s21
-; SDAG-NEXT:    v_mov_b32_e32 v18, s22
-; SDAG-NEXT:    v_mov_b32_e32 v19, s23
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s20
+; SDAG-NEXT:    v_mov_b32_e32 v33, s21
+; SDAG-NEXT:    v_mov_b32_e32 v34, s22
+; SDAG-NEXT:    v_mov_b32_e32 v35, s23
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s16
-; SDAG-NEXT:    v_mov_b32_e32 v17, s17
-; SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s12
+; SDAG-NEXT:    v_mov_b32_e32 v33, s13
+; SDAG-NEXT:    v_mov_b32_e32 v34, s14
+; SDAG-NEXT:    v_mov_b32_e32 v35, s15
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s8
+; SDAG-NEXT:    v_mov_b32_e32 v33, s9
+; SDAG-NEXT:    v_mov_b32_e32 v34, s10
+; SDAG-NEXT:    v_mov_b32_e32 v35, s11
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -4256,33 +4247,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
 ; HEURRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; HEURRC-NEXT:    s_nop 1
 ; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
-; HEURRC-NEXT:    s_nop 6
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s20
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s21
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s22
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s23
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s20
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s21
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s22
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s23
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s16
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s17
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s18
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s19
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s16
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s17
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s18
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s19
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s12
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s13
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s14
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s15
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s12
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s13
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s14
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s15
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s8
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s9
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s10
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s11
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s8
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s9
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s10
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s11
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
@@ -4320,33 +4310,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
-; VGPRRC-NEXT:    s_nop 6
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s20
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s21
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s22
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s23
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s20
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s21
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s22
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s23
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s16
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s17
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s18
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s16
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s17
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s18
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s19
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s12
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s13
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s14
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s12
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s13
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s14
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s15
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s8
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s9
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s10
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s8
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s9
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s10
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s11
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
@@ -4523,33 +4512,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; SDAG-NEXT:    s_nop 6
-; SDAG-NEXT:    v_mov_b32_e32 v16, s20
-; SDAG-NEXT:    v_mov_b32_e32 v17, s21
-; SDAG-NEXT:    v_mov_b32_e32 v18, s22
-; SDAG-NEXT:    v_mov_b32_e32 v19, s23
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s20
+; SDAG-NEXT:    v_mov_b32_e32 v33, s21
+; SDAG-NEXT:    v_mov_b32_e32 v34, s22
+; SDAG-NEXT:    v_mov_b32_e32 v35, s23
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s16
-; SDAG-NEXT:    v_mov_b32_e32 v17, s17
-; SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s12
+; SDAG-NEXT:    v_mov_b32_e32 v33, s13
+; SDAG-NEXT:    v_mov_b32_e32 v34, s14
+; SDAG-NEXT:    v_mov_b32_e32 v35, s15
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s8
+; SDAG-NEXT:    v_mov_b32_e32 v33, s9
+; SDAG-NEXT:    v_mov_b32_e32 v34, s10
+; SDAG-NEXT:    v_mov_b32_e32 v35, s11
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -4633,33 +4621,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
 ; HEURRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; HEURRC-NEXT:    s_nop 1
 ; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; HEURRC-NEXT:    s_nop 6
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s20
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s21
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s22
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s23
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s20
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s21
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s22
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s23
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s16
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s17
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s18
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s19
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s16
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s17
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s18
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s19
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s12
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s13
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s14
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s15
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s12
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s13
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s14
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s15
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s8
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s9
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s10
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s11
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s8
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s9
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s10
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s11
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
@@ -4697,33 +4684,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; VGPRRC-NEXT:    s_nop 6
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s20
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s21
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s22
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s23
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s20
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s21
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s22
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s23
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s16
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s17
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s18
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s16
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s17
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s18
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s19
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s12
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s13
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s14
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s12
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s13
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s14
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s15
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s8
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s9
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s10
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s8
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s9
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s10
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s11
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir
new file mode 100644
index 0000000000000..271b36fad2bb4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir
@@ -0,0 +1,1292 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -stop-after=virtregrewriter,2 -amdgpu-avoid-hazard-hint-for-mfma=false %s -o - | FileCheck -check-prefix=GFX942_WITHOUT %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -stop-after=virtregrewriter,2 -amdgpu-avoid-hazard-hint-for-mfma=true %s -o - | FileCheck -check-prefix=GFX942_WITH %s
+
+--- |
+  target triple = "amdgcn-amd-amdhsa"
+
+  define amdgpu_kernel void @test_software_pipelining() #0 {
+    bb.0:
+      ret void
+  }
+
+  attributes #0 = {nounwind "amdgpu-waves-per-eu"="2"  "amdgpu-agpr-alloc"="0" "frame-pointer"="none"}
+
+...
+---
+name:            test_software_pipelining
+body:             |
+  bb.0:
+    ; GFX942_WITHOUT-LABEL: name: test_software_pipelining
+    ; GFX942_WITHOUT: renamable $vgpr115 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr109 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr110 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr76 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr108 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr52 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr100 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr111 = V_ADD_U32_e32 4096, $vgpr100, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr101 = V_ADD_U32_e32 $vgpr76, killed $vgpr52, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr112 = V_ADD_U32_e32 4096, $vgpr101, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr112, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr111, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr52_vgpr53, $vgpr80_vgpr81, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = DS_READ_B128_gfx9 renamable $vgpr108, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr54_vgpr55, $vgpr82_vgpr83, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr52_vgpr53, $vgpr92_vgpr93, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr54_vgpr55, $vgpr94_vgpr95, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr52_vgpr53, $vgpr80_vgpr81, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = DS_READ_B128_gfx9 renamable $vgpr108, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr54_vgpr55, $vgpr82_vgpr83, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr52_vgpr53, $vgpr92_vgpr93, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr54_vgpr55, $vgpr94_vgpr95, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr68_vgpr69, $vgpr80_vgpr81, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr76, killed $vgpr0, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr100, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr70_vgpr71, $vgpr82_vgpr83, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr68_vgpr69, $vgpr92_vgpr93, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr70_vgpr71, $vgpr94_vgpr95, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = DS_READ_B128_gfx9 renamable $vgpr108, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr72_vgpr73, $vgpr80_vgpr81, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr101, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr74_vgpr75, $vgpr82_vgpr83, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr72_vgpr73, $vgpr92_vgpr93, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr74_vgpr75, $vgpr94_vgpr95, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr108, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr80_vgpr81, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr72_vgpr73_vgpr74_vgpr75, 16384, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr66_vgpr67, $vgpr82_vgpr83, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr92_vgpr93, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr66_vgpr67, $vgpr94_vgpr95, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = DS_READ_B128_gfx9 renamable $vgpr108, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr80_vgpr81, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr56_vgpr57_vgpr58_vgpr59, 20480, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr82_vgpr83, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr92_vgpr93, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr94_vgpr95, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128_gfx9 renamable $vgpr108, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr48_vgpr49, $vgpr80_vgpr81, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr100, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr50_vgpr51, $vgpr82_vgpr83, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr48_vgpr49, $vgpr92_vgpr93, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr50_vgpr51, $vgpr94_vgpr95, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128_gfx9 renamable $vgpr110, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr80_vgpr81, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr101, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr26_vgpr27, killed $vgpr82_vgpr83, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr92_vgpr93, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr26_vgpr27, killed $vgpr94_vgpr95, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr120 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = DS_READ_B128_gfx9 renamable $vgpr120, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr104_vgpr105_vgpr106_vgpr107 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr104_vgpr105, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr116_vgpr117_vgpr118_vgpr119 = DS_READ_B128_gfx9 renamable $vgpr120, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr22_vgpr23, $vgpr106_vgpr107, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr100_vgpr101, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr22_vgpr23, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr104_vgpr105, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128_gfx9 renamable $vgpr120, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr18_vgpr19, $vgpr106_vgpr107, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr100_vgpr101, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr18_vgpr19, $vgpr102_vgpr103, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr116_vgpr117, $vgpr104_vgpr105, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr16 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr114 = V_ADD_U32_e32 $vgpr115, killed $vgpr16, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr114, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr118_vgpr119, $vgpr106_vgpr107, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr116_vgpr117, $vgpr100_vgpr101, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr118_vgpr119, $vgpr102_vgpr103, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = DS_READ_B128_gfx9 renamable $vgpr120, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr104_vgpr105, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr113 = V_ADD_U32_e32 $vgpr115, killed $vgpr20, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr113, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr26_vgpr27, $vgpr106_vgpr107, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr100_vgpr101, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr26_vgpr27, $vgpr102_vgpr103, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr116_vgpr117_vgpr118_vgpr119 = DS_READ_B128_gfx9 renamable $vgpr120, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr76_vgpr77, $vgpr104_vgpr105, killed $vgpr96_vgpr97_vgpr98_vgpr99, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr84_vgpr85_vgpr86_vgpr87, 24576, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr78_vgpr79, $vgpr106_vgpr107, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr76_vgpr77, $vgpr100_vgpr101, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr78_vgpr79, $vgpr102_vgpr103, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr120, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr116_vgpr117, $vgpr104_vgpr105, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr84_vgpr85_vgpr86_vgpr87, 28672, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr118_vgpr119, $vgpr106_vgpr107, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr116_vgpr117, $vgpr100_vgpr101, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr118_vgpr119, $vgpr102_vgpr103, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr118_vgpr119_vgpr120_vgpr121 = DS_READ_B128_gfx9 killed renamable $vgpr120, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr104_vgpr105, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr56 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr116 = V_ADD_U32_e32 $vgpr115, killed $vgpr56, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr116, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr106_vgpr107, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr100_vgpr101, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr72 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr115 = V_ADD_U32_e32 killed $vgpr115, killed $vgpr72, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr102_vgpr103, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr115, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: S_WAITCNT 49279
+    ; GFX942_WITHOUT-NEXT: S_BARRIER
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr108, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr118_vgpr119, $vgpr104_vgpr105, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr120_vgpr121, killed $vgpr106_vgpr107, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr118_vgpr119, $vgpr100_vgpr101, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr120_vgpr121, killed $vgpr102_vgpr103, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr108, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_BARRIER 0
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr104_vgpr105_vgpr106_vgpr107 = DS_READ_B128_gfx9 renamable $vgpr108, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr108, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr104_vgpr105, $vgpr0_vgpr1, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr96_vgpr97_vgpr98_vgpr99 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr111, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr2_vgpr3, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr104_vgpr105, $vgpr4_vgpr5, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr106_vgpr107, $vgpr6_vgpr7, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr108, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr104_vgpr105_vgpr106_vgpr107 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr112, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr108, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = DS_READ_B128_gfx9 renamable $vgpr108, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr20_vgpr21_vgpr22_vgpr23, 4096, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = DS_READ_B128_gfx9 renamable $vgpr108, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr68_vgpr69, $vgpr0_vgpr1, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr96_vgpr97_vgpr98_vgpr99 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr114, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr70_vgpr71, $vgpr2_vgpr3, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr68_vgpr69, $vgpr4_vgpr5, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr70_vgpr71, $vgpr6_vgpr7, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = DS_READ_B128_gfx9 killed renamable $vgpr110, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr0_vgpr1, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr96_vgpr97_vgpr98_vgpr99 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr113, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr86_vgpr87, killed $vgpr2_vgpr3, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr4_vgpr5, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr86_vgpr87, killed $vgpr6_vgpr7, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr92 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = DS_READ_B128_gfx9 renamable $vgpr92, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr72_vgpr73, $vgpr8_vgpr9, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = DS_READ_B128_gfx9 renamable $vgpr92, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr74_vgpr75, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr72_vgpr73, $vgpr12_vgpr13, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr74_vgpr75, $vgpr14_vgpr15, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr80_vgpr81, $vgpr8_vgpr9, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = DS_READ_B128_gfx9 renamable $vgpr92, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr82_vgpr83, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr80_vgpr81, $vgpr12_vgpr13, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr82_vgpr83, $vgpr14_vgpr15, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr8_vgpr9, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr44_vgpr45_vgpr46_vgpr47 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr116, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr86_vgpr87, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr12_vgpr13, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr86_vgpr87, $vgpr14_vgpr15, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128_gfx9 renamable $vgpr92, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr40_vgpr41, $vgpr8_vgpr9, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr115, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr42_vgpr43, $vgpr10_vgpr11, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr40_vgpr41, $vgpr12_vgpr13, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr42_vgpr43, $vgpr14_vgpr15, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128_gfx9 renamable $vgpr92, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr8_vgpr9, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr56_vgpr57_vgpr58_vgpr59, 8192, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr30_vgpr31, $vgpr10_vgpr11, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr12_vgpr13, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr30_vgpr31, $vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = DS_READ_B128_gfx9 renamable $vgpr92, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr8_vgpr9, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 killed renamable $vgpr109, killed renamable $vgpr60_vgpr61_vgpr62_vgpr63, 12288, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr26_vgpr27, $vgpr10_vgpr11, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr12_vgpr13, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr26_vgpr27, $vgpr14_vgpr15, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128_gfx9 killed renamable $vgpr92, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr8_vgpr9, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr18_vgpr19, $vgpr10_vgpr11, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr12_vgpr13, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr18_vgpr19, $vgpr14_vgpr15, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: S_WAITCNT 49279
+    ; GFX942_WITHOUT-NEXT: S_BARRIER
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = DS_READ_B128_gfx9 renamable $vgpr108, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr8_vgpr9, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr22_vgpr23, killed $vgpr10_vgpr11, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr12_vgpr13, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr22_vgpr23, killed $vgpr14_vgpr15, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = DS_READ_B128_gfx9 killed renamable $vgpr108, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_BARRIER 0
+    ; GFX942_WITHOUT-NEXT: S_ENDPGM 0
+    ;
+    ; GFX942_WITH-LABEL: name: test_software_pipelining
+    ; GFX942_WITH: renamable $vgpr96 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr121 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr122 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr52 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr120 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr97 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr123 = V_ADD_U32_e32 4096, $vgpr97, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr102 = V_ADD_U32_e32 $vgpr52, killed $vgpr0, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr124 = V_ADD_U32_e32 4096, $vgpr102, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr124, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr123, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr0_vgpr1, $vgpr80_vgpr81, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = DS_READ_B128_gfx9 renamable $vgpr120, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr2_vgpr3, $vgpr82_vgpr83, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr0_vgpr1, $vgpr92_vgpr93, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr2_vgpr3, $vgpr94_vgpr95, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr0_vgpr1, $vgpr80_vgpr81, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = DS_READ_B128_gfx9 renamable $vgpr120, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr2_vgpr3, $vgpr82_vgpr83, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr0_vgpr1, $vgpr92_vgpr93, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr2_vgpr3, $vgpr94_vgpr95, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr4_vgpr5, $vgpr80_vgpr81, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: dead renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr52, killed $vgpr0, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr97, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr6_vgpr7, $vgpr82_vgpr83, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr4_vgpr5, $vgpr92_vgpr93, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr6_vgpr7, $vgpr94_vgpr95, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr98_vgpr99_vgpr100_vgpr101 = DS_READ_B128_gfx9 renamable $vgpr120, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr80_vgpr81, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr102, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr104_vgpr105_vgpr106_vgpr107 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr90_vgpr91, $vgpr82_vgpr83, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr92_vgpr93, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr90_vgpr91, $vgpr94_vgpr95, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = DS_READ_B128_gfx9 renamable $vgpr120, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr80_vgpr81, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr108_vgpr109_vgpr110_vgpr111 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr108_vgpr109_vgpr110_vgpr111, 16384, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr116_vgpr117_vgpr118_vgpr119 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr82_vgpr83, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr92_vgpr93, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr100_vgpr101, $vgpr94_vgpr95, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr98_vgpr99_vgpr100_vgpr101 = DS_READ_B128_gfx9 renamable $vgpr120, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr80_vgpr81, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr108_vgpr109_vgpr110_vgpr111 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr108_vgpr109_vgpr110_vgpr111, 20480, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr108_vgpr109_vgpr110_vgpr111 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr90_vgpr91, $vgpr82_vgpr83, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr92_vgpr93, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr90_vgpr91, $vgpr94_vgpr95, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr112_vgpr113_vgpr114_vgpr115 = DS_READ_B128_gfx9 renamable $vgpr120, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr80_vgpr81, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr97, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr82_vgpr83, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr92_vgpr93, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr100_vgpr101, $vgpr94_vgpr95, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = DS_READ_B128_gfx9 renamable $vgpr122, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr112_vgpr113, $vgpr80_vgpr81, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr102, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr114_vgpr115, killed $vgpr82_vgpr83, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr112_vgpr113, $vgpr92_vgpr93, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr114_vgpr115, killed $vgpr94_vgpr95, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr97 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128_gfx9 renamable $vgpr97, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr112_vgpr113_vgpr114_vgpr115 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr112_vgpr113, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128_gfx9 renamable $vgpr97, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr18_vgpr19, $vgpr114_vgpr115, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr100_vgpr101, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr18_vgpr19, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr112_vgpr113, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = DS_READ_B128_gfx9 renamable $vgpr97, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr22_vgpr23, $vgpr114_vgpr115, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr100_vgpr101, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr22_vgpr23, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr112_vgpr113, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr16 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr126 = V_ADD_U32_e32 $vgpr96, killed $vgpr16, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr126, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr30_vgpr31, $vgpr114_vgpr115, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr100_vgpr101, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr30_vgpr31, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr97, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr112_vgpr113, killed $vgpr104_vgpr105_vgpr106_vgpr107, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr20 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr125 = V_ADD_U32_e32 $vgpr96, killed $vgpr20, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr125, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr58_vgpr59, $vgpr114_vgpr115, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr100_vgpr101, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr58_vgpr59, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = DS_READ_B128_gfx9 renamable $vgpr97, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr112_vgpr113, killed $vgpr116_vgpr117_vgpr118_vgpr119, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr84_vgpr85_vgpr86_vgpr87, 24576, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr114_vgpr115, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr100_vgpr101, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr102_vgpr103, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr97, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr112_vgpr113, killed $vgpr108_vgpr109_vgpr110_vgpr111, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr104_vgpr105_vgpr106_vgpr107 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr104_vgpr105_vgpr106_vgpr107, 28672, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr58_vgpr59, $vgpr114_vgpr115, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr100_vgpr101, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr58_vgpr59, $vgpr102_vgpr103, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr106_vgpr107_vgpr108_vgpr109 = DS_READ_B128_gfx9 killed renamable $vgpr97, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr112_vgpr113, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr56 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr104 = V_ADD_U32_e32 $vgpr96, killed $vgpr56, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr104, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr114_vgpr115, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr100_vgpr101, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr60 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr127 = V_ADD_U32_e32 killed $vgpr96, killed $vgpr60, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr102_vgpr103, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr127, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: S_WAITCNT 49279
+    ; GFX942_WITH-NEXT: S_BARRIER
+    ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr120, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr112_vgpr113, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr108_vgpr109, killed $vgpr114_vgpr115, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr100_vgpr101, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr108_vgpr109, killed $vgpr102_vgpr103, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr120, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_BARRIER 0
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr112_vgpr113_vgpr114_vgpr115 = DS_READ_B128_gfx9 renamable $vgpr120, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr120, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr112_vgpr113, $vgpr0_vgpr1, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr96_vgpr97_vgpr98_vgpr99 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr123, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr114_vgpr115, $vgpr2_vgpr3, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr112_vgpr113, $vgpr4_vgpr5, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr114_vgpr115, $vgpr6_vgpr7, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr120, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr106_vgpr107_vgpr108_vgpr109 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr124, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr120, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr120, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr20_vgpr21_vgpr22_vgpr23, 4096, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr120, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr106_vgpr107_vgpr108_vgpr109 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr126, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 killed renamable $vgpr122, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr106_vgpr107_vgpr108_vgpr109 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr125, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, killed $vgpr2_vgpr3, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, killed $vgpr6_vgpr7, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr105 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr105, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr8_vgpr9, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr106_vgpr107_vgpr108_vgpr109 = DS_READ_B128_gfx9 renamable $vgpr105, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr12_vgpr13, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr14_vgpr15, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr8_vgpr9, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr105, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr12_vgpr13, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr14_vgpr15, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr8_vgpr9, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr84_vgpr85_vgpr86_vgpr87 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr104, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr108_vgpr109, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr12_vgpr13, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr108_vgpr109, $vgpr14_vgpr15, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = DS_READ_B128_gfx9 renamable $vgpr105, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr8_vgpr9, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr88_vgpr89_vgpr90_vgpr91 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr127, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr10_vgpr11, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr12_vgpr13, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr14_vgpr15, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = DS_READ_B128_gfx9 renamable $vgpr105, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr8_vgpr9, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr56_vgpr57_vgpr58_vgpr59, 8192, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr86_vgpr87, $vgpr10_vgpr11, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr12_vgpr13, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr86_vgpr87, $vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = DS_READ_B128_gfx9 renamable $vgpr105, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr8_vgpr9, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 killed renamable $vgpr121, killed renamable $vgpr60_vgpr61_vgpr62_vgpr63, 12288, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr90_vgpr91, $vgpr10_vgpr11, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr12_vgpr13, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr90_vgpr91, $vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = DS_READ_B128_gfx9 killed renamable $vgpr105, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr8_vgpr9, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr58_vgpr59, $vgpr10_vgpr11, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr12_vgpr13, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr58_vgpr59, $vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: S_WAITCNT 49279
+    ; GFX942_WITH-NEXT: S_BARRIER
+    ; GFX942_WITH-NEXT: dead renamable $vgpr44_vgpr45_vgpr46_vgpr47 = DS_READ_B128_gfx9 renamable $vgpr120, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr40_vgpr41, $vgpr8_vgpr9, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr42_vgpr43, killed $vgpr10_vgpr11, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr40_vgpr41, $vgpr12_vgpr13, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr42_vgpr43, killed $vgpr14_vgpr15, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr8_vgpr9_vgpr10_vgpr11 = DS_READ_B128_gfx9 killed renamable $vgpr120, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_BARRIER 0
+    ; GFX942_WITH-NEXT: S_ENDPGM 0
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = IMPLICIT_DEF
+    %4:vgpr_32 = IMPLICIT_DEF
+    %5:sgpr_128 = IMPLICIT_DEF
+    %6:sgpr_128 = IMPLICIT_DEF
+    %7:vgpr_32 = IMPLICIT_DEF
+    %8:vreg_128_align2 = IMPLICIT_DEF
+    %9:vreg_128_align2 = IMPLICIT_DEF
+    %10:vreg_128_align2 = IMPLICIT_DEF
+    %11:vreg_128_align2 = IMPLICIT_DEF
+    %12:vreg_128_align2 = IMPLICIT_DEF
+    %13:vreg_128_align2 = IMPLICIT_DEF
+    %14:vreg_128_align2 = IMPLICIT_DEF
+    %15:vreg_128_align2 = IMPLICIT_DEF
+    %16:vreg_128_align2 = IMPLICIT_DEF
+    %17:vreg_128_align2 = IMPLICIT_DEF
+    %18:vreg_128_align2 = IMPLICIT_DEF
+    %19:vreg_128_align2 = IMPLICIT_DEF
+    %20:vreg_128_align2 = IMPLICIT_DEF
+    %21:vreg_128_align2 = IMPLICIT_DEF
+    %22:vreg_128_align2 = IMPLICIT_DEF
+    %23:vreg_128_align2 = IMPLICIT_DEF
+    %25:vgpr_32 = IMPLICIT_DEF
+    %24:vgpr_32 = V_ADD_U32_e32 4096, %25:vgpr_32, implicit $exec
+    %27:vgpr_32 = V_ADD_U32_e32 %3:vgpr_32, %7:vgpr_32, implicit $exec
+    %26:vgpr_32 = V_ADD_U32_e32 4096, %27:vgpr_32, implicit $exec
+    %28:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %26:vgpr_32, %6:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %29:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %24:vgpr_32, %6:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %31:vreg_128_align2 = IMPLICIT_DEF
+    %30:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %23:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %32:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    %33:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %30:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %34:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %22:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %35:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %34:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %37:vreg_128_align2 = IMPLICIT_DEF
+    %36:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %21:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %38:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    %39:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %36:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %40:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %20:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %41:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %40:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %42:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %19:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %43:vgpr_32 = IMPLICIT_DEF
+    %925:vgpr_32 = V_ADD_U32_e32 %3:vgpr_32, %43:vgpr_32, implicit $exec
+    %44:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %25:vgpr_32, %6:sgpr_128, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %45:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %42:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %46:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %18:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %47:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %46:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %48:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
+    %49:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %17:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %50:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %27:vgpr_32, %6:sgpr_128, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %51:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %49:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %52:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %16:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %53:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %52:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %54:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
+    %55:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %48.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %15:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %56:vreg_128_align2 = IMPLICIT_DEF
+    DS_WRITE_B128_gfx9 %1:vgpr_32, %56:vreg_128_align2, 16384, 0, implicit $exec :: (store (s128), addrspace 3)
+    %57:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %48.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %55:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %58:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %48.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %14:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %59:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %48.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %58:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %60:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
+    %61:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %54.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %13:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    DS_WRITE_B128_gfx9 %1:vgpr_32, %62:vreg_128_align2, 20480, 0, implicit $exec :: (store (s128), addrspace 3)
+    %63:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %54.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %61:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %64:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %54.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %12:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %65:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %54.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %64:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %66:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
+    %67:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %60.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %11:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %68:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %25:vgpr_32, %6:sgpr_128, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %69:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %60.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %67:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %70:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %60.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %10:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %71:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %60.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %70:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %72:vreg_128_align2 = DS_READ_B128_gfx9 %2:vgpr_32, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+    %73:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %66.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %9:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %74:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %27:vgpr_32, %6:sgpr_128, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %75:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %66.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %73:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %76:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %66.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %8:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %77:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %66.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %76:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %79:vgpr_32 = IMPLICIT_DEF
+    %78:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
+    %81:vreg_128_align2 = IMPLICIT_DEF
+    %80:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %72.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %33:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %82:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    %83:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %72.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %80:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %84:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %72.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %35:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %72.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %84:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %78.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %39:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    %89:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %78.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %87:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %90:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %78.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %41:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %91:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %78.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %90:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %92:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %82.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %45:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %94:vgpr_32 = IMPLICIT_DEF
+    %93:vgpr_32 = V_ADD_U32_e32 %0:vgpr_32, %94:vgpr_32, implicit $exec
+    %95:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %93:vgpr_32, %5:sgpr_128, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %96:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %82.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %92:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %97:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %82.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %47:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %98:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %82.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %97:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %99:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
+    %100:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %88.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %51:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %102:vgpr_32 = IMPLICIT_DEF
+    %101:vgpr_32 = V_ADD_U32_e32 %0:vgpr_32, %102:vgpr_32, implicit $exec
+    %103:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %101:vgpr_32, %5:sgpr_128, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %104:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %88.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %100:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %105:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %88.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %53:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %106:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %88.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %105:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %107:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
+    %108:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %99.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %57:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %109:vreg_128_align2 = IMPLICIT_DEF
+    DS_WRITE_B128_gfx9 %1:vgpr_32, %109:vreg_128_align2, 24576, 0, implicit $exec :: (store (s128), addrspace 3)
+    %110:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %99.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %108:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %111:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %99.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %59:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %112:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %99.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %111:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %113:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
+    %114:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %107.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %63:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %115:vreg_128_align2 = IMPLICIT_DEF
+    DS_WRITE_B128_gfx9 %1:vgpr_32, %115:vreg_128_align2, 28672, 0, implicit $exec :: (store (s128), addrspace 3)
+    %116:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %107.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %114:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %117:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %107.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %65:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %118:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %107.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %117:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %119:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
+    %120:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %113.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %69:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %122:vgpr_32 = IMPLICIT_DEF
+    %121:vgpr_32 = V_ADD_U32_e32 %0:vgpr_32, %122:vgpr_32, implicit $exec
+    %123:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %121:vgpr_32, %5:sgpr_128, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %124:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %113.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %120:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %125:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %113.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %71:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %127:vgpr_32 = IMPLICIT_DEF
+    %126:vgpr_32 = V_ADD_U32_e32 %0:vgpr_32, %127:vgpr_32, implicit $exec
+    %128:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %113.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %125:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %129:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %126:vgpr_32, %5:sgpr_128, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    S_WAITCNT 49279
+    S_BARRIER
+    %130:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
+    %131:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %119.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %75:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %132:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %119.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %131:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %133:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %119.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %77:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %134:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %119.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %133:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %135:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 512, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 512, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 512, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 512, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_BARRIER 0
+    %136:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %135.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %83:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %137:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
+    %138:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %135.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %136:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %139:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %135.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %86:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %140:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %135.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %139:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %141:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %130.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %89:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %142:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
+    %143:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %130.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %141:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %144:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %130.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %91:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %145:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %130.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %144:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %146:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %137.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %96:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %147:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %137.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %146:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %148:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %137.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %98:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %149:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %137.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %148:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %150:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
+    %151:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %142.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %104:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %152:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %142.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %151:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %153:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %142.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %106:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %154:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %142.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %153:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %155:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
+    %156:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %150.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %110:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    DS_WRITE_B128_gfx9 %1:vgpr_32, %95:vreg_128_align2, 0, 0, implicit $exec :: (store (s128), addrspace 3)
+    %157:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %150.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %156:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %158:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %150.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %112:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %159:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %150.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %158:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %160:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
+    %161:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %155.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %116:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    DS_WRITE_B128_gfx9 %1:vgpr_32, %103:vreg_128_align2, 4096, 0, implicit $exec :: (store (s128), addrspace 3)
+    %162:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %155.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %161:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %163:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %155.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %118:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %164:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %155.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %163:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %165:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
+    %166:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %160.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %124:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %981:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %24:vgpr_32, %6:sgpr_128, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %167:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %160.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %166:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %168:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %160.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %128:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %169:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %160.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %168:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %170:vreg_128_align2 = DS_READ_B128_gfx9 %2:vgpr_32, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
+    %171:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %165.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %132:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %985:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %26:vgpr_32, %6:sgpr_128, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %172:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %165.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %171:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %173:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %165.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %134:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %174:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %165.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %173:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %176:vgpr_32 = IMPLICIT_DEF
+    %175:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
+    %177:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %170.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %138:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %178:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
+    %179:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %170.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %177:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %180:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %170.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %140:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %962:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %170.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %180:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %182:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %175.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %143:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %183:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
+    %961:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %175.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %182:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %185:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %175.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %145:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %960:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %175.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %185:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %187:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %178.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %147:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %956:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %93:vgpr_32, %5:sgpr_128, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %959:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %178.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %187:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %189:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %178.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %149:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %958:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %178.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %189:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %191:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
+    %192:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %183.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %152:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %962:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %101:vgpr_32, %5:sgpr_128, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %957:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %183.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %192:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %194:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %183.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %154:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %956:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %183.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %194:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %196:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
+    %197:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %191.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %157:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    DS_WRITE_B128_gfx9 %1:vgpr_32, %123:vreg_128_align2, 8192, 0, implicit $exec :: (store (s128), addrspace 3)
+    %955:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %191.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %197:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %199:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %191.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %159:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %954:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %191.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %199:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %201:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
+    %202:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %196.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %162:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    DS_WRITE_B128_gfx9 %1:vgpr_32, %129:vreg_128_align2, 12288, 0, implicit $exec :: (store (s128), addrspace 3)
+    %953:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %196.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %202:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %204:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %196.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %164:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %952:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %196.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %204:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %206:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
+    %207:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %201.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %167:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %910:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %121:vgpr_32, %5:sgpr_128, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %951:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %201.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %207:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %209:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %201.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %169:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %950:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %201.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %209:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %911:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %126:vgpr_32, %5:sgpr_128, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    S_WAITCNT 49279
+    S_BARRIER
+    %937:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
+    %211:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %206.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %172:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %949:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %206.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %211:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %213:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %206.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %174:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %948:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %206.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %213:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %931:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 512, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 512, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 512, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 512, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_BARRIER 0
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index 7e30af96bb8b9..d9f1b542e4cb4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -799,17 +799,17 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 {
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 3
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -1155,8 +1155,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2005,21 +2005,21 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr add
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4f16:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, s4
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, s5
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, s5
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, s6
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, s7
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, s6
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, s7
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_f16 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 4
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -2395,21 +2395,21 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr
 ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16f16:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, s4
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, s5
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, s5
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, s6
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, s7
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, s6
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, s7
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_f16 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 6
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -3304,17 +3304,17 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 {
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_i32_4x4x4_16b_i8 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_i32_4x4x4_16b_i8 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 4
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x i32>, ptr addrspace(1) %arg
@@ -3494,19 +3494,19 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_k_src2_1(ptr addrspace(1)
 ;
 ; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
 ; GFX942-VGPR:       ; %bb.0:
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0x41
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_i32_4x4x4_16b_i8 v[0:3], v5, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_i32_4x4x4_16b_i8 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 3
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
   %in.1 = load <4 x i32>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> splat (i32 65), i32 1, i32 2, i32 3)
@@ -4309,7 +4309,7 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) %
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4318,9 +4318,9 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) %
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3]
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3]
 ; GFX942-VGPR-NEXT:    s_nop 3
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v6, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -5017,12 +5017,12 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(ptr addrspace(1) %arg) #0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[0:3]
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v0, v1, v[0:3]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 2
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 2.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
@@ -5542,6 +5542,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v30, v1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v31, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v1
@@ -5570,39 +5572,37 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v27, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v28, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v29, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v30, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v31, v1
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[32:33], v[30:31]
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v34, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[30:31], v[28:29]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[28:29], v[26:27]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[26:27], v[24:25]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[24:25], v[22:23]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[22:23], v[20:21]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[20:21], v[18:19]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[18:19], v[16:17]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[16:17], v[14:15]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], v[12:13]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], v[10:11]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], v[8:9]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[62:63], v[30:31]
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v64, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[60:61], v[28:29]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[58:59], v[26:27]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[56:57], v[24:25]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[54:55], v[22:23]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[52:53], v[20:21]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[50:51], v[18:19]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[48:49], v[16:17]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[46:47], v[14:15]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[44:45], v[12:13]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[42:43], v[10:11]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[40:41], v[8:9]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[38:39], v[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[36:37], v[4:5]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[34:35], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[32:33], v[0:1]
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[2:33], v0, v34, v[2:33]
+; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[32:63], v0, v64, v[32:63]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[30:33], s[0:1] offset:112
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[26:29], s[0:1] offset:96
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[22:25], s[0:1] offset:80
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[18:21], s[0:1] offset:64
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[14:17], s[0:1] offset:48
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[10:13], s[0:1] offset:32
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[60:63], s[0:1] offset:112
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[56:59], s[0:1] offset:96
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[52:55], s[0:1] offset:80
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[48:51], s[0:1] offset:64
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[44:47], s[0:1] offset:48
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[40:43], s[0:1] offset:32
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[36:39], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[32:35], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
@@ -5695,20 +5695,20 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(ptr addrspace(1) %ar
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 1.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-VGPR-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; GFX942-VGPR-NEXT:    v_lshlrev_b32_e32 v8, 4, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0x42f60000
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3]
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 2
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -5804,19 +5804,19 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(ptr addrspa
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 1.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0x42f60000
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3]
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 2
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index f0205a3a788ed..f4f1ca024b7d6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -5101,35 +5101,35 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
 ; SDAG-NEXT:    v_mov_b64_e32 v[20:21], 48
 ; SDAG-NEXT:    global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[22:23], 32
-; SDAG-NEXT:    v_mov_b64_e32 v[24:25], 16
-; SDAG-NEXT:    v_mov_b32_e32 v16, s16
-; SDAG-NEXT:    v_mov_b32_e32 v17, s17
-; SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
+; SDAG-NEXT:    v_mov_b64_e32 v[38:39], 32
+; SDAG-NEXT:    v_mov_b64_e32 v[40:41], 16
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
+; SDAG-NEXT:    global_store_dwordx4 v[38:39], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[26:27], 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
+; SDAG-NEXT:    v_mov_b64_e32 v[42:43], 0
+; SDAG-NEXT:    v_mov_b32_e32 v32, s12
+; SDAG-NEXT:    v_mov_b32_e32 v33, s13
+; SDAG-NEXT:    v_mov_b32_e32 v34, s14
+; SDAG-NEXT:    v_mov_b32_e32 v35, s15
+; SDAG-NEXT:    global_store_dwordx4 v[40:41], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s8
+; SDAG-NEXT:    v_mov_b32_e32 v33, s9
+; SDAG-NEXT:    v_mov_b32_e32 v34, s10
+; SDAG-NEXT:    v_mov_b32_e32 v35, s11
+; SDAG-NEXT:    global_store_dwordx4 v[42:43], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[38:39], v[8:11], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[36:37], v[12:15], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[40:41], v[4:7], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -5137,6 +5137,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x0
 ; GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GISEL-NEXT:    v_mov_b64_e32 v[48:49], 0
+; GISEL-NEXT:    v_mov_b64_e32 v[50:51], 16
+; GISEL-NEXT:    v_mov_b64_e32 v[52:53], 32
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[36:37]
 ; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[38:39]
@@ -5154,28 +5157,33 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
 ; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
 ; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
 ; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
-; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mov_b64_e32 v[54:55], 48
+; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
-; GISEL-NEXT:    v_mov_b64_e32 v[32:33], 0
-; GISEL-NEXT:    v_mov_b64_e32 v[34:35], 16
-; GISEL-NEXT:    v_mov_b64_e32 v[36:37], 32
-; GISEL-NEXT:    v_mov_b64_e32 v[38:39], 48
-; GISEL-NEXT:    global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
+; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[10:11]
+; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[8:9]
+; GISEL-NEXT:    v_mov_b64_e32 v[38:39], s[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[42:43], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[46:47], s[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[40:41], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[44:45], s[20:21]
+; GISEL-NEXT:    global_store_dwordx4 v[48:49], v[32:35], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[50:51], v[36:39], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[52:53], v[40:43], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[54:55], v[44:47], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[50:51], v[4:7], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[52:53], v[8:11], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[54:55], v[12:15], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_endpgm
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0)
@@ -5191,23 +5199,23 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
 ; SDAG-NEXT:    v_mov_b32_e32 v32, 42
 ; SDAG-NEXT:    v_mov_b32_e32 v33, 25
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    v_mov_b32_e32 v20, s16
-; SDAG-NEXT:    v_mov_b32_e32 v21, s17
-; SDAG-NEXT:    v_mov_b32_e32 v22, s18
-; SDAG-NEXT:    v_mov_b32_e32 v23, s19
-; SDAG-NEXT:    v_mov_b32_e32 v24, s20
-; SDAG-NEXT:    v_mov_b32_e32 v25, s21
-; SDAG-NEXT:    v_mov_b32_e32 v26, s22
-; SDAG-NEXT:    v_mov_b32_e32 v27, s23
+; SDAG-NEXT:    v_mov_b32_e32 v0, s12
+; SDAG-NEXT:    v_mov_b32_e32 v1, s13
+; SDAG-NEXT:    v_mov_b32_e32 v2, s14
+; SDAG-NEXT:    v_mov_b32_e32 v3, s15
+; SDAG-NEXT:    v_mov_b32_e32 v4, s16
+; SDAG-NEXT:    v_mov_b32_e32 v5, s17
+; SDAG-NEXT:    v_mov_b32_e32 v6, s18
+; SDAG-NEXT:    v_mov_b32_e32 v7, s19
+; SDAG-NEXT:    v_mov_b32_e32 v8, s20
+; SDAG-NEXT:    v_mov_b32_e32 v9, s21
+; SDAG-NEXT:    v_mov_b32_e32 v10, s22
+; SDAG-NEXT:    v_mov_b32_e32 v11, s23
 ; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
-; SDAG-NEXT:    v_mov_b32_e32 v28, s24
-; SDAG-NEXT:    v_mov_b32_e32 v29, s25
-; SDAG-NEXT:    v_mov_b32_e32 v30, s26
-; SDAG-NEXT:    v_mov_b32_e32 v31, s27
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
+; SDAG-NEXT:    v_mov_b32_e32 v13, s25
+; SDAG-NEXT:    v_mov_b32_e32 v14, s26
+; SDAG-NEXT:    v_mov_b32_e32 v15, s27
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
 ; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
@@ -5242,19 +5250,33 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
 ; SDAG-NEXT:    global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
+; SDAG-NEXT:    global_store_dwordx4 v[38:39], v[32:35], off sc0 sc1
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    v_mov_b32_e32 v32, s12
+; SDAG-NEXT:    v_mov_b32_e32 v33, s13
+; SDAG-NEXT:    v_mov_b32_e32 v34, s14
+; SDAG-NEXT:    v_mov_b32_e32 v35, s15
+; SDAG-NEXT:    global_store_dwordx4 v[40:41], v[32:35], off sc0 sc1
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    v_mov_b32_e32 v32, s8
+; SDAG-NEXT:    v_mov_b32_e32 v33, s9
+; SDAG-NEXT:    v_mov_b32_e32 v34, s10
+; SDAG-NEXT:    v_mov_b32_e32 v35, s11
+; SDAG-NEXT:    global_store_dwordx4 v[42:43], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[38:39], v[8:11], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[36:37], v[12:15], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[40:41], v[4:7], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -5265,6 +5287,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
 ; GISEL-NEXT:    v_mov_b32_e32 v32, 25
 ; GISEL-NEXT:    v_mov_b32_e32 v33, 42
 ; GISEL-NEXT:    v_mov_b64_e32 v[34:35], 16
+; GISEL-NEXT:    v_mov_b64_e32 v[48:49], 0
+; GISEL-NEXT:    v_mov_b64_e32 v[50:51], 16
+; GISEL-NEXT:    v_mov_b64_e32 v[52:53], 32
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[36:37]
 ; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[38:39]
@@ -5296,20 +5321,20 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
 ; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
 ; GISEL-NEXT:    global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[50:51], v[36:39], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[52:53], v[40:43], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[54:55], v[44:47], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 2
 ; GISEL-NEXT:    global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[50:51], v[4:7], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[52:53], v[8:11], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[54:55], v[12:15], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_endpgm
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
index 5475fa2ae5c6e..ef3bb0cb5f4f1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
@@ -71,9 +71,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32_vgprcd(ptr addrspace(1) %ar
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-SDAG-NEXT:    s_nop 1
-; GFX942-SDAG-NEXT:    v_mfma_f32_16x16x8_xf32 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT:    v_mfma_f32_16x16x8_xf32 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-SDAG-NEXT:    s_nop 6
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v8, v[0:3], s[6:7]
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x8xf32_vgprcd:
@@ -87,14 +87,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32_vgprcd(ptr addrspace(1) %ar
 ; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-GISEL-NEXT:    s_mov_b32 s5, 4.0
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-GISEL-NEXT:    s_nop 1
-; GFX942-GISEL-NEXT:    v_mfma_f32_16x16x8_xf32 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-GISEL-NEXT:    s_nop 5
-; GFX942-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX942-GISEL-NEXT:    v_mfma_f32_16x16x8_xf32 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT:    s_nop 6
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX942-GISEL-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
index 8803f3ae4906f..9db40d2067226 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
@@ -506,13 +506,13 @@ define void @test_rewrite_mfma_subreg_insert1(float %arg0, float %arg1, ptr addr
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[2:5]
+; CHECK-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[6:9], v0, v1, v[2:5]
 ; CHECK-NEXT:    s_nop 3
-; CHECK-NEXT:    v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; CHECK-NEXT:    v_pk_mov_b32 v[0:1], v[6:7], v[8:9] op_sel:[1,0]
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    v_accvgpr_write_b32 a0, v0
 ; CHECK-NEXT:    v_accvgpr_write_b32 a1, v1
-; CHECK-NEXT:    v_accvgpr_write_b32 a2, v3
+; CHECK-NEXT:    v_accvgpr_write_b32 a2, v9
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use a[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
@@ -711,15 +711,18 @@ define void @test_rewrite_mfma_copy_from_agpr_class_f64_4x4x4f64_chain(double %a
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mov_b32_e32 v12, v31
 ; CHECK-NEXT:    v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], v[2:3], a[0:1]
-; CHECK-NEXT:    v_and_b32_e32 v2, 0x3ff, v31
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0
-; CHECK-NEXT:    v_lshl_add_u64 v[2:3], v[8:9], 0, v[2:3]
+; CHECK-NEXT:    v_and_b32_e32 v12, 0x3ff, v12
+; CHECK-NEXT:    s_nop 2
 ; CHECK-NEXT:    v_mfma_f64_4x4x4_4b_f64 a[0:1], v[4:5], v[6:7], a[0:1]
 ; CHECK-NEXT:    s_nop 8
 ; CHECK-NEXT:    global_store_dwordx2 v[2:3], a[0:1], off
+; CHECK-NEXT:    v_lshlrev_b32_e32 v4, 3, v12
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0
+; CHECK-NEXT:    v_lshl_add_u64 v[4:5], v[8:9], 0, v[4:5]
+; CHECK-NEXT:    s_nop 5
+; CHECK-NEXT:    global_store_dwordx2 v[4:5], a[0:1], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %src2 = call double asm sideeffect "; def $0", "=a"()
diff --git a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
index a81d9a458e23a..e77856d073a0b 100644
--- a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
@@ -311,43 +311,44 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def v[12:15]
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_mov_b32_e32 v6, 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:31]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:31]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    global_store_dwordx4 v0, v[60:63], s[16:17] offset:112
+; CHECK-NEXT:    global_store_dwordx4 v6, v[60:63], s[16:17] offset:112
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[56:59], s[16:17] offset:96
+; CHECK-NEXT:    global_store_dwordx4 v6, v[56:59], s[16:17] offset:96
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[52:55], s[16:17] offset:80
+; CHECK-NEXT:    global_store_dwordx4 v6, v[52:55], s[16:17] offset:80
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[48:51], s[16:17] offset:64
+; CHECK-NEXT:    global_store_dwordx4 v6, v[48:51], s[16:17] offset:64
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[44:47], s[16:17] offset:48
+; CHECK-NEXT:    global_store_dwordx4 v6, v[44:47], s[16:17] offset:48
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[40:43], s[16:17] offset:32
+; CHECK-NEXT:    global_store_dwordx4 v6, v[40:43], s[16:17] offset:32
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[36:39], s[16:17] offset:16
+; CHECK-NEXT:    global_store_dwordx4 v6, v[36:39], s[16:17] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[32:35], s[16:17]
+; CHECK-NEXT:    global_store_dwordx4 v6, v[32:35], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[56:59], s[16:17] offset:96
+; CHECK-NEXT:    global_store_dwordx4 v6, a[56:59], s[16:17] offset:96
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[60:63], s[16:17] offset:112
+; CHECK-NEXT:    global_store_dwordx4 v6, a[60:63], s[16:17] offset:112
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[48:51], s[16:17] offset:64
+; CHECK-NEXT:    global_store_dwordx4 v6, a[48:51], s[16:17] offset:64
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[52:55], s[16:17] offset:80
+; CHECK-NEXT:    global_store_dwordx4 v6, a[52:55], s[16:17] offset:80
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[40:43], s[16:17] offset:32
+; CHECK-NEXT:    global_store_dwordx4 v6, a[40:43], s[16:17] offset:32
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[44:47], s[16:17] offset:48
+; CHECK-NEXT:    global_store_dwordx4 v6, a[44:47], s[16:17] offset:48
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[32:35], s[16:17]
+; CHECK-NEXT:    global_store_dwordx4 v6, a[32:35], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[36:39], s[16:17] offset:16
+; CHECK-NEXT:    global_store_dwordx4 v6, a[36:39], s[16:17] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx4 v0, v[8:11], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)

>From 474f9f0a85b0ad19808fa408024efd161403100f Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Fri, 19 Sep 2025 19:47:42 -0400
Subject: [PATCH 02/20] Rebase

---
 llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 6d2b10bdb5804..ed349fccfa3e4 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -48,7 +48,7 @@ static cl::opt<bool> EnableRegisterAvoidListForMFMARegs(
     "amdgpu-avoid-hazard-hint-for-mfma", cl::Hidden,
     cl::desc("Enable Register Avoidance for "
              "MFMA in GCNPreRAOptimizations stage."),
-    cl::init(true));
+    cl::init(false));
 
 namespace {
 

>From e212943ab329e33e4e22c1d616c437a2914c4ce3 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Fri, 19 Sep 2025 20:05:14 -0400
Subject: [PATCH 03/20] rebase test files

---
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir |  21 +-
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir | 542 +++++++++---------
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll | 104 ++--
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 270 ++++-----
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll  | 146 ++---
 ...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll | 159 +++--
 .../AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll    |  12 +-
 .../AMDGPU/rewrite-vgpr-mfma-to-agpr.ll       |  19 +-
 .../unspill-vgpr-after-rewrite-vgpr-mfma.ll   |  33 +-
 9 files changed, 642 insertions(+), 664 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
index 8fbfe2e591dfe..01b24dfd79941 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
@@ -15,12 +15,9 @@
   ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
   ; GCN-NEXT:    ; implicit-def: $vgpr106
   ; GCN-NEXT:    ; implicit-def: $vgpr132
-  ; GCN-NEXT:    ; implicit-def: $vgpr112
-  ; GCN-NEXT:    ; implicit-def: $vgpr113
-  ; GCN-NEXT:    ; implicit-def: $vgpr114
-  ; GCN-NEXT:    ; implicit-def: $vgpr115
   ; GCN-NEXT:    ; implicit-def: $vgpr133
   ; GCN-NEXT:    ; implicit-def: $vgpr139
+  ; GCN-NEXT:    ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127
   ; GCN-NEXT:    ; iglp_opt mask(0x00000002)
   ; GCN-NEXT:    ; implicit-def: $sgpr0
   ; GCN-NEXT:    ; implicit-def: $vgpr16
@@ -225,9 +222,6 @@
   ; GCN-NEXT:    buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:192 sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
-  ; GCN-NEXT:    v_add_u32_e32 v72, 0xc0, v93
-  ; GCN-NEXT:    v_add_u32_e32 v73, v132, v112
   ; GCN-NEXT:    buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[184:185], v[156:157], v[64:79]
   ; GCN-NEXT:    buffer_load_dwordx4 v[226:229], v227, s[8:11], 0 offen sc0 sc1
@@ -242,12 +236,15 @@
   ; GCN-NEXT:    buffer_load_dwordx2 v[162:163], v232, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v114
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
+  ; GCN-NEXT:    ; implicit-def: $vgpr74
+  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v74
+  ; GCN-NEXT:    ; implicit-def: $vgpr75
   ; GCN-NEXT:    buffer_load_dwordx2 v[100:101], v72, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    buffer_load_dwordx2 v[172:173], v233, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v115
+  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v75
   ; GCN-NEXT:    buffer_load_dwordx2 v[104:105], v72, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    buffer_load_dwordx2 v[174:175], v234, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -255,8 +252,6 @@
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
-  ; GCN-NEXT:    ; kill: killed $vgpr73
   ; GCN-NEXT:    ds_read_b128 v[72:75], v94
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[186:187], v[158:159], v[64:79]
   ; GCN-NEXT:    v_perm_b32 v238, v162, v160, s5
@@ -267,11 +262,9 @@
   ; GCN-NEXT:    ds_read_b128 v[160:163], v213
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ; kill: killed $vgpr76
   ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
   ; GCN-NEXT:    ; implicit-def: $sgpr8
-  ; GCN-NEXT:    ; implicit-def: $vgpr112
-  ; GCN-NEXT:    ; implicit-def: $vgpr113
-  ; GCN-NEXT:    ; implicit-def: $vgpr114
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
   ; GCN-NEXT:    ds_read_b128 v[72:75], v94 offset:512
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
index be97a1e82fcf2..0887fdf0844b0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
@@ -10,24 +10,25 @@
   ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
   ; GCN-NEXT:    v_readfirstlane_b32 s20, v2
   ; GCN-NEXT:    ; implicit-def: $sgpr4
-  ; GCN-NEXT:    ; implicit-def: $vgpr64
+  ; GCN-NEXT:    ; implicit-def: $vgpr3
   ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
   ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:    ; implicit-def: $vgpr76
+  ; GCN-NEXT:    ; implicit-def: $vgpr50
   ; GCN-NEXT:    ; implicit-def: $sgpr16_sgpr17_sgpr18_sgpr19
   ; GCN-NEXT:    ; implicit-def: $vgpr49
   ; GCN-NEXT:    ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43
-  ; GCN-NEXT:    ; implicit-def: $vgpr50
+  ; GCN-NEXT:    ; implicit-def: $vgpr51
+  ; GCN-NEXT:    ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65
+  ; GCN-NEXT:    ; implicit-def: $vgpr76
   ; GCN-NEXT:    ; implicit-def: $vgpr77
   ; GCN-NEXT:    ; implicit-def: $vgpr78
   ; GCN-NEXT:    ; implicit-def: $vgpr79
   ; GCN-NEXT:    ; implicit-def: $vgpr80
-  ; GCN-NEXT:    ; implicit-def: $vgpr81
-  ; GCN-NEXT:    ; implicit-def: $vgpr103
+  ; GCN-NEXT:    ; implicit-def: $vgpr91
   ; GCN-NEXT:    ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19
   ; GCN-NEXT:    ; iglp_opt mask(0x00000002)
   ; GCN-NEXT:    s_nop 1
-  ; GCN-NEXT:    v_lshl_add_u32 v2, s20, 4, v64
+  ; GCN-NEXT:    v_lshl_add_u32 v2, s20, 4, v3
   ; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s4, v2, v[0:1]
   ; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -35,9 +36,8 @@
   ; GCN-NEXT:    s_lshl_b32 s4, s20, 7
   ; GCN-NEXT:    ; implicit-def: $vgpr5
   ; GCN-NEXT:    v_add_lshl_u32 v48, v5, s4, 1
-  ; GCN-NEXT:    ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65
-  ; GCN-NEXT:    v_add_u32_e32 v77, s20, v77
-  ; GCN-NEXT:    v_and_b32_e32 v77, 0x1fffffff, v77
+  ; GCN-NEXT:    v_add_u32_e32 v76, s20, v76
+  ; GCN-NEXT:    v_and_b32_e32 v76, 0x1fffffff, v76
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    ds_write_b128 v48, v[0:3]
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -48,8 +48,8 @@
   ; GCN-NEXT:    ; implicit-def: $vgpr1
   ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
   ; GCN-NEXT:    ; implicit-def: $sgpr6
-  ; GCN-NEXT:    v_add_u32_e32 v0, v0, v76
-  ; GCN-NEXT:    v_add_u32_e32 v1, v1, v76
+  ; GCN-NEXT:    v_add_u32_e32 v0, v0, v50
+  ; GCN-NEXT:    v_add_u32_e32 v1, v1, v50
   ; GCN-NEXT:    buffer_load_dwordx2 v[72:73], v0, s[16:19], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
@@ -68,22 +68,22 @@
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], 0
   ; GCN-NEXT:    ; kill: killed $vgpr1
   ; GCN-NEXT:    ; kill: killed $vgpr0
-  ; GCN-NEXT:    v_mul_lo_u32 v77, v77, s6
-  ; GCN-NEXT:    v_add_lshl_u32 v77, v78, v77, 1
-  ; GCN-NEXT:    v_lshl_add_u32 v78, v79, 1, v77
+  ; GCN-NEXT:    v_mul_lo_u32 v76, v76, s6
+  ; GCN-NEXT:    v_add_lshl_u32 v76, v77, v76, 1
+  ; GCN-NEXT:    v_lshl_add_u32 v77, v78, 1, v76
   ; GCN-NEXT:    ; implicit-def: $sgpr5
-  ; GCN-NEXT:    v_lshl_add_u32 v79, v80, 1, v78
+  ; GCN-NEXT:    v_lshl_add_u32 v78, v79, 1, v77
   ; GCN-NEXT:    ; implicit-def: $sgpr2
   ; GCN-NEXT:    ; implicit-def: $sgpr3
-  ; GCN-NEXT:    v_lshl_add_u32 v80, v81, 1, v79
+  ; GCN-NEXT:    v_lshl_add_u32 v79, v80, 1, v78
   ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], 0
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31]
-  ; GCN-NEXT:    ds_read_b128 v[36:39], v50
+  ; GCN-NEXT:    ds_read_b128 v[36:39], v51
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15]
-  ; GCN-NEXT:    ds_read_b128 v[44:47], v50 offset:512
+  ; GCN-NEXT:    ds_read_b128 v[44:47], v51 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43
@@ -107,20 +107,20 @@
   ; GCN-NEXT:    ds_read_b128 v[40:43], v49 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[68:71], v50
+  ; GCN-NEXT:    ds_read_b128 v[68:71], v51
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[36:37], v[16:31]
   ; GCN-NEXT:    ; implicit-def: $vgpr32
   ; GCN-NEXT:    ; implicit-def: $vgpr33
-  ; GCN-NEXT:    v_add_u32_e32 v83, v32, v76
-  ; GCN-NEXT:    v_add_u32_e32 v76, v33, v76
+  ; GCN-NEXT:    v_add_u32_e32 v82, v32, v50
+  ; GCN-NEXT:    v_add_u32_e32 v83, v33, v50
+  ; GCN-NEXT:    ; kill: killed $vgpr82
   ; GCN-NEXT:    ; kill: killed $vgpr83
-  ; GCN-NEXT:    ; kill: killed $vgpr76
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[38:39], v[16:31]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[40:41], v[36:37], v[0:15]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[68:69], v[62:63], v[16:31]
-  ; GCN-NEXT:    ds_read_b128 v[66:69], v50 offset:512
+  ; GCN-NEXT:    ds_read_b128 v[66:69], v51 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
@@ -131,20 +131,20 @@
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[62:63], v[0:15]
   ; GCN-NEXT:    ; implicit-def: $vgpr66
   ; GCN-NEXT:    ; implicit-def: $vgpr67
-  ; GCN-NEXT:    v_max_f32_e32 v82, v67, v67
+  ; GCN-NEXT:    v_max_f32_e32 v81, v67, v67
   ; GCN-NEXT:    ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[70:71], v[64:65], v[16:31]
   ; GCN-NEXT:    v_perm_b32 v70, v74, v72, s2
   ; GCN-NEXT:    v_perm_b32 v71, v74, v72, s3
   ; GCN-NEXT:    v_perm_b32 v72, v75, v73, s2
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
-  ; GCN-NEXT:    ds_write_b32 v77, v70
+  ; GCN-NEXT:    ds_write_b32 v76, v70
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v78, v71
+  ; GCN-NEXT:    ds_write_b32 v77, v71
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v79, v72
+  ; GCN-NEXT:    ds_write_b32 v78, v72
   ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v20
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15]
   ; GCN-NEXT:    v_mul_f32_e32 v64, s4, v16
@@ -152,11 +152,11 @@
   ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v18
   ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v19
   ; GCN-NEXT:    v_max3_f32 v64, v64, s5, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v21
+  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v21
   ; GCN-NEXT:    v_max3_f32 v64, v64, v68, v69
   ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v22
   ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v23
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v81
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v80
   ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v24
   ; GCN-NEXT:    v_mul_f32_e32 v87, s4, v25
   ; GCN-NEXT:    v_max3_f32 v64, v64, v84, v85
@@ -166,12 +166,12 @@
   ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v28
   ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v29
   ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v68
-  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v30
+  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v30
   ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v31
   ; GCN-NEXT:    v_max3_f32 v64, v64, v69, v74
   ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v0
   ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v1
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v81, v84
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v80, v84
   ; GCN-NEXT:    v_mul_f32_e32 v87, s4, v2
   ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v3
   ; GCN-NEXT:    v_max3_f32 v64, v64, v85, v86
@@ -179,315 +179,315 @@
   ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v5
   ; GCN-NEXT:    v_max3_f32 v64, v64, v87, v65
   ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v6
-  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v7
+  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v7
   ; GCN-NEXT:    v_max3_f32 v64, v64, v68, v69
   ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v8
   ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v9
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v81
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v80
   ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v10
   ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v11
   ; GCN-NEXT:    v_max3_f32 v64, v64, v84, v85
   ; GCN-NEXT:    v_mul_f32_e32 v87, s4, v12
   ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v13
   ; GCN-NEXT:    v_max3_f32 v64, v64, v86, v65
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v87, v68
-  ; GCN-NEXT:    v_perm_b32 v68, v75, v73, s3
   ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v14
   ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v15
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v87, v68
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v69, v74
+  ; GCN-NEXT:    ds_bpermute_b32 v65, v66, v64
+  ; GCN-NEXT:    v_perm_b32 v68, v75, v73, s3
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v80, v68
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v69, v74
+  ; GCN-NEXT:    ds_write_b32 v79, v68
+  ; GCN-NEXT:    ; implicit-def: $vgpr84
+  ; GCN-NEXT:    v_max_f32_e32 v65, v65, v65
+  ; GCN-NEXT:    v_max_f32_e32 v70, v64, v65
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[64:65], v82, s[16:19], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx2 v[70:71], v76, s[16:19], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_bpermute_b32 v65, v66, v64
+  ; GCN-NEXT:    ds_bpermute_b32 v71, v66, v70
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    ; implicit-def: $vgpr87
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_max_f32_e32 v65, v65, v65
-  ; GCN-NEXT:    v_max_f32_e32 v64, v64, v65
-  ; GCN-NEXT:    ds_bpermute_b32 v65, v66, v64
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_cndmask_b32_e64 v64, v65, v64, s[0:1]
-  ; GCN-NEXT:    v_max_f32_e32 v64, v64, v64
-  ; GCN-NEXT:    v_max_f32_e32 v65, v82, v64
-  ; GCN-NEXT:    v_fma_f32 v16, s4, v16, -v65
-  ; GCN-NEXT:    v_fma_f32 v17, s4, v17, -v65
-  ; GCN-NEXT:    v_fma_f32 v18, s4, v18, -v65
-  ; GCN-NEXT:    v_fma_f32 v19, s4, v19, -v65
+  ; GCN-NEXT:    v_cndmask_b32_e64 v70, v71, v70, s[0:1]
+  ; GCN-NEXT:    v_max_f32_e32 v70, v70, v70
+  ; GCN-NEXT:    v_max_f32_e32 v72, v81, v70
+  ; GCN-NEXT:    v_fma_f32 v16, s4, v16, -v72
+  ; GCN-NEXT:    v_fma_f32 v18, s4, v18, -v72
+  ; GCN-NEXT:    v_fma_f32 v19, s4, v19, -v72
   ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v16
-  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v17
   ; GCN-NEXT:    v_mul_f32_e32 v18, 0x3fb8aa3b, v18
   ; GCN-NEXT:    v_mul_f32_e32 v19, 0x3fb8aa3b, v19
-  ; GCN-NEXT:    v_fma_f32 v20, s4, v20, -v65
-  ; GCN-NEXT:    v_fma_f32 v21, s4, v21, -v65
-  ; GCN-NEXT:    v_fma_f32 v22, s4, v22, -v65
-  ; GCN-NEXT:    v_fma_f32 v23, s4, v23, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v72, v16
-  ; GCN-NEXT:    v_exp_f32_e32 v73, v17
-  ; GCN-NEXT:    v_exp_f32_e32 v81, v18
-  ; GCN-NEXT:    v_exp_f32_e32 v82, v19
+  ; GCN-NEXT:    v_fma_f32 v17, s4, v17, -v72
+  ; GCN-NEXT:    v_fma_f32 v20, s4, v20, -v72
+  ; GCN-NEXT:    v_fma_f32 v21, s4, v21, -v72
+  ; GCN-NEXT:    v_fma_f32 v22, s4, v22, -v72
+  ; GCN-NEXT:    v_fma_f32 v23, s4, v23, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v73, v16
+  ; GCN-NEXT:    v_exp_f32_e32 v74, v18
+  ; GCN-NEXT:    v_exp_f32_e32 v75, v19
   ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v20
   ; GCN-NEXT:    v_mul_f32_e32 v21, 0x3fb8aa3b, v21
   ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v22
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v72
-  ; GCN-NEXT:    v_fma_f32 v17, s4, v24, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v83, v20
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v73
-  ; GCN-NEXT:    v_fma_f32 v19, s4, v25, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v84, v21
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v81
-  ; GCN-NEXT:    v_fma_f32 v26, s4, v26, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v85, v22
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v82
-  ; GCN-NEXT:    v_pack_b32_f16 v24, v16, v18
-  ; GCN-NEXT:    v_sub_f32_e32 v22, v67, v65
+  ; GCN-NEXT:    v_exp_f32_e32 v80, v20
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v73
+  ; GCN-NEXT:    v_fma_f32 v18, s4, v24, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v81, v21
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v74
+  ; GCN-NEXT:    v_fma_f32 v20, s4, v25, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v82, v22
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v75
+  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v17
   ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
-  ; GCN-NEXT:    v_pack_b32_f16 v25, v20, v21
-  ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v17
-  ; GCN-NEXT:    v_mul_f32_e32 v21, 0x3fb8aa3b, v19
-  ; GCN-NEXT:    ds_read_b128 v[16:19], v87
+  ; GCN-NEXT:    v_fma_f32 v26, s4, v26, -v72
+  ; GCN-NEXT:    v_pack_b32_f16 v71, v21, v22
+  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v18
+  ; GCN-NEXT:    v_sub_f32_e32 v24, v67, v72
+  ; GCN-NEXT:    v_exp_f32_e32 v83, v23
+  ; GCN-NEXT:    v_fma_f32 v67, s4, v27, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v85, v22
+  ; GCN-NEXT:    v_exp_f32_e32 v17, v17
+  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v24
+  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v20
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v17
+  ; GCN-NEXT:    v_fma_f32 v87, s4, v29, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v88, v23
+  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v72
+  ; GCN-NEXT:    v_pack_b32_f16 v70, v16, v19
+  ; GCN-NEXT:    ds_read_b128 v[18:21], v84
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v22
-  ; GCN-NEXT:    v_fma_f32 v67, s4, v27, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v86, v23
-  ; GCN-NEXT:    v_exp_f32_e32 v64, v22
-  ; GCN-NEXT:    v_mul_f32_e32 v67, 0x3fb8aa3b, v67
-  ; GCN-NEXT:    v_pk_mul_f32 v[48:49], v[48:49], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[50:51], v[50:51], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[52:53], v[52:53], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[54:55], v[54:55], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[56:57], v[56:57], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[58:59], v[58:59], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[60:61], v[60:61], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[62:63], v[62:63], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[32:33], v[32:33], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[34:35], v[34:35], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[16:17], v[24:25], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v16, 0, v72
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v76, v83
-  ; GCN-NEXT:    v_fma_f32 v88, s4, v28, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v89, v20
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v90, v84
-  ; GCN-NEXT:    v_fma_f32 v91, s4, v29, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v92, v21
-  ; GCN-NEXT:    ds_read_b128 v[20:23], v87 offset:576
+  ; GCN-NEXT:    v_exp_f32_e32 v16, v24
+  ; GCN-NEXT:    ds_read_b128 v[22:25], v84 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_pk_mul_f32 v[36:37], v[36:37], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[38:39], v[38:39], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[40:41], v[40:41], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[42:43], v[42:43], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[44:45], v[44:45], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[46:47], v[46:47], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_perm_b32 v99, v70, v68, s2
-  ; GCN-NEXT:    v_perm_b32 v100, v70, v68, s3
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[24:25], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v93, v73, v16
-  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v26
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v94, v85
-  ; GCN-NEXT:    v_fma_f32 v95, s4, v30, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v96, v16
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v97, v86
-  ; GCN-NEXT:    v_fma_f32 v98, s4, v31, -v65
-  ; GCN-NEXT:    v_perm_b32 v101, v71, v69, s2
-  ; GCN-NEXT:    v_perm_b32 v102, v71, v69, s3
-  ; GCN-NEXT:    ds_read_b128 v[68:71], v103
+  ; GCN-NEXT:    v_pk_mul_f32 v[48:49], v[48:49], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[50:51], v[50:51], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[52:53], v[52:53], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[54:55], v[54:55], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[56:57], v[56:57], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[58:59], v[58:59], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[60:61], v[60:61], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[62:63], v[62:63], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[32:33], v[32:33], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[34:35], v[34:35], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[36:37], v[36:37], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[38:39], v[38:39], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[40:41], v[40:41], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[42:43], v[42:43], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[44:45], v[44:45], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[46:47], v[46:47], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v18, 0, v73
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v89, v83
+  ; GCN-NEXT:    v_fma_f32 v73, s4, v28, -v72
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v80
+  ; GCN-NEXT:    v_fma_f32 v1, s4, v1, -v72
+  ; GCN-NEXT:    v_perm_b32 v90, v69, v65, s2
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v17, v17, v18
+  ; GCN-NEXT:    v_mul_f32_e32 v18, 0x3fb8aa3b, v26
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v86, v81
+  ; GCN-NEXT:    v_fma_f32 v23, s4, v30, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v30, v18
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v82
+  ; GCN-NEXT:    v_fma_f32 v18, s4, v31, -v72
+  ; GCN-NEXT:    v_perm_b32 v31, v68, v64, s2
+  ; GCN-NEXT:    v_perm_b32 v64, v68, v64, s3
+  ; GCN-NEXT:    v_perm_b32 v65, v69, v65, s3
+  ; GCN-NEXT:    ds_read_b128 v[26:29], v91
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[72:75], v103 offset:576
+  ; GCN-NEXT:    ds_read_b128 v[68:71], v91 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
-  ; GCN-NEXT:    ds_write_b32 v77, v99
-  ; GCN-NEXT:    v_exp_f32_e32 v67, v67
-  ; GCN-NEXT:    v_pack_b32_f16 v76, v76, v90
-  ; GCN-NEXT:    v_pack_b32_f16 v77, v94, v97
+  ; GCN-NEXT:    ds_write_b32 v76, v31
+  ; GCN-NEXT:    v_mul_f32_e32 v31, 0x3fb8aa3b, v67
+  ; GCN-NEXT:    v_exp_f32_e32 v31, v31
+  ; GCN-NEXT:    v_mul_f32_e32 v67, 0x3fb8aa3b, v18
+  ; GCN-NEXT:    v_pack_b32_f16 v18, v19, v86
+  ; GCN-NEXT:    v_pack_b32_f16 v19, v22, v89
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v78, v100
+  ; GCN-NEXT:    ds_write_b32 v77, v64
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v79, v101
-  ; GCN-NEXT:    v_mul_f32_e32 v78, 0x3fb8aa3b, v88
-  ; GCN-NEXT:    v_mul_f32_e32 v79, 0x3fb8aa3b, v91
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[76:77], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v81, v81, v93
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v90, v89
-  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v91, v78
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v78, v92
-  ; GCN-NEXT:    v_fma_f32 v1, s4, v1, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v93, v79
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[76:77], v[32:47]
+  ; GCN-NEXT:    ds_write_b32 v78, v90
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v80, v102
-  ; GCN-NEXT:    v_mul_f32_e32 v80, 0x3fb8aa3b, v95
-  ; GCN-NEXT:    v_add_f32_e32 v76, v82, v81
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v77, v96
-  ; GCN-NEXT:    v_fma_f32 v2, s4, v2, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v80, v80
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v79, v67
-  ; GCN-NEXT:    v_mul_f32_e32 v88, 0x3fb8aa3b, v98
-  ; GCN-NEXT:    v_fma_f32 v81, s4, v3, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v82, v88
+  ; GCN-NEXT:    ds_write_b32 v79, v65
+  ; GCN-NEXT:    v_mul_f32_e32 v64, 0x3fb8aa3b, v73
+  ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v87
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v17, v74, v17
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v85
+  ; GCN-NEXT:    v_fma_f32 v2, s4, v2, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v22, v64
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v88
+  ; GCN-NEXT:    v_exp_f32_e32 v64, v65
+  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v17, v75, v17
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v30
+  ; GCN-NEXT:    v_fma_f32 v24, s4, v3, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v23, v23
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v31
   ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v0
-  ; GCN-NEXT:    v_mul_f32_e32 v88, 0x3fb8aa3b, v1
-  ; GCN-NEXT:    v_pack_b32_f16 v0, v90, v78
-  ; GCN-NEXT:    v_pack_b32_f16 v1, v77, v79
+  ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v1
+  ; GCN-NEXT:    v_pack_b32_f16 v0, v20, v21
+  ; GCN-NEXT:    v_pack_b32_f16 v1, v18, v19
+  ; GCN-NEXT:    v_fma_f32 v6, s4, v6, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v25, v67
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[26:27], v[0:1], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v17, v80, v17
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v22
+  ; GCN-NEXT:    v_fma_f32 v26, s4, v4, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v27, v3
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v64
+  ; GCN-NEXT:    v_fma_f32 v67, s4, v5, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v65, v65
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[0:1], v[32:47]
   ; GCN-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+  ; GCN-NEXT:    v_add_f32_e32 v17, v81, v17
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v23
+  ; GCN-NEXT:    v_fma_f32 v7, s4, v7, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v68, v2
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v25
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    ; implicit-def: $sgpr2
-  ; GCN-NEXT:    s_nop 0
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[68:69], v[0:1], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v68, v83, v76
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v91
-  ; GCN-NEXT:    v_fma_f32 v83, s4, v4, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v90, v3
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v93
-  ; GCN-NEXT:    v_fma_f32 v94, s4, v5, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v88, v88
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[0:1], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v68, v84, v68
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v80
-  ; GCN-NEXT:    v_fma_f32 v6, s4, v6, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v72, v2
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v73, v82
-  ; GCN-NEXT:    v_pack_b32_f16 v4, v69, v4
-  ; GCN-NEXT:    v_mul_f32_e32 v69, 0x3fb8aa3b, v81
+  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v24
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v87
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v84
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v73
-  ; GCN-NEXT:    v_fma_f32 v7, s4, v7, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v73, v69
-  ; GCN-NEXT:    ds_read_b128 v[76:79], v87 offset:576
+  ; GCN-NEXT:    v_pack_b32_f16 v4, v18, v4
+  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v19
+  ; GCN-NEXT:    v_exp_f32_e32 v24, v24
+  ; GCN-NEXT:    ds_read_b128 v[18:21], v84 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mul_f32_e32 v69, 0x3fb8aa3b, v83
-  ; GCN-NEXT:    v_mul_f32_e32 v81, 0x3fb8aa3b, v94
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[70:71], v[4:5], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v68, v85, v68
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v70, v90
-  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v71, v69
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v88
-  ; GCN-NEXT:    v_fma_f32 v9, s4, v9, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v81, v81
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[4:5], v[32:47]
+  ; GCN-NEXT:    v_mul_f32_e32 v26, 0x3fb8aa3b, v26
+  ; GCN-NEXT:    v_mul_f32_e32 v67, 0x3fb8aa3b, v67
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[4:5], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v17, v82, v17
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v27
+  ; GCN-NEXT:    v_exp_f32_e32 v26, v26
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v29, v65
+  ; GCN-NEXT:    v_fma_f32 v10, s4, v10, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v67, v67
   ; GCN-NEXT:    v_mul_f32_e32 v6, 0x3fb8aa3b, v6
-  ; GCN-NEXT:    v_add_f32_e32 v68, v86, v68
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v72
-  ; GCN-NEXT:    v_fma_f32 v10, s4, v10, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v74, v6
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v73
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[4:5], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v17, v83, v17
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v68
+  ; GCN-NEXT:    v_exp_f32_e32 v6, v6
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v24
   ; GCN-NEXT:    v_mul_f32_e32 v7, 0x3fb8aa3b, v7
-  ; GCN-NEXT:    v_fma_f32 v75, s4, v11, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v83, v7
-  ; GCN-NEXT:    v_pack_b32_f16 v4, v70, v69
-  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v6
-  ; GCN-NEXT:    v_mul_f32_e32 v7, 0x3fb8aa3b, v8
-  ; GCN-NEXT:    v_mul_f32_e32 v8, 0x3fb8aa3b, v9
+  ; GCN-NEXT:    v_exp_f32_e32 v7, v7
+  ; GCN-NEXT:    v_pack_b32_f16 v4, v28, v29
+  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v69
+  ; GCN-NEXT:    ; implicit-def: $sgpr2
+  ; GCN-NEXT:    s_nop 1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[4:5], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v0, v89, v68
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v68, v71
-  ; GCN-NEXT:    v_fma_f32 v70, s4, v12, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v84, v7
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v85, v81
-  ; GCN-NEXT:    v_fma_f32 v86, s4, v13, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v87, v8
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[76:77], v[4:5], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v76, v92, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v85, v17
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v26
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v67
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[18:19], v[4:5], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v4, v88, v0
   ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v10
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v74
-  ; GCN-NEXT:    v_fma_f32 v77, s4, v14, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v89, v0
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v92, v83
-  ; GCN-NEXT:    v_pack_b32_f16 v68, v68, v85
-  ; GCN-NEXT:    v_mul_f32_e32 v75, 0x3fb8aa3b, v75
-  ; GCN-NEXT:    v_mul_f32_e32 v70, 0x3fb8aa3b, v70
-  ; GCN-NEXT:    v_pack_b32_f16 v69, v69, v92
-  ; GCN-NEXT:    v_fma_f32 v65, s4, v15, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v75, v75
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[68:69], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v76, v96, v76
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v85, v84
-  ; GCN-NEXT:    v_exp_f32_e32 v92, v70
-  ; GCN-NEXT:    v_mul_f32_e32 v70, 0x3fb8aa3b, v86
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v86, v87
-  ; GCN-NEXT:    v_exp_f32_e32 v94, v70
-  ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v65
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[78:79], v[68:69], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v67, v67, v76
-  ; GCN-NEXT:    v_add_f32_e32 v67, v91, v67
-  ; GCN-NEXT:    v_add_f32_e32 v67, v93, v67
-  ; GCN-NEXT:    v_add_f32_e32 v67, v80, v67
-  ; GCN-NEXT:    v_add_f32_e32 v67, v82, v67
-  ; GCN-NEXT:    v_add_f32_e32 v67, v90, v67
-  ; GCN-NEXT:    v_add_f32_e32 v67, v88, v67
-  ; GCN-NEXT:    v_add_f32_e32 v67, v72, v67
-  ; GCN-NEXT:    v_mul_f32_e32 v68, 0x3fb8aa3b, v77
-  ; GCN-NEXT:    v_add_f32_e32 v67, v73, v67
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v76, v89
-  ; GCN-NEXT:    v_exp_f32_e32 v78, v68
-  ; GCN-NEXT:    v_add_f32_e32 v67, v71, v67
-  ; GCN-NEXT:    ds_read_b128 v[68:71], v103
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v6
+  ; GCN-NEXT:    v_exp_f32_e32 v10, v0
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v7
+  ; GCN-NEXT:    v_pack_b32_f16 v1, v1, v0
+  ; GCN-NEXT:    v_pack_b32_f16 v0, v17, v28
+  ; GCN-NEXT:    s_nop 1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v2, v30, v4
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[0:1], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v0, v31, v2
+  ; GCN-NEXT:    v_add_f32_e32 v0, v22, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v64, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v23, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v25, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v27, v0
+  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v72
+  ; GCN-NEXT:    v_add_f32_e32 v0, v65, v0
+  ; GCN-NEXT:    v_fma_f32 v9, s4, v9, -v72
+  ; GCN-NEXT:    v_mul_f32_e32 v8, 0x3fb8aa3b, v8
+  ; GCN-NEXT:    v_add_f32_e32 v0, v68, v0
+  ; GCN-NEXT:    v_fma_f32 v11, s4, v11, -v72
+  ; GCN-NEXT:    v_mul_f32_e32 v9, 0x3fb8aa3b, v9
+  ; GCN-NEXT:    v_fma_f32 v12, s4, v12, -v72
+  ; GCN-NEXT:    v_fma_f32 v13, s4, v13, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v8, v8
+  ; GCN-NEXT:    v_add_f32_e32 v0, v24, v0
+  ; GCN-NEXT:    v_fma_f32 v5, s4, v14, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v9, v9
+  ; GCN-NEXT:    v_add_f32_e32 v0, v26, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v67, v0
+  ; GCN-NEXT:    v_fma_f32 v14, s4, v15, -v72
+  ; GCN-NEXT:    v_mul_f32_e32 v11, 0x3fb8aa3b, v11
+  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v12
+  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v5
+  ; GCN-NEXT:    v_add_f32_e32 v0, v6, v0
+  ; GCN-NEXT:    v_exp_f32_e32 v11, v11
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v8
+  ; GCN-NEXT:    v_exp_f32_e32 v12, v3
+  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v13
+  ; GCN-NEXT:    v_exp_f32_e32 v17, v1
+  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v14
+  ; GCN-NEXT:    v_add_f32_e32 v0, v7, v0
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v9
+  ; GCN-NEXT:    v_exp_f32_e32 v15, v3
+  ; GCN-NEXT:    v_exp_f32_e32 v18, v1
+  ; GCN-NEXT:    v_add_f32_e32 v6, v8, v0
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v91
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v77, v75
-  ; GCN-NEXT:    v_exp_f32_e32 v65, v65
-  ; GCN-NEXT:    v_add_f32_e32 v67, v81, v67
-  ; GCN-NEXT:    v_add_f32_e32 v67, v74, v67
-  ; GCN-NEXT:    v_pack_b32_f16 v77, v76, v77
-  ; GCN-NEXT:    v_pack_b32_f16 v76, v85, v86
-  ; GCN-NEXT:    v_add_f32_e32 v67, v83, v67
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v72, v65
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v73, v94
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[68:69], v[76:77], v[48:63]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v68, v78
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v74, v92
-  ; GCN-NEXT:    v_add_f32_e32 v67, v84, v67
-  ; GCN-NEXT:    v_add_f32_e32 v67, v87, v67
-  ; GCN-NEXT:    v_add_f32_e32 v67, v89, v67
-  ; GCN-NEXT:    v_add_f32_e32 v67, v75, v67
-  ; GCN-NEXT:    v_pack_b32_f16 v69, v68, v72
-  ; GCN-NEXT:    v_pack_b32_f16 v68, v74, v73
-  ; GCN-NEXT:    ds_read_b128 v[72:75], v103 offset:576
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v10
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v11
+  ; GCN-NEXT:    v_add_f32_e32 v6, v9, v6
+  ; GCN-NEXT:    v_pack_b32_f16 v8, v4, v13
+  ; GCN-NEXT:    v_add_f32_e32 v6, v10, v6
+  ; GCN-NEXT:    v_pack_b32_f16 v9, v5, v14
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v18
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v15
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v17
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v12
+  ; GCN-NEXT:    v_add_f32_e32 v6, v11, v6
+  ; GCN-NEXT:    v_add_f32_e32 v6, v12, v6
+  ; GCN-NEXT:    v_add_f32_e32 v1, v15, v6
+  ; GCN-NEXT:    v_add_f32_e32 v11, v17, v1
+  ; GCN-NEXT:    v_pack_b32_f16 v1, v0, v7
+  ; GCN-NEXT:    v_pack_b32_f16 v0, v4, v10
+  ; GCN-NEXT:    ds_read_b128 v[4:7], v91 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_add_f32_e32 v67, v92, v67
-  ; GCN-NEXT:    v_add_f32_e32 v67, v94, v67
-  ; GCN-NEXT:    v_add_f32_e32 v67, v78, v67
-  ; GCN-NEXT:    v_add_f32_e32 v65, v65, v67
-  ; GCN-NEXT:    ds_bpermute_b32 v67, v66, v65
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_add_f32_e32 v65, v65, v67
-  ; GCN-NEXT:    ds_bpermute_b32 v66, v66, v65
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], v[32:47]
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_mov_b32_e32 v67, 0
+  ; GCN-NEXT:    v_mov_b32_e32 v4, 0
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v2, v18, v11
+  ; GCN-NEXT:    ds_bpermute_b32 v3, v66, v2
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    v_add_f32_e32 v2, v2, v3
+  ; GCN-NEXT:    ds_bpermute_b32 v3, v66, v2
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_cndmask_b32_e64 v65, v66, v65, s[0:1]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[70:71], v[68:69], v[48:63]
-  ; GCN-NEXT:    v_fmac_f32_e32 v65, v67, v64
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[68:69], v[32:47]
+  ; GCN-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[0:1]
+  ; GCN-NEXT:    v_fmac_f32_e32 v2, v4, v16
   ; GCN-NEXT:    s_endpgm
   attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index 5bef205b3698e..ed3d1399e5926 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -427,37 +427,37 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0
 ; GFX90A-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k:
 ; GFX90A-VGPR:       ; %bb.0: ; %bb
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v9, 0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 1
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v9
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v8, 2
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, 1
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v5
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 2
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    v_mfma_f32_4x4x4bf16_1k v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT:    v_mfma_f32_4x4x4bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX90A-VGPR-NEXT:    s_nop 4
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, 0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v9
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 2
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_bf16 v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 4
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -647,10 +647,10 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
 ; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k:
 ; GFX90A-VGPR:       ; %bb.0: ; %bb
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v9, 0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 1
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v9
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v8, 2
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, 1
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v5
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 2
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
@@ -665,19 +665,19 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
 ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, 0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v9
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 2
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_bf16 v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 6
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -1298,26 +1298,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v1, 64
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[16:17], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v1
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[18:19], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[12:13], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v1
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3
 ; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits:
@@ -1326,26 +1326,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 64
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[16:17], s[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v1
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[18:19], s[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], s[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0]
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (i64 274877906944 to double)), i32 0, i32 0, i32 0)
@@ -1645,8 +1645,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
 ; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_imm:
@@ -1673,8 +1673,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0)
@@ -1759,8 +1759,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
 ; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
@@ -1787,8 +1787,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 123.0, double 123.0, double 123.0, double 123.0>, i32 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index b35314b142ede..ab0000f6831b6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -3187,9 +3187,13 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[4:7], off sc0 sc1
+; VGPRRC-NEXT:    s_nop 0
+; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], 16
+; VGPRRC-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    s_nop 0
+; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], 0
+; VGPRRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s16
@@ -3210,14 +3214,14 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s9
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s10
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s12
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s13
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s14
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_endpgm
 ; AGPR-LABEL: test_mfma_i32_32x32x32_i8:
@@ -3595,9 +3599,13 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[4:7], off sc0 sc1
+; VGPRRC-NEXT:    s_nop 0
+; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], 16
+; VGPRRC-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    s_nop 0
+; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], 0
+; VGPRRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s16
@@ -3618,14 +3626,14 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s9
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s10
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s12
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s13
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s14
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_endpgm
 ; AGPR-LABEL: test_mfma_i32_32x32x32_i8__flags:
@@ -4138,32 +4146,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
-; SDAG-NEXT:    v_mov_b32_e32 v32, s20
-; SDAG-NEXT:    v_mov_b32_e32 v33, s21
-; SDAG-NEXT:    v_mov_b32_e32 v34, s22
-; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    v_mov_b32_e32 v16, s20
+; SDAG-NEXT:    v_mov_b32_e32 v17, s21
+; SDAG-NEXT:    v_mov_b32_e32 v18, s22
+; SDAG-NEXT:    v_mov_b32_e32 v19, s23
+; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v32, s16
-; SDAG-NEXT:    v_mov_b32_e32 v33, s17
-; SDAG-NEXT:    v_mov_b32_e32 v34, s18
-; SDAG-NEXT:    v_mov_b32_e32 v35, s19
-; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v16, s16
+; SDAG-NEXT:    v_mov_b32_e32 v17, s17
+; SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v32, s12
-; SDAG-NEXT:    v_mov_b32_e32 v33, s13
-; SDAG-NEXT:    v_mov_b32_e32 v34, s14
-; SDAG-NEXT:    v_mov_b32_e32 v35, s15
-; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v16, s12
+; SDAG-NEXT:    v_mov_b32_e32 v17, s13
+; SDAG-NEXT:    v_mov_b32_e32 v18, s14
+; SDAG-NEXT:    v_mov_b32_e32 v19, s15
+; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v32, s8
-; SDAG-NEXT:    v_mov_b32_e32 v33, s9
-; SDAG-NEXT:    v_mov_b32_e32 v34, s10
-; SDAG-NEXT:    v_mov_b32_e32 v35, s11
-; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v16, s8
+; SDAG-NEXT:    v_mov_b32_e32 v17, s9
+; SDAG-NEXT:    v_mov_b32_e32 v18, s10
+; SDAG-NEXT:    v_mov_b32_e32 v19, s11
+; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -4247,32 +4256,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
 ; HEURRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; HEURRC-NEXT:    s_nop 1
 ; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
-; HEURRC-NEXT:    v_mov_b32_e32 v32, s20
-; HEURRC-NEXT:    v_mov_b32_e32 v33, s21
-; HEURRC-NEXT:    v_mov_b32_e32 v34, s22
-; HEURRC-NEXT:    v_mov_b32_e32 v35, s23
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT:    s_nop 6
+; HEURRC-NEXT:    v_mov_b32_e32 v16, s20
+; HEURRC-NEXT:    v_mov_b32_e32 v17, s21
+; HEURRC-NEXT:    v_mov_b32_e32 v18, s22
+; HEURRC-NEXT:    v_mov_b32_e32 v19, s23
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v32, s16
-; HEURRC-NEXT:    v_mov_b32_e32 v33, s17
-; HEURRC-NEXT:    v_mov_b32_e32 v34, s18
-; HEURRC-NEXT:    v_mov_b32_e32 v35, s19
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v16, s16
+; HEURRC-NEXT:    v_mov_b32_e32 v17, s17
+; HEURRC-NEXT:    v_mov_b32_e32 v18, s18
+; HEURRC-NEXT:    v_mov_b32_e32 v19, s19
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v32, s12
-; HEURRC-NEXT:    v_mov_b32_e32 v33, s13
-; HEURRC-NEXT:    v_mov_b32_e32 v34, s14
-; HEURRC-NEXT:    v_mov_b32_e32 v35, s15
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v16, s12
+; HEURRC-NEXT:    v_mov_b32_e32 v17, s13
+; HEURRC-NEXT:    v_mov_b32_e32 v18, s14
+; HEURRC-NEXT:    v_mov_b32_e32 v19, s15
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v32, s8
-; HEURRC-NEXT:    v_mov_b32_e32 v33, s9
-; HEURRC-NEXT:    v_mov_b32_e32 v34, s10
-; HEURRC-NEXT:    v_mov_b32_e32 v35, s11
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v16, s8
+; HEURRC-NEXT:    v_mov_b32_e32 v17, s9
+; HEURRC-NEXT:    v_mov_b32_e32 v18, s10
+; HEURRC-NEXT:    v_mov_b32_e32 v19, s11
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
@@ -4310,32 +4320,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
-; VGPRRC-NEXT:    v_mov_b32_e32 v32, s20
-; VGPRRC-NEXT:    v_mov_b32_e32 v33, s21
-; VGPRRC-NEXT:    v_mov_b32_e32 v34, s22
-; VGPRRC-NEXT:    v_mov_b32_e32 v35, s23
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT:    s_nop 6
+; VGPRRC-NEXT:    v_mov_b32_e32 v16, s20
+; VGPRRC-NEXT:    v_mov_b32_e32 v17, s21
+; VGPRRC-NEXT:    v_mov_b32_e32 v18, s22
+; VGPRRC-NEXT:    v_mov_b32_e32 v19, s23
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v32, s16
-; VGPRRC-NEXT:    v_mov_b32_e32 v33, s17
-; VGPRRC-NEXT:    v_mov_b32_e32 v34, s18
-; VGPRRC-NEXT:    v_mov_b32_e32 v35, s19
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v16, s16
+; VGPRRC-NEXT:    v_mov_b32_e32 v17, s17
+; VGPRRC-NEXT:    v_mov_b32_e32 v18, s18
+; VGPRRC-NEXT:    v_mov_b32_e32 v19, s19
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v32, s12
-; VGPRRC-NEXT:    v_mov_b32_e32 v33, s13
-; VGPRRC-NEXT:    v_mov_b32_e32 v34, s14
-; VGPRRC-NEXT:    v_mov_b32_e32 v35, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v16, s12
+; VGPRRC-NEXT:    v_mov_b32_e32 v17, s13
+; VGPRRC-NEXT:    v_mov_b32_e32 v18, s14
+; VGPRRC-NEXT:    v_mov_b32_e32 v19, s15
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v32, s8
-; VGPRRC-NEXT:    v_mov_b32_e32 v33, s9
-; VGPRRC-NEXT:    v_mov_b32_e32 v34, s10
-; VGPRRC-NEXT:    v_mov_b32_e32 v35, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v16, s8
+; VGPRRC-NEXT:    v_mov_b32_e32 v17, s9
+; VGPRRC-NEXT:    v_mov_b32_e32 v18, s10
+; VGPRRC-NEXT:    v_mov_b32_e32 v19, s11
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
@@ -4512,32 +4523,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; SDAG-NEXT:    v_mov_b32_e32 v32, s20
-; SDAG-NEXT:    v_mov_b32_e32 v33, s21
-; SDAG-NEXT:    v_mov_b32_e32 v34, s22
-; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    v_mov_b32_e32 v16, s20
+; SDAG-NEXT:    v_mov_b32_e32 v17, s21
+; SDAG-NEXT:    v_mov_b32_e32 v18, s22
+; SDAG-NEXT:    v_mov_b32_e32 v19, s23
+; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v32, s16
-; SDAG-NEXT:    v_mov_b32_e32 v33, s17
-; SDAG-NEXT:    v_mov_b32_e32 v34, s18
-; SDAG-NEXT:    v_mov_b32_e32 v35, s19
-; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v16, s16
+; SDAG-NEXT:    v_mov_b32_e32 v17, s17
+; SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v32, s12
-; SDAG-NEXT:    v_mov_b32_e32 v33, s13
-; SDAG-NEXT:    v_mov_b32_e32 v34, s14
-; SDAG-NEXT:    v_mov_b32_e32 v35, s15
-; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v16, s12
+; SDAG-NEXT:    v_mov_b32_e32 v17, s13
+; SDAG-NEXT:    v_mov_b32_e32 v18, s14
+; SDAG-NEXT:    v_mov_b32_e32 v19, s15
+; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v32, s8
-; SDAG-NEXT:    v_mov_b32_e32 v33, s9
-; SDAG-NEXT:    v_mov_b32_e32 v34, s10
-; SDAG-NEXT:    v_mov_b32_e32 v35, s11
-; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v16, s8
+; SDAG-NEXT:    v_mov_b32_e32 v17, s9
+; SDAG-NEXT:    v_mov_b32_e32 v18, s10
+; SDAG-NEXT:    v_mov_b32_e32 v19, s11
+; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -4621,32 +4633,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
 ; HEURRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; HEURRC-NEXT:    s_nop 1
 ; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; HEURRC-NEXT:    v_mov_b32_e32 v32, s20
-; HEURRC-NEXT:    v_mov_b32_e32 v33, s21
-; HEURRC-NEXT:    v_mov_b32_e32 v34, s22
-; HEURRC-NEXT:    v_mov_b32_e32 v35, s23
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT:    s_nop 6
+; HEURRC-NEXT:    v_mov_b32_e32 v16, s20
+; HEURRC-NEXT:    v_mov_b32_e32 v17, s21
+; HEURRC-NEXT:    v_mov_b32_e32 v18, s22
+; HEURRC-NEXT:    v_mov_b32_e32 v19, s23
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v32, s16
-; HEURRC-NEXT:    v_mov_b32_e32 v33, s17
-; HEURRC-NEXT:    v_mov_b32_e32 v34, s18
-; HEURRC-NEXT:    v_mov_b32_e32 v35, s19
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v16, s16
+; HEURRC-NEXT:    v_mov_b32_e32 v17, s17
+; HEURRC-NEXT:    v_mov_b32_e32 v18, s18
+; HEURRC-NEXT:    v_mov_b32_e32 v19, s19
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v32, s12
-; HEURRC-NEXT:    v_mov_b32_e32 v33, s13
-; HEURRC-NEXT:    v_mov_b32_e32 v34, s14
-; HEURRC-NEXT:    v_mov_b32_e32 v35, s15
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v16, s12
+; HEURRC-NEXT:    v_mov_b32_e32 v17, s13
+; HEURRC-NEXT:    v_mov_b32_e32 v18, s14
+; HEURRC-NEXT:    v_mov_b32_e32 v19, s15
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v32, s8
-; HEURRC-NEXT:    v_mov_b32_e32 v33, s9
-; HEURRC-NEXT:    v_mov_b32_e32 v34, s10
-; HEURRC-NEXT:    v_mov_b32_e32 v35, s11
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v16, s8
+; HEURRC-NEXT:    v_mov_b32_e32 v17, s9
+; HEURRC-NEXT:    v_mov_b32_e32 v18, s10
+; HEURRC-NEXT:    v_mov_b32_e32 v19, s11
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
@@ -4684,32 +4697,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; VGPRRC-NEXT:    v_mov_b32_e32 v32, s20
-; VGPRRC-NEXT:    v_mov_b32_e32 v33, s21
-; VGPRRC-NEXT:    v_mov_b32_e32 v34, s22
-; VGPRRC-NEXT:    v_mov_b32_e32 v35, s23
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT:    s_nop 6
+; VGPRRC-NEXT:    v_mov_b32_e32 v16, s20
+; VGPRRC-NEXT:    v_mov_b32_e32 v17, s21
+; VGPRRC-NEXT:    v_mov_b32_e32 v18, s22
+; VGPRRC-NEXT:    v_mov_b32_e32 v19, s23
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v32, s16
-; VGPRRC-NEXT:    v_mov_b32_e32 v33, s17
-; VGPRRC-NEXT:    v_mov_b32_e32 v34, s18
-; VGPRRC-NEXT:    v_mov_b32_e32 v35, s19
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v16, s16
+; VGPRRC-NEXT:    v_mov_b32_e32 v17, s17
+; VGPRRC-NEXT:    v_mov_b32_e32 v18, s18
+; VGPRRC-NEXT:    v_mov_b32_e32 v19, s19
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v32, s12
-; VGPRRC-NEXT:    v_mov_b32_e32 v33, s13
-; VGPRRC-NEXT:    v_mov_b32_e32 v34, s14
-; VGPRRC-NEXT:    v_mov_b32_e32 v35, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v16, s12
+; VGPRRC-NEXT:    v_mov_b32_e32 v17, s13
+; VGPRRC-NEXT:    v_mov_b32_e32 v18, s14
+; VGPRRC-NEXT:    v_mov_b32_e32 v19, s15
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v32, s8
-; VGPRRC-NEXT:    v_mov_b32_e32 v33, s9
-; VGPRRC-NEXT:    v_mov_b32_e32 v34, s10
-; VGPRRC-NEXT:    v_mov_b32_e32 v35, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v16, s8
+; VGPRRC-NEXT:    v_mov_b32_e32 v17, s9
+; VGPRRC-NEXT:    v_mov_b32_e32 v18, s10
+; VGPRRC-NEXT:    v_mov_b32_e32 v19, s11
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index d9f1b542e4cb4..7e30af96bb8b9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -799,17 +799,17 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 {
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 3
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -1155,8 +1155,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2005,21 +2005,21 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr add
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4f16:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, s4
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, s5
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, s4
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, s5
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, s6
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, s7
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, s6
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, s7
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_f16 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 4
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -2395,21 +2395,21 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr
 ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16f16:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, s4
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, s5
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, s4
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, s5
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, s6
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, s7
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, s6
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, s7
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_f16 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 6
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -3304,17 +3304,17 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 {
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_i32_4x4x4_16b_i8 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_i32_4x4x4_16b_i8 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 4
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x i32>, ptr addrspace(1) %arg
@@ -3494,19 +3494,19 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_k_src2_1(ptr addrspace(1)
 ;
 ; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
 ; GFX942-VGPR:       ; %bb.0:
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 1
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0x41
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_i32_4x4x4_16b_i8 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_i32_4x4x4_16b_i8 v[0:3], v5, v6, v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 3
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
   %in.1 = load <4 x i32>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> splat (i32 65), i32 1, i32 2, i32 3)
@@ -4309,7 +4309,7 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) %
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4318,9 +4318,9 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) %
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3]
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3]
 ; GFX942-VGPR-NEXT:    s_nop 3
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v6, v[0:3], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -5017,12 +5017,12 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(ptr addrspace(1) %arg) #0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v0, v1, v[0:3]
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[0:3]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 2
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 2.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
@@ -5542,8 +5542,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v30, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v31, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v1
@@ -5572,37 +5570,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v27, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v28, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v29, v1
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[62:63], v[30:31]
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v64, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[60:61], v[28:29]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[58:59], v[26:27]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[56:57], v[24:25]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[54:55], v[22:23]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[52:53], v[20:21]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[50:51], v[18:19]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[48:49], v[16:17]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[46:47], v[14:15]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[44:45], v[12:13]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[42:43], v[10:11]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[40:41], v[8:9]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[38:39], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[36:37], v[4:5]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[34:35], v[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[32:33], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v30, v1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v31, v1
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[32:33], v[30:31]
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v34, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[30:31], v[28:29]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[28:29], v[26:27]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[26:27], v[24:25]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[24:25], v[22:23]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[22:23], v[20:21]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[20:21], v[18:19]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[18:19], v[16:17]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[16:17], v[14:15]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], v[12:13]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], v[10:11]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], v[8:9]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[32:63], v0, v64, v[32:63]
+; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[2:33], v0, v34, v[2:33]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[60:63], s[0:1] offset:112
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[56:59], s[0:1] offset:96
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[52:55], s[0:1] offset:80
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[48:51], s[0:1] offset:64
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[44:47], s[0:1] offset:48
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[40:43], s[0:1] offset:32
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[36:39], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[32:35], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[30:33], s[0:1] offset:112
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[26:29], s[0:1] offset:96
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[22:25], s[0:1] offset:80
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[18:21], s[0:1] offset:64
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[14:17], s[0:1] offset:48
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[10:13], s[0:1] offset:32
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[6:9], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[2:5], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
@@ -5695,20 +5695,20 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(ptr addrspace(1) %ar
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 1.0
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-VGPR-NEXT:    v_lshlrev_b32_e32 v8, 4, v0
+; GFX942-VGPR-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0x42f60000
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2.0
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3]
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 2
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -5804,19 +5804,19 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(ptr addrspa
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 1.0
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0x42f60000
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3]
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 2
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index f4f1ca024b7d6..f0205a3a788ed 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -5101,35 +5101,35 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
 ; SDAG-NEXT:    v_mov_b64_e32 v[20:21], 48
 ; SDAG-NEXT:    global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[38:39], 32
-; SDAG-NEXT:    v_mov_b64_e32 v[40:41], 16
-; SDAG-NEXT:    v_mov_b32_e32 v32, s16
-; SDAG-NEXT:    v_mov_b32_e32 v33, s17
-; SDAG-NEXT:    v_mov_b32_e32 v34, s18
-; SDAG-NEXT:    v_mov_b32_e32 v35, s19
-; SDAG-NEXT:    global_store_dwordx4 v[38:39], v[32:35], off sc0 sc1
+; SDAG-NEXT:    v_mov_b64_e32 v[22:23], 32
+; SDAG-NEXT:    v_mov_b64_e32 v[24:25], 16
+; SDAG-NEXT:    v_mov_b32_e32 v16, s16
+; SDAG-NEXT:    v_mov_b32_e32 v17, s17
+; SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; SDAG-NEXT:    global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[42:43], 0
-; SDAG-NEXT:    v_mov_b32_e32 v32, s12
-; SDAG-NEXT:    v_mov_b32_e32 v33, s13
-; SDAG-NEXT:    v_mov_b32_e32 v34, s14
-; SDAG-NEXT:    v_mov_b32_e32 v35, s15
-; SDAG-NEXT:    global_store_dwordx4 v[40:41], v[32:35], off sc0 sc1
+; SDAG-NEXT:    v_mov_b64_e32 v[26:27], 0
+; SDAG-NEXT:    v_mov_b32_e32 v16, s12
+; SDAG-NEXT:    v_mov_b32_e32 v17, s13
+; SDAG-NEXT:    v_mov_b32_e32 v18, s14
+; SDAG-NEXT:    v_mov_b32_e32 v19, s15
+; SDAG-NEXT:    global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v32, s8
-; SDAG-NEXT:    v_mov_b32_e32 v33, s9
-; SDAG-NEXT:    v_mov_b32_e32 v34, s10
-; SDAG-NEXT:    v_mov_b32_e32 v35, s11
-; SDAG-NEXT:    global_store_dwordx4 v[42:43], v[32:35], off sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v16, s8
+; SDAG-NEXT:    v_mov_b32_e32 v17, s9
+; SDAG-NEXT:    v_mov_b32_e32 v18, s10
+; SDAG-NEXT:    v_mov_b32_e32 v19, s11
+; SDAG-NEXT:    global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[38:39], v[8:11], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[36:37], v[12:15], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[40:41], v[4:7], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -5137,9 +5137,6 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x0
 ; GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
-; GISEL-NEXT:    v_mov_b64_e32 v[48:49], 0
-; GISEL-NEXT:    v_mov_b64_e32 v[50:51], 16
-; GISEL-NEXT:    v_mov_b64_e32 v[52:53], 32
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[36:37]
 ; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[38:39]
@@ -5157,33 +5154,28 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
 ; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
 ; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
 ; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
-; GISEL-NEXT:    v_mov_b64_e32 v[54:55], 48
-; GISEL-NEXT:    s_nop 0
+; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
-; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[10:11]
-; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[8:9]
-; GISEL-NEXT:    v_mov_b64_e32 v[38:39], s[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[42:43], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[46:47], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[40:41], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[44:45], s[20:21]
-; GISEL-NEXT:    global_store_dwordx4 v[48:49], v[32:35], off sc0 sc1
+; GISEL-NEXT:    v_mov_b64_e32 v[32:33], 0
+; GISEL-NEXT:    v_mov_b64_e32 v[34:35], 16
+; GISEL-NEXT:    v_mov_b64_e32 v[36:37], 32
+; GISEL-NEXT:    v_mov_b64_e32 v[38:39], 48
+; GISEL-NEXT:    global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[50:51], v[36:39], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[52:53], v[40:43], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[54:55], v[44:47], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    s_nop 3
-; GISEL-NEXT:    global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[50:51], v[4:7], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[52:53], v[8:11], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[54:55], v[12:15], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_endpgm
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0)
@@ -5199,23 +5191,23 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
 ; SDAG-NEXT:    v_mov_b32_e32 v32, 42
 ; SDAG-NEXT:    v_mov_b32_e32 v33, 25
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v0, s12
-; SDAG-NEXT:    v_mov_b32_e32 v1, s13
-; SDAG-NEXT:    v_mov_b32_e32 v2, s14
-; SDAG-NEXT:    v_mov_b32_e32 v3, s15
-; SDAG-NEXT:    v_mov_b32_e32 v4, s16
-; SDAG-NEXT:    v_mov_b32_e32 v5, s17
-; SDAG-NEXT:    v_mov_b32_e32 v6, s18
-; SDAG-NEXT:    v_mov_b32_e32 v7, s19
-; SDAG-NEXT:    v_mov_b32_e32 v8, s20
-; SDAG-NEXT:    v_mov_b32_e32 v9, s21
-; SDAG-NEXT:    v_mov_b32_e32 v10, s22
-; SDAG-NEXT:    v_mov_b32_e32 v11, s23
+; SDAG-NEXT:    v_mov_b32_e32 v16, s12
+; SDAG-NEXT:    v_mov_b32_e32 v17, s13
+; SDAG-NEXT:    v_mov_b32_e32 v18, s14
+; SDAG-NEXT:    v_mov_b32_e32 v19, s15
+; SDAG-NEXT:    v_mov_b32_e32 v20, s16
+; SDAG-NEXT:    v_mov_b32_e32 v21, s17
+; SDAG-NEXT:    v_mov_b32_e32 v22, s18
+; SDAG-NEXT:    v_mov_b32_e32 v23, s19
+; SDAG-NEXT:    v_mov_b32_e32 v24, s20
+; SDAG-NEXT:    v_mov_b32_e32 v25, s21
+; SDAG-NEXT:    v_mov_b32_e32 v26, s22
+; SDAG-NEXT:    v_mov_b32_e32 v27, s23
 ; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v13, s25
-; SDAG-NEXT:    v_mov_b32_e32 v14, s26
-; SDAG-NEXT:    v_mov_b32_e32 v15, s27
+; SDAG-NEXT:    v_mov_b32_e32 v28, s24
+; SDAG-NEXT:    v_mov_b32_e32 v29, s25
+; SDAG-NEXT:    v_mov_b32_e32 v30, s26
+; SDAG-NEXT:    v_mov_b32_e32 v31, s27
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
 ; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
@@ -5250,33 +5242,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
 ; SDAG-NEXT:    global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v32, s16
-; SDAG-NEXT:    v_mov_b32_e32 v33, s17
-; SDAG-NEXT:    v_mov_b32_e32 v34, s18
-; SDAG-NEXT:    v_mov_b32_e32 v35, s19
-; SDAG-NEXT:    global_store_dwordx4 v[38:39], v[32:35], off sc0 sc1
-; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v32, s12
-; SDAG-NEXT:    v_mov_b32_e32 v33, s13
-; SDAG-NEXT:    v_mov_b32_e32 v34, s14
-; SDAG-NEXT:    v_mov_b32_e32 v35, s15
-; SDAG-NEXT:    global_store_dwordx4 v[40:41], v[32:35], off sc0 sc1
-; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v32, s8
-; SDAG-NEXT:    v_mov_b32_e32 v33, s9
-; SDAG-NEXT:    v_mov_b32_e32 v34, s10
-; SDAG-NEXT:    v_mov_b32_e32 v35, s11
-; SDAG-NEXT:    global_store_dwordx4 v[42:43], v[32:35], off sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v16, s8
+; SDAG-NEXT:    v_mov_b32_e32 v17, s9
+; SDAG-NEXT:    v_mov_b32_e32 v18, s10
+; SDAG-NEXT:    v_mov_b32_e32 v19, s11
+; SDAG-NEXT:    global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[38:39], v[8:11], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[36:37], v[12:15], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[40:41], v[4:7], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -5287,9 +5265,6 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
 ; GISEL-NEXT:    v_mov_b32_e32 v32, 25
 ; GISEL-NEXT:    v_mov_b32_e32 v33, 42
 ; GISEL-NEXT:    v_mov_b64_e32 v[34:35], 16
-; GISEL-NEXT:    v_mov_b64_e32 v[48:49], 0
-; GISEL-NEXT:    v_mov_b64_e32 v[50:51], 16
-; GISEL-NEXT:    v_mov_b64_e32 v[52:53], 32
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[36:37]
 ; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[38:39]
@@ -5321,20 +5296,20 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
 ; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
 ; GISEL-NEXT:    global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[50:51], v[36:39], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[52:53], v[40:43], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[54:55], v[44:47], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 2
 ; GISEL-NEXT:    global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[50:51], v[4:7], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[52:53], v[8:11], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[54:55], v[12:15], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_endpgm
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
index ef3bb0cb5f4f1..5475fa2ae5c6e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
@@ -71,9 +71,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32_vgprcd(ptr addrspace(1) %ar
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-SDAG-NEXT:    s_nop 1
-; GFX942-SDAG-NEXT:    v_mfma_f32_16x16x8_xf32 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT:    v_mfma_f32_16x16x8_xf32 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-SDAG-NEXT:    s_nop 6
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v8, v[0:3], s[6:7]
 ; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x8xf32_vgprcd:
@@ -87,14 +87,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32_vgprcd(ptr addrspace(1) %ar
 ; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-GISEL-NEXT:    s_mov_b32 s5, 4.0
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-GISEL-NEXT:    s_nop 1
-; GFX942-GISEL-NEXT:    v_mfma_f32_16x16x8_xf32 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT:    s_nop 6
-; GFX942-GISEL-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
+; GFX942-GISEL-NEXT:    v_mfma_f32_16x16x8_xf32 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-GISEL-NEXT:    s_nop 5
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GFX942-GISEL-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
index 9db40d2067226..8803f3ae4906f 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
@@ -506,13 +506,13 @@ define void @test_rewrite_mfma_subreg_insert1(float %arg0, float %arg1, ptr addr
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[6:9], v0, v1, v[2:5]
+; CHECK-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[2:5]
 ; CHECK-NEXT:    s_nop 3
-; CHECK-NEXT:    v_pk_mov_b32 v[0:1], v[6:7], v[8:9] op_sel:[1,0]
+; CHECK-NEXT:    v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    v_accvgpr_write_b32 a0, v0
 ; CHECK-NEXT:    v_accvgpr_write_b32 a1, v1
-; CHECK-NEXT:    v_accvgpr_write_b32 a2, v9
+; CHECK-NEXT:    v_accvgpr_write_b32 a2, v3
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use a[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
@@ -711,18 +711,15 @@ define void @test_rewrite_mfma_copy_from_agpr_class_f64_4x4x4f64_chain(double %a
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v12, v31
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], v[2:3], a[0:1]
-; CHECK-NEXT:    v_and_b32_e32 v12, 0x3ff, v12
-; CHECK-NEXT:    s_nop 2
+; CHECK-NEXT:    v_and_b32_e32 v2, 0x3ff, v31
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    v_lshl_add_u64 v[2:3], v[8:9], 0, v[2:3]
 ; CHECK-NEXT:    v_mfma_f64_4x4x4_4b_f64 a[0:1], v[4:5], v[6:7], a[0:1]
 ; CHECK-NEXT:    s_nop 8
 ; CHECK-NEXT:    global_store_dwordx2 v[2:3], a[0:1], off
-; CHECK-NEXT:    v_lshlrev_b32_e32 v4, 3, v12
-; CHECK-NEXT:    v_mov_b32_e32 v5, 0
-; CHECK-NEXT:    v_lshl_add_u64 v[4:5], v[8:9], 0, v[4:5]
-; CHECK-NEXT:    s_nop 5
-; CHECK-NEXT:    global_store_dwordx2 v[4:5], a[0:1], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %src2 = call double asm sideeffect "; def $0", "=a"()
diff --git a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
index e77856d073a0b..a81d9a458e23a 100644
--- a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
@@ -311,44 +311,43 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def v[12:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v6, 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:31]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:31]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    global_store_dwordx4 v6, v[60:63], s[16:17] offset:112
+; CHECK-NEXT:    global_store_dwordx4 v0, v[60:63], s[16:17] offset:112
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, v[56:59], s[16:17] offset:96
+; CHECK-NEXT:    global_store_dwordx4 v0, v[56:59], s[16:17] offset:96
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, v[52:55], s[16:17] offset:80
+; CHECK-NEXT:    global_store_dwordx4 v0, v[52:55], s[16:17] offset:80
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, v[48:51], s[16:17] offset:64
+; CHECK-NEXT:    global_store_dwordx4 v0, v[48:51], s[16:17] offset:64
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, v[44:47], s[16:17] offset:48
+; CHECK-NEXT:    global_store_dwordx4 v0, v[44:47], s[16:17] offset:48
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, v[40:43], s[16:17] offset:32
+; CHECK-NEXT:    global_store_dwordx4 v0, v[40:43], s[16:17] offset:32
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, v[36:39], s[16:17] offset:16
+; CHECK-NEXT:    global_store_dwordx4 v0, v[36:39], s[16:17] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, v[32:35], s[16:17]
+; CHECK-NEXT:    global_store_dwordx4 v0, v[32:35], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, a[56:59], s[16:17] offset:96
+; CHECK-NEXT:    global_store_dwordx4 v0, a[56:59], s[16:17] offset:96
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, a[60:63], s[16:17] offset:112
+; CHECK-NEXT:    global_store_dwordx4 v0, a[60:63], s[16:17] offset:112
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, a[48:51], s[16:17] offset:64
+; CHECK-NEXT:    global_store_dwordx4 v0, a[48:51], s[16:17] offset:64
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, a[52:55], s[16:17] offset:80
+; CHECK-NEXT:    global_store_dwordx4 v0, a[52:55], s[16:17] offset:80
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, a[40:43], s[16:17] offset:32
+; CHECK-NEXT:    global_store_dwordx4 v0, a[40:43], s[16:17] offset:32
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, a[44:47], s[16:17] offset:48
+; CHECK-NEXT:    global_store_dwordx4 v0, a[44:47], s[16:17] offset:48
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, a[32:35], s[16:17]
+; CHECK-NEXT:    global_store_dwordx4 v0, a[32:35], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, a[36:39], s[16:17] offset:16
+; CHECK-NEXT:    global_store_dwordx4 v0, a[36:39], s[16:17] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx4 v0, v[8:11], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)

>From 8cabb115a58bf9944f567b2a0d3f17cfb73f6139 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Fri, 19 Sep 2025 23:31:46 -0400
Subject: [PATCH 04/20] Restored SIRegisterInfo files

---
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 32 -----------------------
 llvm/lib/Target/AMDGPU/SIRegisterInfo.h   |  4 +--
 2 files changed, 1 insertion(+), 35 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 819a6c24ecade..ebd2e7ecf249e 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3831,38 +3831,6 @@ bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
     }
     return false;
   }
-  case AMDGPURI::HasRegisterAvoidanceList: {
-    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-    ArrayRef<Register> AvoidRegs = MFI->getRegistersToAvoid(VirtReg);
-
-    if (AvoidRegs.empty())
-      return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
-                                                       MF, VRM);
-    // Collect physical registers to avoid
-    SmallSet<MCPhysReg, 32> AvoidPhysRegs;
-    for (Register AvoidReg : AvoidRegs) {
-      if (VRM && VRM->hasPhys(AvoidReg)) {
-        // Virtual register already mapped - try to avoid its physical register
-        MCPhysReg AvoidPhys = VRM->getPhys(AvoidReg);
-        for (MCRegAliasIterator AI(AvoidPhys, this, true); AI.isValid(); ++AI)
-          AvoidPhysRegs.insert(*AI);
-      }
-    }
-
-    if (AvoidPhysRegs.empty()) {
-      // No physical registers added yet - use default order
-      return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
-                                                       MF, VRM);
-    }
-
-    // Prioritize registers that don't conflict with avoided registers
-    for (MCPhysReg PhysReg : Order) {
-      if (!AvoidPhysRegs.count(PhysReg) && !MRI.isReserved(PhysReg))
-        Hints.push_back(PhysReg);
-    }
-
-    return false;
-  }
   default:
     return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
                                                      VRM);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index ed0c580abc952..7b91ba7bc581f 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -31,11 +31,9 @@ class RegisterBank;
 struct SGPRSpillBuilder;
 
 /// Register allocation hint types. Helps eliminate unneeded COPY with True16
-/// HasRegisterAvoidanceList helps with minimizing usage of conflicting physical
-/// registers
 namespace AMDGPURI {
 
-enum { Size16 = 1, Size32 = 2, HasRegisterAvoidanceList = 3 };
+enum { Size16 = 1, Size32 = 2 };
 
 } // end namespace AMDGPURI
 

>From e3d9f7b8ecc3628c1fadffadaf8858b5d6e21179 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Fri, 19 Sep 2025 23:33:47 -0400
Subject: [PATCH 05/20] Restored SIMachineFunctionInfo files

---
 llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 46844d5c3fb87..2c1a13c345aac 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -1220,20 +1220,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   unsigned getMaxNumWorkGroupsZ() const { return MaxNumWorkGroups[2]; }
 
   AMDGPU::ClusterDimsAttr getClusterDims() const { return ClusterDims; }
-
-  // Map of registers to avoid for a given register
-  DenseMap<Register, SmallVector<Register, 8>> RegisterAvoidanceMap;
-
-  void addRegisterToAvoid(Register VirtReg, Register AvoidReg) {
-    RegisterAvoidanceMap[VirtReg].push_back(AvoidReg);
-  }
-
-  ArrayRef<Register> getRegistersToAvoid(Register VirtReg) const {
-    auto It = RegisterAvoidanceMap.find(VirtReg);
-    if (It != RegisterAvoidanceMap.end())
-      return It->second;
-    return ArrayRef<Register>();
-  }
 };
 
 } // end namespace llvm

>From 44f66f1256d85bb810ad8b742a17418bae83fd30 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Sun, 21 Sep 2025 00:17:36 -0400
Subject: [PATCH 06/20] Updated sources to support anti-hint mechanism

---
 .../include/llvm/CodeGen/MIRParser/MIParser.h |  1 +
 llvm/include/llvm/CodeGen/MIRYamlMapping.h    |  3 +
 .../llvm/CodeGen/MachineRegisterInfo.h        | 56 +++++++++++++++
 llvm/lib/CodeGen/AllocationOrder.cpp          | 68 ++++++++++++++++++-
 llvm/lib/CodeGen/AllocationOrder.h            |  7 ++
 llvm/lib/CodeGen/MIRParser/MIRParser.cpp      | 19 ++++++
 llvm/lib/CodeGen/MIRPrinter.cpp               | 11 +++
 llvm/lib/CodeGen/MachineRegisterInfo.cpp      | 27 ++++++++
 .../Target/AMDGPU/GCNPreRAOptimizations.cpp   | 14 +---
 9 files changed, 192 insertions(+), 14 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h
index 0f2898d3554d0..1d0a745d5f983 100644
--- a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h
+++ b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h
@@ -45,6 +45,7 @@ struct VRegInfo {
   } D;
   Register VReg;
   Register PreferredReg;
+  SmallVector<Register, 4> AntiHints;  // Anti-hints
   uint8_t Flags = 0;
 };
 
diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
index e80c13885805b..24fac0235e960 100644
--- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h
+++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
@@ -192,6 +192,7 @@ struct VirtualRegisterDefinition {
   StringValue Class;
   StringValue PreferredRegister;
   std::vector<FlowStringValue> RegisterFlags;
+  std::vector<FlowStringValue> AntiHints;
 
   // TODO: Serialize the target specific register hints.
 
@@ -209,6 +210,8 @@ template <> struct MappingTraits<VirtualRegisterDefinition> {
                        StringValue()); // Don't print out when it's empty.
     YamlIO.mapOptional("flags", Reg.RegisterFlags,
                        std::vector<FlowStringValue>());
+    YamlIO.mapOptional("anti-hints", Reg.AntiHints,
+                       std::vector<FlowStringValue>());  // For anti-hints.
   }
 
   static const bool flow = true;
diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index 27b30bd5929ff..bcee5d6b30439 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -42,6 +42,7 @@
 namespace llvm {
 
 class PSetIterator;
+class VirtRegMap;
 
 /// Convenient type to represent either a register class or a register bank.
 using RegClassOrRegBank =
@@ -107,6 +108,12 @@ class MachineRegisterInfo {
              VirtReg2IndexFunctor>
       RegAllocHints;
 
+  /// AntiHintRegs - This vector records register anti-hints for
+  /// virtual registers. For each virtual register, it keeps a vector of virtual
+  /// registers that should NOT be allocated to the same or overlapping physical
+  /// registers.
+  IndexedMap<SmallVector<Register, 4>, VirtReg2IndexFunctor> AntiHintRegs;
+
   /// PhysRegUseDefLists - This is an array of the head of the use/def list for
   /// physical registers.
   std::unique_ptr<MachineOperand *[]> PhysRegUseDefLists;
@@ -860,6 +867,55 @@ class MachineRegisterInfo {
     return RegAllocHints.inBounds(VReg) ? &RegAllocHints[VReg] : nullptr;
   }
 
+  /// setRegAllocationAntiHint - Add a register allocation anti-hint for the
+  /// specified virtual register. This tells the allocator to avoid allocating
+  /// VReg to the same physical register as AntiHintVReg (or overlapping ones).
+  void setRegAllocationAntiHint(Register VReg, Register AntiHintVReg) {
+    assert(VReg.isVirtual() && "Anti-hints are only for virtual registers");
+    assert(AntiHintVReg.isVirtual() && "Anti-hint target must be virtual");
+    AntiHintRegs.grow(Register::index2VirtReg(getNumVirtRegs()));
+    auto &AntiHints = AntiHintRegs[VReg];
+    // Avoid duplicates
+    if (llvm::find(AntiHints, AntiHintVReg) == AntiHints.end())
+      AntiHints.push_back(AntiHintVReg);
+  }
+
+  /// addRegAllocationAntiHints - Add multiple anti-hints at once.
+  void addRegAllocationAntiHints(Register VReg, ArrayRef<Register> AntiHintVRegs) {
+    for (Register AntiHint : AntiHintVRegs)
+      setRegAllocationAntiHint(VReg, AntiHint);
+  }
+
+  /// clearRegAllocationAntiHints - Clear all anti-hints for a register
+  void clearRegAllocationAntiHints(Register VReg) {
+    assert(VReg.isVirtual());
+    if (AntiHintRegs.inBounds(VReg))
+      AntiHintRegs[VReg].clear();
+  }
+
+  /// getRegAllocationAntiHints - Return the vector of anti-hints for VReg
+  ArrayRef<Register> getRegAllocationAntiHints(Register VReg) const {
+    assert(VReg.isVirtual());
+    if (!AntiHintRegs.inBounds(VReg))
+      return ArrayRef<Register>();
+    return AntiHintRegs[VReg];
+  }
+
+  /// hasRegAllocationAntiHint - Check if VReg has AntiHintVReg as an anti-hint
+  bool hasRegAllocationAntiHint(Register VReg, Register AntiHintVReg) const {
+    assert(VReg.isVirtual() && AntiHintVReg.isVirtual());
+    if (!AntiHintRegs.inBounds(VReg))
+      return false;
+    const auto &AntiHints = AntiHintRegs[VReg];
+    return llvm::find(AntiHints, AntiHintVReg) != AntiHints.end();
+  }
+
+  /// getPhysRegAntiHints - Get the set of physical registers to avoid based on
+  /// anti-hints and current allocations. This is called during allocation.
+  /// VRM is the current virtual register map showing allocations made so far.
+  void getPhysRegAntiHints(Register VReg, SmallVectorImpl<MCPhysReg> &PhysAntiHints,
+                          const VirtRegMap *VRM) const;
+
   /// markUsesInDebugValueAsUndef - Mark every DBG_VALUE referencing the
   /// specified register as undefined which causes the DBG_VALUE to be
   /// deleted during LiveDebugVariables analysis.
diff --git a/llvm/lib/CodeGen/AllocationOrder.cpp b/llvm/lib/CodeGen/AllocationOrder.cpp
index 183dc8af1b91b..f57df79128c64 100644
--- a/llvm/lib/CodeGen/AllocationOrder.cpp
+++ b/llvm/lib/CodeGen/AllocationOrder.cpp
@@ -31,6 +31,7 @@ AllocationOrder AllocationOrder::create(Register VirtReg, const VirtRegMap &VRM,
                                         const LiveRegMatrix *Matrix) {
   const MachineFunction &MF = VRM.getMachineFunction();
   const TargetRegisterInfo *TRI = &VRM.getTargetRegInfo();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
   auto Order = RegClassInfo.getOrder(MF.getRegInfo().getRegClass(VirtReg));
   SmallVector<MCPhysReg, 16> Hints;
   bool HardHints =
@@ -44,8 +45,69 @@ AllocationOrder AllocationOrder::create(Register VirtReg, const VirtRegMap &VRM,
       dbgs() << '\n';
     }
   });
-  assert(all_of(Hints,
-                [&](MCPhysReg Hint) { return is_contained(Order, Hint); }) &&
+
+  // Get anti-hints
+  SmallVector<MCPhysReg, 16> AntiHintedPhysRegs;
+  MRI.getPhysRegAntiHints(VirtReg, AntiHintedPhysRegs, &VRM);
+  
+  LLVM_DEBUG({
+    if (!AntiHintedPhysRegs.empty()) {
+      dbgs() << "anti-hints:";
+      for (MCPhysReg AntiHint : AntiHintedPhysRegs)
+        dbgs() << ' ' << printReg(AntiHint, TRI);
+      dbgs() << '\n';
+    }
+  });
+  
+  // Create allocation order object
+  AllocationOrder AO(std::move(Hints), Order, HardHints);
+  
+  // Apply anti-hint filtering if needed
+  if (!AntiHintedPhysRegs.empty()) {
+    AO.applyAntiHints(AntiHintedPhysRegs, TRI);
+    
+    LLVM_DEBUG({
+      if (!AO.Hints.empty()) {
+        dbgs() << "filtered hints:";
+        for (MCPhysReg Hint : AO.Hints)
+          dbgs() << ' ' << printReg(Hint, TRI);
+        dbgs() << '\n';
+      }
+    });
+  }
+
+
+  assert(all_of(AO.Hints,
+                [&](MCPhysReg Hint) { return is_contained(AO.Order, Hint); }) &&
          "Target hint is outside allocation order.");
-  return AllocationOrder(std::move(Hints), Order, HardHints);
+  return AO;
+}
+
+void AllocationOrder::applyAntiHints(ArrayRef<MCPhysReg> AntiHintedPhysRegs, 
+                                     const TargetRegisterInfo *TRI) {
+  // Create filtered order
+  FilteredOrderStorage.clear();
+  FilteredOrderStorage.reserve(Order.size());
+  
+  // Add non-anti-hinted registers first
+  for (MCPhysReg PhysReg : Order) {
+    if (!is_contained(AntiHintedPhysRegs, PhysReg)) {
+      FilteredOrderStorage.push_back(PhysReg);
+    }
+  }
+  
+  // Add anti-hinted registers at the end as last resort
+  for (MCPhysReg PhysReg : Order) {
+    if (is_contained(AntiHintedPhysRegs, PhysReg)) {
+      FilteredOrderStorage.push_back(PhysReg);
+    }
+  }
+  
+  // Update Order to point to our filtered storage
+  Order = FilteredOrderStorage;
+  
+  LLVM_DEBUG({
+    dbgs() << "moved " << AntiHintedPhysRegs.size() 
+           << " anti-hinted registers to end of allocation order\n";
+  });
 }
diff --git a/llvm/lib/CodeGen/AllocationOrder.h b/llvm/lib/CodeGen/AllocationOrder.h
index 3dd02c3b14d3a..842f83d957a6d 100644
--- a/llvm/lib/CodeGen/AllocationOrder.h
+++ b/llvm/lib/CodeGen/AllocationOrder.h
@@ -20,6 +20,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/Register.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
 
 namespace llvm {
 
@@ -29,6 +30,7 @@ class LiveRegMatrix;
 
 class LLVM_LIBRARY_VISIBILITY AllocationOrder {
   const SmallVector<MCPhysReg, 16> Hints;
+  SmallVector<MCPhysReg, 16> FilteredOrderStorage;
   ArrayRef<MCPhysReg> Order;
   // How far into the Order we can iterate. This is 0 if the AllocationOrder is
   // constructed with HardHints = true, Order.size() otherwise. While
@@ -117,6 +119,11 @@ class LLVM_LIBRARY_VISIBILITY AllocationOrder {
                static_cast<uint32_t>(std::numeric_limits<MCPhysReg>::max()));
     return Reg.isPhysical() && is_contained(Hints, Reg.id());
   }
+  
+  /// Apply anti-hints: move anti-hinted registers to the end of the order.
+  void applyAntiHints(ArrayRef<MCPhysReg> AntiHintedPhysRegs, 
+                      const TargetRegisterInfo *TRI);
+
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
index 0f792b0ef206c..d63f8040de331 100644
--- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -735,6 +735,20 @@ bool MIRParserImpl::parseRegisterInfo(PerFunctionMIParsingState &PFS,
                          FlagStringValue.Value + "'");
       Info.Flags |= FlagValue;
     }
+
+    for (const auto &AntiHintValue : VReg.AntiHints) {
+      if (Info.Kind != VRegInfo::NORMAL)
+        return error(VReg.Class.SourceRange.Start,
+              Twine("anti-hints can only be set for normal vregs"));
+
+      Register AntiHintReg;
+      if (parseRegisterReference(PFS, AntiHintReg,
+                                 AntiHintValue.Value, Error))
+        return error(Error, AntiHintValue.SourceRange);
+      
+      Info.AntiHints.push_back(AntiHintReg);
+    }
+
     RegInfo.noteNewVirtualRegister(Info.VReg);
   }
 
@@ -801,6 +815,11 @@ bool MIRParserImpl::setupRegisterInfo(const PerFunctionMIParsingState &PFS,
       MRI.setRegClass(Reg, Info.D.RC);
       if (Info.PreferredReg != 0)
         MRI.setSimpleHint(Reg, Info.PreferredReg);
+
+      for (Register AntiHint : Info.AntiHints) {
+        if (AntiHint != 0)
+          MRI.setRegAllocationAntiHint(Reg, AntiHint);
+      }
       break;
     case VRegInfo::GENERIC:
       break;
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index 1d54d72336860..f263bb22e800b 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -312,6 +312,17 @@ static void convertMRI(yaml::MachineFunction &YamlMF, const MachineFunction &MF,
     if (PreferredReg)
       printRegMIR(PreferredReg, VReg.PreferredRegister, TRI);
     printRegFlags(Reg, VReg.RegisterFlags, MF, TRI);
+    // Print the anti-hints.
+    const auto &AntiHints = RegInfo.getRegAllocationAntiHints(Reg);
+    if (!AntiHints.empty()) {
+      std::vector<yaml::FlowStringValue> AntiHintStrings;
+      for (Register AntiHint : AntiHints) {
+        yaml::FlowStringValue AntiHintStr;
+        printRegMIR(AntiHint, AntiHintStr, TRI);
+        AntiHintStrings.push_back(std::move(AntiHintStr));
+      }
+      VReg.AntiHints = std::move(AntiHintStrings);
+    }
     YamlMF.VirtualRegisters.push_back(std::move(VReg));
   }
 
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index ae284f3ae2929..bbf03830b3bd5 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/DebugLoc.h"
@@ -674,3 +675,29 @@ bool MachineRegisterInfo::isReservedRegUnit(unsigned Unit) const {
   }
   return false;
 }
+
+void MachineRegisterInfo::getPhysRegAntiHints(Register VReg, 
+                                             SmallVectorImpl<MCPhysReg> &PhysAntiHints,
+                                             const VirtRegMap *VRM) const {
+  assert(VReg.isVirtual());
+  if (!AntiHintRegs.inBounds(VReg) || !VRM)
+    return;
+  
+  const auto &AntiHints = AntiHintRegs[VReg];
+  const TargetRegisterInfo *TRI = getTargetRegisterInfo();
+  
+  for (Register AntiHintVReg : AntiHints) {
+    // Check if the anti-hinted register has been allocated
+    if (VRM->hasPhys(AntiHintVReg)) {
+      MCPhysReg PhysReg = VRM->getPhys(AntiHintVReg);
+      // Add the physical register and all its aliases
+      for (MCRegAliasIterator AI(PhysReg, TRI, true); AI.isValid(); ++AI) {
+        PhysAntiHints.push_back(*AI);
+      }
+    }
+  }
+  
+  // Remove duplicates
+  llvm::sort(PhysAntiHints);
+  PhysAntiHints.erase(llvm::unique(PhysAntiHints), PhysAntiHints.end());
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index ed349fccfa3e4..1a8cd84f7640a 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -323,17 +323,9 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
               // Check if MFMA register is dead at current instruction
               const LiveInterval &MFMAInterval = LIS->getInterval(MFMAReg);
               if (!MFMAInterval.liveAt(CurrentSlot)) {
-
-                // Add bidirectional avoidance hint
-                MFI->addRegisterToAvoid(CandidateReg, MFMAReg);
-                MFI->addRegisterToAvoid(MFMAReg, CandidateReg);
-
-                // Set hint if we found registers to avoid
-                MRI->setRegAllocationHint(
-                    MFMAReg, AMDGPURI::HasRegisterAvoidanceList, Register());
-                MRI->setRegAllocationHint(CandidateReg,
-                                          AMDGPURI::HasRegisterAvoidanceList,
-                                          Register());
+                // Add bidirectional anti-hints
+                MRI->addRegAllocationAntiHints(CandidateReg, MFMARegs);
+                MRI->addRegAllocationAntiHints(MFMAReg, CandidateReg);
               }
             }
           }

>From 66dad59f063e277eaf23e167bd733b1836e4e212 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Sun, 21 Sep 2025 01:58:40 -0400
Subject: [PATCH 07/20] Made anti-hints map conditional in MIRYamlMapping

---
 llvm/include/llvm/CodeGen/MIRYamlMapping.h       | 6 ++++--
 llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
index 24fac0235e960..20cc3c370dc66 100644
--- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h
+++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
@@ -210,8 +210,10 @@ template <> struct MappingTraits<VirtualRegisterDefinition> {
                        StringValue()); // Don't print out when it's empty.
     YamlIO.mapOptional("flags", Reg.RegisterFlags,
                        std::vector<FlowStringValue>());
-    YamlIO.mapOptional("anti-hints", Reg.AntiHints,
-                       std::vector<FlowStringValue>());  // For anti-hints.
+    if(!YamlIO.outputting() || !Reg.AntiHints.empty()) {  // Only map when parsing or anti-hints present
+      YamlIO.mapOptional("anti-hints", Reg.AntiHints,
+                       std::vector<FlowStringValue>());  // for anti-hints
+    }
   }
 
   static const bool flow = true;
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 1a8cd84f7640a..f63eea716d68b 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -48,7 +48,7 @@ static cl::opt<bool> EnableRegisterAvoidListForMFMARegs(
     "amdgpu-avoid-hazard-hint-for-mfma", cl::Hidden,
     cl::desc("Enable Register Avoidance for "
              "MFMA in GCNPreRAOptimizations stage."),
-    cl::init(false));
+    cl::init(true));
 
 namespace {
 

>From eb9404da2dd7251aef47980c5cbce12ad52c168a Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Sun, 21 Sep 2025 02:14:53 -0400
Subject: [PATCH 08/20] Updated tests

---
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir |   21 +-
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir |  542 ++++-----
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll |  116 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll |  120 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll |  278 +++--
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll  |  150 +--
 ...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll |  271 +++--
 .../AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll    |   12 +-
 .../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll       | 1071 +++++++++++------
 .../AMDGPU/rewrite-vgpr-mfma-to-agpr.ll       |   20 +-
 .../unspill-vgpr-after-rewrite-vgpr-mfma.ll   |  170 ++-
 11 files changed, 1614 insertions(+), 1157 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
index 01b24dfd79941..8fbfe2e591dfe 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
@@ -15,9 +15,12 @@
   ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
   ; GCN-NEXT:    ; implicit-def: $vgpr106
   ; GCN-NEXT:    ; implicit-def: $vgpr132
+  ; GCN-NEXT:    ; implicit-def: $vgpr112
+  ; GCN-NEXT:    ; implicit-def: $vgpr113
+  ; GCN-NEXT:    ; implicit-def: $vgpr114
+  ; GCN-NEXT:    ; implicit-def: $vgpr115
   ; GCN-NEXT:    ; implicit-def: $vgpr133
   ; GCN-NEXT:    ; implicit-def: $vgpr139
-  ; GCN-NEXT:    ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127
   ; GCN-NEXT:    ; iglp_opt mask(0x00000002)
   ; GCN-NEXT:    ; implicit-def: $sgpr0
   ; GCN-NEXT:    ; implicit-def: $vgpr16
@@ -222,6 +225,9 @@
   ; GCN-NEXT:    buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:192 sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
+  ; GCN-NEXT:    v_add_u32_e32 v72, 0xc0, v93
+  ; GCN-NEXT:    v_add_u32_e32 v73, v132, v112
   ; GCN-NEXT:    buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[184:185], v[156:157], v[64:79]
   ; GCN-NEXT:    buffer_load_dwordx4 v[226:229], v227, s[8:11], 0 offen sc0 sc1
@@ -236,15 +242,12 @@
   ; GCN-NEXT:    buffer_load_dwordx2 v[162:163], v232, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
-  ; GCN-NEXT:    ; implicit-def: $vgpr74
-  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v74
-  ; GCN-NEXT:    ; implicit-def: $vgpr75
+  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v114
   ; GCN-NEXT:    buffer_load_dwordx2 v[100:101], v72, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    buffer_load_dwordx2 v[172:173], v233, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v75
+  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v115
   ; GCN-NEXT:    buffer_load_dwordx2 v[104:105], v72, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    buffer_load_dwordx2 v[174:175], v234, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -252,6 +255,8 @@
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
+  ; GCN-NEXT:    ; kill: killed $vgpr73
   ; GCN-NEXT:    ds_read_b128 v[72:75], v94
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[186:187], v[158:159], v[64:79]
   ; GCN-NEXT:    v_perm_b32 v238, v162, v160, s5
@@ -262,9 +267,11 @@
   ; GCN-NEXT:    ds_read_b128 v[160:163], v213
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ; kill: killed $vgpr76
   ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
   ; GCN-NEXT:    ; implicit-def: $sgpr8
+  ; GCN-NEXT:    ; implicit-def: $vgpr112
+  ; GCN-NEXT:    ; implicit-def: $vgpr113
+  ; GCN-NEXT:    ; implicit-def: $vgpr114
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
   ; GCN-NEXT:    ds_read_b128 v[72:75], v94 offset:512
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
index 0887fdf0844b0..be97a1e82fcf2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
@@ -10,25 +10,24 @@
   ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
   ; GCN-NEXT:    v_readfirstlane_b32 s20, v2
   ; GCN-NEXT:    ; implicit-def: $sgpr4
-  ; GCN-NEXT:    ; implicit-def: $vgpr3
+  ; GCN-NEXT:    ; implicit-def: $vgpr64
   ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
   ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:    ; implicit-def: $vgpr50
+  ; GCN-NEXT:    ; implicit-def: $vgpr76
   ; GCN-NEXT:    ; implicit-def: $sgpr16_sgpr17_sgpr18_sgpr19
   ; GCN-NEXT:    ; implicit-def: $vgpr49
   ; GCN-NEXT:    ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43
-  ; GCN-NEXT:    ; implicit-def: $vgpr51
-  ; GCN-NEXT:    ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65
-  ; GCN-NEXT:    ; implicit-def: $vgpr76
+  ; GCN-NEXT:    ; implicit-def: $vgpr50
   ; GCN-NEXT:    ; implicit-def: $vgpr77
   ; GCN-NEXT:    ; implicit-def: $vgpr78
   ; GCN-NEXT:    ; implicit-def: $vgpr79
   ; GCN-NEXT:    ; implicit-def: $vgpr80
-  ; GCN-NEXT:    ; implicit-def: $vgpr91
+  ; GCN-NEXT:    ; implicit-def: $vgpr81
+  ; GCN-NEXT:    ; implicit-def: $vgpr103
   ; GCN-NEXT:    ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19
   ; GCN-NEXT:    ; iglp_opt mask(0x00000002)
   ; GCN-NEXT:    s_nop 1
-  ; GCN-NEXT:    v_lshl_add_u32 v2, s20, 4, v3
+  ; GCN-NEXT:    v_lshl_add_u32 v2, s20, 4, v64
   ; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s4, v2, v[0:1]
   ; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -36,8 +35,9 @@
   ; GCN-NEXT:    s_lshl_b32 s4, s20, 7
   ; GCN-NEXT:    ; implicit-def: $vgpr5
   ; GCN-NEXT:    v_add_lshl_u32 v48, v5, s4, 1
-  ; GCN-NEXT:    v_add_u32_e32 v76, s20, v76
-  ; GCN-NEXT:    v_and_b32_e32 v76, 0x1fffffff, v76
+  ; GCN-NEXT:    ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65
+  ; GCN-NEXT:    v_add_u32_e32 v77, s20, v77
+  ; GCN-NEXT:    v_and_b32_e32 v77, 0x1fffffff, v77
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    ds_write_b128 v48, v[0:3]
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -48,8 +48,8 @@
   ; GCN-NEXT:    ; implicit-def: $vgpr1
   ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
   ; GCN-NEXT:    ; implicit-def: $sgpr6
-  ; GCN-NEXT:    v_add_u32_e32 v0, v0, v50
-  ; GCN-NEXT:    v_add_u32_e32 v1, v1, v50
+  ; GCN-NEXT:    v_add_u32_e32 v0, v0, v76
+  ; GCN-NEXT:    v_add_u32_e32 v1, v1, v76
   ; GCN-NEXT:    buffer_load_dwordx2 v[72:73], v0, s[16:19], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
@@ -68,22 +68,22 @@
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], 0
   ; GCN-NEXT:    ; kill: killed $vgpr1
   ; GCN-NEXT:    ; kill: killed $vgpr0
-  ; GCN-NEXT:    v_mul_lo_u32 v76, v76, s6
-  ; GCN-NEXT:    v_add_lshl_u32 v76, v77, v76, 1
-  ; GCN-NEXT:    v_lshl_add_u32 v77, v78, 1, v76
-  ; GCN-NEXT:    ; implicit-def: $sgpr5
+  ; GCN-NEXT:    v_mul_lo_u32 v77, v77, s6
+  ; GCN-NEXT:    v_add_lshl_u32 v77, v78, v77, 1
   ; GCN-NEXT:    v_lshl_add_u32 v78, v79, 1, v77
+  ; GCN-NEXT:    ; implicit-def: $sgpr5
+  ; GCN-NEXT:    v_lshl_add_u32 v79, v80, 1, v78
   ; GCN-NEXT:    ; implicit-def: $sgpr2
   ; GCN-NEXT:    ; implicit-def: $sgpr3
-  ; GCN-NEXT:    v_lshl_add_u32 v79, v80, 1, v78
+  ; GCN-NEXT:    v_lshl_add_u32 v80, v81, 1, v79
   ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], 0
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31]
-  ; GCN-NEXT:    ds_read_b128 v[36:39], v51
+  ; GCN-NEXT:    ds_read_b128 v[36:39], v50
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15]
-  ; GCN-NEXT:    ds_read_b128 v[44:47], v51 offset:512
+  ; GCN-NEXT:    ds_read_b128 v[44:47], v50 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43
@@ -107,20 +107,20 @@
   ; GCN-NEXT:    ds_read_b128 v[40:43], v49 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[68:71], v51
+  ; GCN-NEXT:    ds_read_b128 v[68:71], v50
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[36:37], v[16:31]
   ; GCN-NEXT:    ; implicit-def: $vgpr32
   ; GCN-NEXT:    ; implicit-def: $vgpr33
-  ; GCN-NEXT:    v_add_u32_e32 v82, v32, v50
-  ; GCN-NEXT:    v_add_u32_e32 v83, v33, v50
-  ; GCN-NEXT:    ; kill: killed $vgpr82
+  ; GCN-NEXT:    v_add_u32_e32 v83, v32, v76
+  ; GCN-NEXT:    v_add_u32_e32 v76, v33, v76
   ; GCN-NEXT:    ; kill: killed $vgpr83
+  ; GCN-NEXT:    ; kill: killed $vgpr76
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[38:39], v[16:31]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[40:41], v[36:37], v[0:15]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[68:69], v[62:63], v[16:31]
-  ; GCN-NEXT:    ds_read_b128 v[66:69], v51 offset:512
+  ; GCN-NEXT:    ds_read_b128 v[66:69], v50 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
@@ -131,20 +131,20 @@
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[62:63], v[0:15]
   ; GCN-NEXT:    ; implicit-def: $vgpr66
   ; GCN-NEXT:    ; implicit-def: $vgpr67
-  ; GCN-NEXT:    v_max_f32_e32 v81, v67, v67
+  ; GCN-NEXT:    v_max_f32_e32 v82, v67, v67
   ; GCN-NEXT:    ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[70:71], v[64:65], v[16:31]
   ; GCN-NEXT:    v_perm_b32 v70, v74, v72, s2
   ; GCN-NEXT:    v_perm_b32 v71, v74, v72, s3
   ; GCN-NEXT:    v_perm_b32 v72, v75, v73, s2
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
-  ; GCN-NEXT:    ds_write_b32 v76, v70
+  ; GCN-NEXT:    ds_write_b32 v77, v70
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v77, v71
+  ; GCN-NEXT:    ds_write_b32 v78, v71
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v78, v72
+  ; GCN-NEXT:    ds_write_b32 v79, v72
   ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v20
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15]
   ; GCN-NEXT:    v_mul_f32_e32 v64, s4, v16
@@ -152,11 +152,11 @@
   ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v18
   ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v19
   ; GCN-NEXT:    v_max3_f32 v64, v64, s5, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v21
+  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v21
   ; GCN-NEXT:    v_max3_f32 v64, v64, v68, v69
   ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v22
   ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v23
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v80
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v81
   ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v24
   ; GCN-NEXT:    v_mul_f32_e32 v87, s4, v25
   ; GCN-NEXT:    v_max3_f32 v64, v64, v84, v85
@@ -166,12 +166,12 @@
   ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v28
   ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v29
   ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v68
-  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v30
+  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v30
   ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v31
   ; GCN-NEXT:    v_max3_f32 v64, v64, v69, v74
   ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v0
   ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v1
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v80, v84
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v81, v84
   ; GCN-NEXT:    v_mul_f32_e32 v87, s4, v2
   ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v3
   ; GCN-NEXT:    v_max3_f32 v64, v64, v85, v86
@@ -179,315 +179,315 @@
   ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v5
   ; GCN-NEXT:    v_max3_f32 v64, v64, v87, v65
   ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v6
-  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v7
+  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v7
   ; GCN-NEXT:    v_max3_f32 v64, v64, v68, v69
   ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v8
   ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v9
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v80
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v81
   ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v10
   ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v11
   ; GCN-NEXT:    v_max3_f32 v64, v64, v84, v85
   ; GCN-NEXT:    v_mul_f32_e32 v87, s4, v12
   ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v13
   ; GCN-NEXT:    v_max3_f32 v64, v64, v86, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v14
-  ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v15
   ; GCN-NEXT:    v_max3_f32 v64, v64, v87, v68
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v69, v74
-  ; GCN-NEXT:    ds_bpermute_b32 v65, v66, v64
   ; GCN-NEXT:    v_perm_b32 v68, v75, v73, s3
+  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v14
+  ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v15
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v79, v68
-  ; GCN-NEXT:    ; implicit-def: $vgpr84
-  ; GCN-NEXT:    v_max_f32_e32 v65, v65, v65
-  ; GCN-NEXT:    v_max_f32_e32 v70, v64, v65
+  ; GCN-NEXT:    ds_write_b32 v80, v68
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v69, v74
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_load_dwordx2 v[64:65], v82, s[16:19], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[70:71], v76, s[16:19], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_bpermute_b32 v71, v66, v70
+  ; GCN-NEXT:    ds_bpermute_b32 v65, v66, v64
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    ; implicit-def: $vgpr87
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_cndmask_b32_e64 v70, v71, v70, s[0:1]
-  ; GCN-NEXT:    v_max_f32_e32 v70, v70, v70
-  ; GCN-NEXT:    v_max_f32_e32 v72, v81, v70
-  ; GCN-NEXT:    v_fma_f32 v16, s4, v16, -v72
-  ; GCN-NEXT:    v_fma_f32 v18, s4, v18, -v72
-  ; GCN-NEXT:    v_fma_f32 v19, s4, v19, -v72
+  ; GCN-NEXT:    v_max_f32_e32 v65, v65, v65
+  ; GCN-NEXT:    v_max_f32_e32 v64, v64, v65
+  ; GCN-NEXT:    ds_bpermute_b32 v65, v66, v64
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    v_cndmask_b32_e64 v64, v65, v64, s[0:1]
+  ; GCN-NEXT:    v_max_f32_e32 v64, v64, v64
+  ; GCN-NEXT:    v_max_f32_e32 v65, v82, v64
+  ; GCN-NEXT:    v_fma_f32 v16, s4, v16, -v65
+  ; GCN-NEXT:    v_fma_f32 v17, s4, v17, -v65
+  ; GCN-NEXT:    v_fma_f32 v18, s4, v18, -v65
+  ; GCN-NEXT:    v_fma_f32 v19, s4, v19, -v65
   ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v17
   ; GCN-NEXT:    v_mul_f32_e32 v18, 0x3fb8aa3b, v18
   ; GCN-NEXT:    v_mul_f32_e32 v19, 0x3fb8aa3b, v19
-  ; GCN-NEXT:    v_fma_f32 v17, s4, v17, -v72
-  ; GCN-NEXT:    v_fma_f32 v20, s4, v20, -v72
-  ; GCN-NEXT:    v_fma_f32 v21, s4, v21, -v72
-  ; GCN-NEXT:    v_fma_f32 v22, s4, v22, -v72
-  ; GCN-NEXT:    v_fma_f32 v23, s4, v23, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v73, v16
-  ; GCN-NEXT:    v_exp_f32_e32 v74, v18
-  ; GCN-NEXT:    v_exp_f32_e32 v75, v19
+  ; GCN-NEXT:    v_fma_f32 v20, s4, v20, -v65
+  ; GCN-NEXT:    v_fma_f32 v21, s4, v21, -v65
+  ; GCN-NEXT:    v_fma_f32 v22, s4, v22, -v65
+  ; GCN-NEXT:    v_fma_f32 v23, s4, v23, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v72, v16
+  ; GCN-NEXT:    v_exp_f32_e32 v73, v17
+  ; GCN-NEXT:    v_exp_f32_e32 v81, v18
+  ; GCN-NEXT:    v_exp_f32_e32 v82, v19
   ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v20
   ; GCN-NEXT:    v_mul_f32_e32 v21, 0x3fb8aa3b, v21
   ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v22
-  ; GCN-NEXT:    v_exp_f32_e32 v80, v20
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v73
-  ; GCN-NEXT:    v_fma_f32 v18, s4, v24, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v81, v21
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v74
-  ; GCN-NEXT:    v_fma_f32 v20, s4, v25, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v82, v22
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v75
-  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v17
-  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
-  ; GCN-NEXT:    v_fma_f32 v26, s4, v26, -v72
-  ; GCN-NEXT:    v_pack_b32_f16 v71, v21, v22
-  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v18
-  ; GCN-NEXT:    v_sub_f32_e32 v24, v67, v72
-  ; GCN-NEXT:    v_exp_f32_e32 v83, v23
-  ; GCN-NEXT:    v_fma_f32 v67, s4, v27, -v72
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v72
+  ; GCN-NEXT:    v_fma_f32 v17, s4, v24, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v83, v20
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v73
+  ; GCN-NEXT:    v_fma_f32 v19, s4, v25, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v84, v21
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v81
+  ; GCN-NEXT:    v_fma_f32 v26, s4, v26, -v65
   ; GCN-NEXT:    v_exp_f32_e32 v85, v22
-  ; GCN-NEXT:    v_exp_f32_e32 v17, v17
-  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v24
-  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v20
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v17
-  ; GCN-NEXT:    v_fma_f32 v87, s4, v29, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v88, v23
-  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v72
-  ; GCN-NEXT:    v_pack_b32_f16 v70, v16, v19
-  ; GCN-NEXT:    ds_read_b128 v[18:21], v84
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v82
+  ; GCN-NEXT:    v_pack_b32_f16 v24, v16, v18
+  ; GCN-NEXT:    v_sub_f32_e32 v22, v67, v65
+  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
+  ; GCN-NEXT:    v_pack_b32_f16 v25, v20, v21
+  ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v17
+  ; GCN-NEXT:    v_mul_f32_e32 v21, 0x3fb8aa3b, v19
+  ; GCN-NEXT:    ds_read_b128 v[16:19], v87
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_exp_f32_e32 v16, v24
-  ; GCN-NEXT:    ds_read_b128 v[22:25], v84 offset:576
+  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v22
+  ; GCN-NEXT:    v_fma_f32 v67, s4, v27, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v86, v23
+  ; GCN-NEXT:    v_exp_f32_e32 v64, v22
+  ; GCN-NEXT:    v_mul_f32_e32 v67, 0x3fb8aa3b, v67
+  ; GCN-NEXT:    v_pk_mul_f32 v[48:49], v[48:49], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[50:51], v[50:51], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[52:53], v[52:53], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[54:55], v[54:55], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[56:57], v[56:57], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[58:59], v[58:59], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[60:61], v[60:61], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[62:63], v[62:63], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[32:33], v[32:33], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[34:35], v[34:35], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[16:17], v[24:25], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v16, 0, v72
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v76, v83
+  ; GCN-NEXT:    v_fma_f32 v88, s4, v28, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v89, v20
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v90, v84
+  ; GCN-NEXT:    v_fma_f32 v91, s4, v29, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v92, v21
+  ; GCN-NEXT:    ds_read_b128 v[20:23], v87 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_pk_mul_f32 v[48:49], v[48:49], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[50:51], v[50:51], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[52:53], v[52:53], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[54:55], v[54:55], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[56:57], v[56:57], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[58:59], v[58:59], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[60:61], v[60:61], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[62:63], v[62:63], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[32:33], v[32:33], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[34:35], v[34:35], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[36:37], v[36:37], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[38:39], v[38:39], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[40:41], v[40:41], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[42:43], v[42:43], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[44:45], v[44:45], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[46:47], v[46:47], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v18, 0, v73
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v89, v83
-  ; GCN-NEXT:    v_fma_f32 v73, s4, v28, -v72
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v80
-  ; GCN-NEXT:    v_fma_f32 v1, s4, v1, -v72
-  ; GCN-NEXT:    v_perm_b32 v90, v69, v65, s2
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v17, v18
-  ; GCN-NEXT:    v_mul_f32_e32 v18, 0x3fb8aa3b, v26
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v86, v81
-  ; GCN-NEXT:    v_fma_f32 v23, s4, v30, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v30, v18
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v82
-  ; GCN-NEXT:    v_fma_f32 v18, s4, v31, -v72
-  ; GCN-NEXT:    v_perm_b32 v31, v68, v64, s2
-  ; GCN-NEXT:    v_perm_b32 v64, v68, v64, s3
-  ; GCN-NEXT:    v_perm_b32 v65, v69, v65, s3
-  ; GCN-NEXT:    ds_read_b128 v[26:29], v91
+  ; GCN-NEXT:    v_pk_mul_f32 v[36:37], v[36:37], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[38:39], v[38:39], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[40:41], v[40:41], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[42:43], v[42:43], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[44:45], v[44:45], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[46:47], v[46:47], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_perm_b32 v99, v70, v68, s2
+  ; GCN-NEXT:    v_perm_b32 v100, v70, v68, s3
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[24:25], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v93, v73, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v26
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v94, v85
+  ; GCN-NEXT:    v_fma_f32 v95, s4, v30, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v96, v16
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v97, v86
+  ; GCN-NEXT:    v_fma_f32 v98, s4, v31, -v65
+  ; GCN-NEXT:    v_perm_b32 v101, v71, v69, s2
+  ; GCN-NEXT:    v_perm_b32 v102, v71, v69, s3
+  ; GCN-NEXT:    ds_read_b128 v[68:71], v103
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[68:71], v91 offset:576
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v103 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
-  ; GCN-NEXT:    ds_write_b32 v76, v31
-  ; GCN-NEXT:    v_mul_f32_e32 v31, 0x3fb8aa3b, v67
-  ; GCN-NEXT:    v_exp_f32_e32 v31, v31
-  ; GCN-NEXT:    v_mul_f32_e32 v67, 0x3fb8aa3b, v18
-  ; GCN-NEXT:    v_pack_b32_f16 v18, v19, v86
-  ; GCN-NEXT:    v_pack_b32_f16 v19, v22, v89
+  ; GCN-NEXT:    ds_write_b32 v77, v99
+  ; GCN-NEXT:    v_exp_f32_e32 v67, v67
+  ; GCN-NEXT:    v_pack_b32_f16 v76, v76, v90
+  ; GCN-NEXT:    v_pack_b32_f16 v77, v94, v97
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v77, v64
+  ; GCN-NEXT:    ds_write_b32 v78, v100
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v78, v90
+  ; GCN-NEXT:    ds_write_b32 v79, v101
+  ; GCN-NEXT:    v_mul_f32_e32 v78, 0x3fb8aa3b, v88
+  ; GCN-NEXT:    v_mul_f32_e32 v79, 0x3fb8aa3b, v91
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[76:77], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v81, v81, v93
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v90, v89
+  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v91, v78
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v78, v92
+  ; GCN-NEXT:    v_fma_f32 v1, s4, v1, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v93, v79
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[76:77], v[32:47]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v79, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v64, 0x3fb8aa3b, v73
-  ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v87
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v74, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v85
-  ; GCN-NEXT:    v_fma_f32 v2, s4, v2, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v22, v64
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v88
-  ; GCN-NEXT:    v_exp_f32_e32 v64, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v75, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v30
-  ; GCN-NEXT:    v_fma_f32 v24, s4, v3, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v23, v23
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v31
+  ; GCN-NEXT:    ds_write_b32 v80, v102
+  ; GCN-NEXT:    v_mul_f32_e32 v80, 0x3fb8aa3b, v95
+  ; GCN-NEXT:    v_add_f32_e32 v76, v82, v81
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v77, v96
+  ; GCN-NEXT:    v_fma_f32 v2, s4, v2, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v80, v80
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v79, v67
+  ; GCN-NEXT:    v_mul_f32_e32 v88, 0x3fb8aa3b, v98
+  ; GCN-NEXT:    v_fma_f32 v81, s4, v3, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v82, v88
   ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v0
-  ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v1
-  ; GCN-NEXT:    v_pack_b32_f16 v0, v20, v21
-  ; GCN-NEXT:    v_pack_b32_f16 v1, v18, v19
-  ; GCN-NEXT:    v_fma_f32 v6, s4, v6, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v25, v67
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[26:27], v[0:1], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v80, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v22
-  ; GCN-NEXT:    v_fma_f32 v26, s4, v4, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v27, v3
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v64
-  ; GCN-NEXT:    v_fma_f32 v67, s4, v5, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v65, v65
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[0:1], v[32:47]
+  ; GCN-NEXT:    v_mul_f32_e32 v88, 0x3fb8aa3b, v1
+  ; GCN-NEXT:    v_pack_b32_f16 v0, v90, v78
+  ; GCN-NEXT:    v_pack_b32_f16 v1, v77, v79
   ; GCN-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-  ; GCN-NEXT:    v_add_f32_e32 v17, v81, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v23
-  ; GCN-NEXT:    v_fma_f32 v7, s4, v7, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v68, v2
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v25
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v24
+  ; GCN-NEXT:    ; implicit-def: $sgpr2
+  ; GCN-NEXT:    s_nop 0
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[68:69], v[0:1], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v68, v83, v76
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v91
+  ; GCN-NEXT:    v_fma_f32 v83, s4, v4, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v90, v3
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v93
+  ; GCN-NEXT:    v_fma_f32 v94, s4, v5, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v88, v88
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[0:1], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v68, v84, v68
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v80
+  ; GCN-NEXT:    v_fma_f32 v6, s4, v6, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v72, v2
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v73, v82
+  ; GCN-NEXT:    v_pack_b32_f16 v4, v69, v4
+  ; GCN-NEXT:    v_mul_f32_e32 v69, 0x3fb8aa3b, v81
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v84
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v87
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_pack_b32_f16 v4, v18, v4
-  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v19
-  ; GCN-NEXT:    v_exp_f32_e32 v24, v24
-  ; GCN-NEXT:    ds_read_b128 v[18:21], v84 offset:576
+  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v73
+  ; GCN-NEXT:    v_fma_f32 v7, s4, v7, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v73, v69
+  ; GCN-NEXT:    ds_read_b128 v[76:79], v87 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mul_f32_e32 v26, 0x3fb8aa3b, v26
-  ; GCN-NEXT:    v_mul_f32_e32 v67, 0x3fb8aa3b, v67
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[4:5], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v82, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v27
-  ; GCN-NEXT:    v_exp_f32_e32 v26, v26
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v29, v65
-  ; GCN-NEXT:    v_fma_f32 v10, s4, v10, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v67, v67
+  ; GCN-NEXT:    v_mul_f32_e32 v69, 0x3fb8aa3b, v83
+  ; GCN-NEXT:    v_mul_f32_e32 v81, 0x3fb8aa3b, v94
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[70:71], v[4:5], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v68, v85, v68
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v70, v90
+  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v71, v69
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v88
+  ; GCN-NEXT:    v_fma_f32 v9, s4, v9, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v81, v81
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[4:5], v[32:47]
   ; GCN-NEXT:    v_mul_f32_e32 v6, 0x3fb8aa3b, v6
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[4:5], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v83, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v68
-  ; GCN-NEXT:    v_exp_f32_e32 v6, v6
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v24
+  ; GCN-NEXT:    v_add_f32_e32 v68, v86, v68
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v72
+  ; GCN-NEXT:    v_fma_f32 v10, s4, v10, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v74, v6
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v73
   ; GCN-NEXT:    v_mul_f32_e32 v7, 0x3fb8aa3b, v7
-  ; GCN-NEXT:    v_exp_f32_e32 v7, v7
-  ; GCN-NEXT:    v_pack_b32_f16 v4, v28, v29
-  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v69
-  ; GCN-NEXT:    ; implicit-def: $sgpr2
-  ; GCN-NEXT:    s_nop 1
+  ; GCN-NEXT:    v_fma_f32 v75, s4, v11, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v83, v7
+  ; GCN-NEXT:    v_pack_b32_f16 v4, v70, v69
+  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v6
+  ; GCN-NEXT:    v_mul_f32_e32 v7, 0x3fb8aa3b, v8
+  ; GCN-NEXT:    v_mul_f32_e32 v8, 0x3fb8aa3b, v9
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[4:5], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v0, v85, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v26
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v67
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[18:19], v[4:5], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v4, v88, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v89, v68
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v68, v71
+  ; GCN-NEXT:    v_fma_f32 v70, s4, v12, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v84, v7
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v85, v81
+  ; GCN-NEXT:    v_fma_f32 v86, s4, v13, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v87, v8
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[76:77], v[4:5], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v76, v92, v0
   ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v10
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v6
-  ; GCN-NEXT:    v_exp_f32_e32 v10, v0
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v7
-  ; GCN-NEXT:    v_pack_b32_f16 v1, v1, v0
-  ; GCN-NEXT:    v_pack_b32_f16 v0, v17, v28
-  ; GCN-NEXT:    s_nop 1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v2, v30, v4
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[0:1], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v0, v31, v2
-  ; GCN-NEXT:    v_add_f32_e32 v0, v22, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v64, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v23, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v25, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v27, v0
-  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v72
-  ; GCN-NEXT:    v_add_f32_e32 v0, v65, v0
-  ; GCN-NEXT:    v_fma_f32 v9, s4, v9, -v72
-  ; GCN-NEXT:    v_mul_f32_e32 v8, 0x3fb8aa3b, v8
-  ; GCN-NEXT:    v_add_f32_e32 v0, v68, v0
-  ; GCN-NEXT:    v_fma_f32 v11, s4, v11, -v72
-  ; GCN-NEXT:    v_mul_f32_e32 v9, 0x3fb8aa3b, v9
-  ; GCN-NEXT:    v_fma_f32 v12, s4, v12, -v72
-  ; GCN-NEXT:    v_fma_f32 v13, s4, v13, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v8, v8
-  ; GCN-NEXT:    v_add_f32_e32 v0, v24, v0
-  ; GCN-NEXT:    v_fma_f32 v5, s4, v14, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v9, v9
-  ; GCN-NEXT:    v_add_f32_e32 v0, v26, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v67, v0
-  ; GCN-NEXT:    v_fma_f32 v14, s4, v15, -v72
-  ; GCN-NEXT:    v_mul_f32_e32 v11, 0x3fb8aa3b, v11
-  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v12
-  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v5
-  ; GCN-NEXT:    v_add_f32_e32 v0, v6, v0
-  ; GCN-NEXT:    v_exp_f32_e32 v11, v11
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v8
-  ; GCN-NEXT:    v_exp_f32_e32 v12, v3
-  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v13
-  ; GCN-NEXT:    v_exp_f32_e32 v17, v1
-  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v14
-  ; GCN-NEXT:    v_add_f32_e32 v0, v7, v0
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v9
-  ; GCN-NEXT:    v_exp_f32_e32 v15, v3
-  ; GCN-NEXT:    v_exp_f32_e32 v18, v1
-  ; GCN-NEXT:    v_add_f32_e32 v6, v8, v0
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v91
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v74
+  ; GCN-NEXT:    v_fma_f32 v77, s4, v14, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v89, v0
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v92, v83
+  ; GCN-NEXT:    v_pack_b32_f16 v68, v68, v85
+  ; GCN-NEXT:    v_mul_f32_e32 v75, 0x3fb8aa3b, v75
+  ; GCN-NEXT:    v_mul_f32_e32 v70, 0x3fb8aa3b, v70
+  ; GCN-NEXT:    v_pack_b32_f16 v69, v69, v92
+  ; GCN-NEXT:    v_fma_f32 v65, s4, v15, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v75, v75
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[68:69], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v76, v96, v76
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v85, v84
+  ; GCN-NEXT:    v_exp_f32_e32 v92, v70
+  ; GCN-NEXT:    v_mul_f32_e32 v70, 0x3fb8aa3b, v86
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v86, v87
+  ; GCN-NEXT:    v_exp_f32_e32 v94, v70
+  ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v65
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[78:79], v[68:69], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v67, v67, v76
+  ; GCN-NEXT:    v_add_f32_e32 v67, v91, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v93, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v80, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v82, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v90, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v88, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v72, v67
+  ; GCN-NEXT:    v_mul_f32_e32 v68, 0x3fb8aa3b, v77
+  ; GCN-NEXT:    v_add_f32_e32 v67, v73, v67
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v76, v89
+  ; GCN-NEXT:    v_exp_f32_e32 v78, v68
+  ; GCN-NEXT:    v_add_f32_e32 v67, v71, v67
+  ; GCN-NEXT:    ds_read_b128 v[68:71], v103
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v10
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v11
-  ; GCN-NEXT:    v_add_f32_e32 v6, v9, v6
-  ; GCN-NEXT:    v_pack_b32_f16 v8, v4, v13
-  ; GCN-NEXT:    v_add_f32_e32 v6, v10, v6
-  ; GCN-NEXT:    v_pack_b32_f16 v9, v5, v14
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v18
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v15
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v12
-  ; GCN-NEXT:    v_add_f32_e32 v6, v11, v6
-  ; GCN-NEXT:    v_add_f32_e32 v6, v12, v6
-  ; GCN-NEXT:    v_add_f32_e32 v1, v15, v6
-  ; GCN-NEXT:    v_add_f32_e32 v11, v17, v1
-  ; GCN-NEXT:    v_pack_b32_f16 v1, v0, v7
-  ; GCN-NEXT:    v_pack_b32_f16 v0, v4, v10
-  ; GCN-NEXT:    ds_read_b128 v[4:7], v91 offset:576
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v77, v75
+  ; GCN-NEXT:    v_exp_f32_e32 v65, v65
+  ; GCN-NEXT:    v_add_f32_e32 v67, v81, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v74, v67
+  ; GCN-NEXT:    v_pack_b32_f16 v77, v76, v77
+  ; GCN-NEXT:    v_pack_b32_f16 v76, v85, v86
+  ; GCN-NEXT:    v_add_f32_e32 v67, v83, v67
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v72, v65
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v73, v94
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[68:69], v[76:77], v[48:63]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v68, v78
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v74, v92
+  ; GCN-NEXT:    v_add_f32_e32 v67, v84, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v87, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v89, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v75, v67
+  ; GCN-NEXT:    v_pack_b32_f16 v69, v68, v72
+  ; GCN-NEXT:    v_pack_b32_f16 v68, v74, v73
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v103 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v67, v92, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v94, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v78, v67
+  ; GCN-NEXT:    v_add_f32_e32 v65, v65, v67
+  ; GCN-NEXT:    ds_bpermute_b32 v67, v66, v65
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    v_add_f32_e32 v65, v65, v67
+  ; GCN-NEXT:    ds_bpermute_b32 v66, v66, v65
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_mov_b32_e32 v4, 0
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v2, v18, v11
-  ; GCN-NEXT:    ds_bpermute_b32 v3, v66, v2
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_add_f32_e32 v2, v2, v3
-  ; GCN-NEXT:    ds_bpermute_b32 v3, v66, v2
+  ; GCN-NEXT:    v_mov_b32_e32 v67, 0
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[0:1]
-  ; GCN-NEXT:    v_fmac_f32_e32 v2, v4, v16
+  ; GCN-NEXT:    v_cndmask_b32_e64 v65, v66, v65, s[0:1]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[70:71], v[68:69], v[48:63]
+  ; GCN-NEXT:    v_fmac_f32_e32 v65, v67, v64
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[68:69], v[32:47]
   ; GCN-NEXT:    s_endpgm
   attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index ed3d1399e5926..17692a38dfc64 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -427,37 +427,37 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0
 ; GFX90A-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k:
 ; GFX90A-VGPR:       ; %bb.0: ; %bb
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, 0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, 1
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v5
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 2
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 1
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v9
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v8, 2
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    v_mfma_f32_4x4x4bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT:    v_mfma_f32_4x4x4bf16_1k v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX90A-VGPR-NEXT:    s_nop 4
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v5
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 2
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_bf16 v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 4
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -647,37 +647,37 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
 ; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k:
 ; GFX90A-VGPR:       ; %bb.0: ; %bb
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, 0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, 1
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v5
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 2
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 1
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v9
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v8, 2
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    v_mfma_f32_16x16x16bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT:    v_mfma_f32_16x16x16bf16_1k v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX90A-VGPR-NEXT:    s_nop 10
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v5
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 2
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_bf16 v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 6
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -1298,26 +1298,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v1, 64
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[16:17], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v1
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v1
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v1
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[18:19], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[12:13], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
-; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[8:15], v[16:17], v[18:19], v[8:15]
+; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[8:15], v[16:17], v[18:19], v[8:15] cbsz:1 abid:2 blgp:3
 ; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits:
@@ -1326,26 +1326,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 64
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[16:17], s[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v1
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], s[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[18:19], s[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
-; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[8:15], v[16:17], v[18:19], v[8:15]
+; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[8:15], v[16:17], v[18:19], v[8:15] cbsz:1 abid:2 neg:[1,1,0]
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (i64 274877906944 to double)), i32 0, i32 0, i32 0)
@@ -1645,8 +1645,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
 ; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_imm:
@@ -1673,8 +1673,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0)
@@ -1759,8 +1759,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
 ; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
@@ -1787,8 +1787,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 123.0, double 123.0, double 123.0, double 123.0>, i32 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
index 2fb677eccc4b3..07a4f33f25b17 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
@@ -2460,6 +2460,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
 ; GFX942-SDAG:       ; %bb.0: ; %bb
 ; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX942-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -2480,12 +2481,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT:    s_nop 9
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-SDAG-NEXT:    s_nop 10
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
 ; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_i8:
@@ -2525,6 +2525,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
 ; GFX950-SDAG:       ; %bb.0: ; %bb
 ; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX950-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -2545,12 +2546,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
 ; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT:    s_nop 10
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-SDAG-NEXT:    s_nop 11
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_i8:
@@ -3607,6 +3607,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
 ; GFX942-SDAG:       ; %bb.0: ; %bb
 ; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX942-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -3627,12 +3628,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT:    s_nop 9
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-SDAG-NEXT:    s_nop 10
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
 ; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
@@ -3672,6 +3672,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
 ; GFX950-SDAG:       ; %bb.0: ; %bb
 ; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX950-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -3692,12 +3693,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
 ; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT:    s_nop 10
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-SDAG-NEXT:    s_nop 11
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
@@ -3910,6 +3910,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
 ; GFX942-SDAG:       ; %bb.0: ; %bb
 ; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX942-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -3930,12 +3931,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT:    s_nop 9
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-SDAG-NEXT:    s_nop 10
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
 ; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
@@ -3975,6 +3975,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
 ; GFX950-SDAG:       ; %bb.0: ; %bb
 ; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX950-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -3995,12 +3996,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
 ; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT:    s_nop 10
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-SDAG-NEXT:    s_nop 11
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
@@ -4213,6 +4213,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
 ; GFX942-SDAG:       ; %bb.0: ; %bb
 ; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX942-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -4233,12 +4234,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT:    s_nop 9
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-SDAG-NEXT:    s_nop 10
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
 ; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
@@ -4278,6 +4278,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
 ; GFX950-SDAG:       ; %bb.0: ; %bb
 ; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX950-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -4298,12 +4299,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
 ; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT:    s_nop 10
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-SDAG-NEXT:    s_nop 11
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
@@ -4516,6 +4516,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
 ; GFX942-SDAG:       ; %bb.0: ; %bb
 ; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX942-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -4536,12 +4537,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT:    s_nop 9
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-SDAG-NEXT:    s_nop 10
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
 ; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
@@ -4581,6 +4581,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
 ; GFX950-SDAG:       ; %bb.0: ; %bb
 ; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX950-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -4601,12 +4602,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
 ; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT:    s_nop 10
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-SDAG-NEXT:    s_nop 11
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index ab0000f6831b6..eefd7b5fea63e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -3182,18 +3182,16 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31]
-; VGPRRC-NEXT:    s_nop 11
+; VGPRRC-NEXT:    v_mov_b64_e32 v[36:37], 16
+; VGPRRC-NEXT:    v_mov_b64_e32 v[38:39], 0
+; VGPRRC-NEXT:    s_nop 9
 ; VGPRRC-NEXT:    global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], 16
-; VGPRRC-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[4:7], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], 0
-; VGPRRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s16
@@ -3214,14 +3212,14 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s9
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s10
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s12
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s13
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s14
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_endpgm
 ; AGPR-LABEL: test_mfma_i32_32x32x32_i8:
@@ -3594,18 +3592,16 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31] cbsz:2 abid:3 blgp:1
-; VGPRRC-NEXT:    s_nop 11
+; VGPRRC-NEXT:    v_mov_b64_e32 v[36:37], 16
+; VGPRRC-NEXT:    v_mov_b64_e32 v[38:39], 0
+; VGPRRC-NEXT:    s_nop 9
 ; VGPRRC-NEXT:    global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], 16
-; VGPRRC-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[4:7], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], 0
-; VGPRRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s16
@@ -3626,14 +3622,14 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s9
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s10
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s12
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s13
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s14
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_endpgm
 ; AGPR-LABEL: test_mfma_i32_32x32x32_i8__flags:
@@ -4146,33 +4142,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
-; SDAG-NEXT:    s_nop 6
-; SDAG-NEXT:    v_mov_b32_e32 v16, s20
-; SDAG-NEXT:    v_mov_b32_e32 v17, s21
-; SDAG-NEXT:    v_mov_b32_e32 v18, s22
-; SDAG-NEXT:    v_mov_b32_e32 v19, s23
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s20
+; SDAG-NEXT:    v_mov_b32_e32 v33, s21
+; SDAG-NEXT:    v_mov_b32_e32 v34, s22
+; SDAG-NEXT:    v_mov_b32_e32 v35, s23
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s16
-; SDAG-NEXT:    v_mov_b32_e32 v17, s17
-; SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s12
+; SDAG-NEXT:    v_mov_b32_e32 v33, s13
+; SDAG-NEXT:    v_mov_b32_e32 v34, s14
+; SDAG-NEXT:    v_mov_b32_e32 v35, s15
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s8
+; SDAG-NEXT:    v_mov_b32_e32 v33, s9
+; SDAG-NEXT:    v_mov_b32_e32 v34, s10
+; SDAG-NEXT:    v_mov_b32_e32 v35, s11
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -4256,33 +4251,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
 ; HEURRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; HEURRC-NEXT:    s_nop 1
 ; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
-; HEURRC-NEXT:    s_nop 6
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s20
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s21
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s22
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s23
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s20
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s21
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s22
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s23
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s16
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s17
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s18
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s19
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s16
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s17
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s18
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s19
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s12
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s13
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s14
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s15
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s12
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s13
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s14
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s15
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s8
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s9
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s10
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s11
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s8
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s9
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s10
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s11
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
@@ -4320,33 +4314,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
-; VGPRRC-NEXT:    s_nop 6
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s20
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s21
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s22
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s23
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s20
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s21
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s22
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s23
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s16
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s17
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s18
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s16
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s17
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s18
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s19
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s12
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s13
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s14
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s12
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s13
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s14
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s15
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s8
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s9
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s10
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s8
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s9
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s10
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s11
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
@@ -4523,33 +4516,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; SDAG-NEXT:    s_nop 6
-; SDAG-NEXT:    v_mov_b32_e32 v16, s20
-; SDAG-NEXT:    v_mov_b32_e32 v17, s21
-; SDAG-NEXT:    v_mov_b32_e32 v18, s22
-; SDAG-NEXT:    v_mov_b32_e32 v19, s23
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s20
+; SDAG-NEXT:    v_mov_b32_e32 v33, s21
+; SDAG-NEXT:    v_mov_b32_e32 v34, s22
+; SDAG-NEXT:    v_mov_b32_e32 v35, s23
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s16
-; SDAG-NEXT:    v_mov_b32_e32 v17, s17
-; SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s12
+; SDAG-NEXT:    v_mov_b32_e32 v33, s13
+; SDAG-NEXT:    v_mov_b32_e32 v34, s14
+; SDAG-NEXT:    v_mov_b32_e32 v35, s15
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s8
+; SDAG-NEXT:    v_mov_b32_e32 v33, s9
+; SDAG-NEXT:    v_mov_b32_e32 v34, s10
+; SDAG-NEXT:    v_mov_b32_e32 v35, s11
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -4633,33 +4625,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
 ; HEURRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; HEURRC-NEXT:    s_nop 1
 ; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; HEURRC-NEXT:    s_nop 6
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s20
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s21
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s22
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s23
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s20
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s21
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s22
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s23
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s16
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s17
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s18
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s19
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s16
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s17
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s18
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s19
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s12
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s13
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s14
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s15
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s12
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s13
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s14
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s15
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s8
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s9
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s10
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s11
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s8
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s9
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s10
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s11
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
@@ -4697,33 +4688,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; VGPRRC-NEXT:    s_nop 6
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s20
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s21
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s22
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s23
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s20
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s21
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s22
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s23
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s16
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s17
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s18
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s16
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s17
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s18
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s19
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s12
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s13
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s14
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s12
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s13
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s14
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s15
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s8
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s9
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s10
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s8
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s9
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s10
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s11
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index 7e30af96bb8b9..aa670dce4e6f9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -799,17 +799,17 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 {
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 3
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -1155,17 +1155,17 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x4_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x4_f32 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 9
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -2005,21 +2005,21 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr add
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4f16:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, s4
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, s5
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, s5
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, s6
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, s7
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, s6
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, s7
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_f16 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 4
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -2395,21 +2395,21 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr
 ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16f16:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, s4
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, s5
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, s5
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, s6
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, s7
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, s6
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, s7
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_f16 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 6
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -3304,17 +3304,17 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 {
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_i32_4x4x4_16b_i8 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_i32_4x4x4_16b_i8 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 4
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x i32>, ptr addrspace(1) %arg
@@ -3494,19 +3494,19 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_k_src2_1(ptr addrspace(1)
 ;
 ; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
 ; GFX942-VGPR:       ; %bb.0:
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0x41
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_i32_4x4x4_16b_i8 v[0:3], v5, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_i32_4x4x4_16b_i8 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 3
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
   %in.1 = load <4 x i32>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> splat (i32 65), i32 1, i32 2, i32 3)
@@ -4309,7 +4309,7 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) %
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4318,9 +4318,9 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) %
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3]
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3]
 ; GFX942-VGPR-NEXT:    s_nop 3
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v6, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -5017,12 +5017,12 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(ptr addrspace(1) %arg) #0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[0:3]
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v0, v1, v[0:3]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 2
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 2.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
@@ -5542,6 +5542,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v30, v1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v31, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v1
@@ -5570,39 +5572,37 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v27, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v28, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v29, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v30, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v31, v1
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[32:33], v[30:31]
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v34, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[30:31], v[28:29]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[28:29], v[26:27]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[26:27], v[24:25]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[24:25], v[22:23]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[22:23], v[20:21]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[20:21], v[18:19]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[18:19], v[16:17]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[16:17], v[14:15]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], v[12:13]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], v[10:11]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], v[8:9]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[62:63], v[30:31]
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v64, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[60:61], v[28:29]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[58:59], v[26:27]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[56:57], v[24:25]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[54:55], v[22:23]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[52:53], v[20:21]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[50:51], v[18:19]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[48:49], v[16:17]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[46:47], v[14:15]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[44:45], v[12:13]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[42:43], v[10:11]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[40:41], v[8:9]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[38:39], v[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[36:37], v[4:5]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[34:35], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[32:33], v[0:1]
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[2:33], v0, v34, v[2:33]
+; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[32:63], v0, v64, v[32:63]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[30:33], s[0:1] offset:112
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[26:29], s[0:1] offset:96
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[22:25], s[0:1] offset:80
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[18:21], s[0:1] offset:64
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[14:17], s[0:1] offset:48
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[10:13], s[0:1] offset:32
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[60:63], s[0:1] offset:112
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[56:59], s[0:1] offset:96
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[52:55], s[0:1] offset:80
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[48:51], s[0:1] offset:64
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[44:47], s[0:1] offset:48
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[40:43], s[0:1] offset:32
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[36:39], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[32:35], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
@@ -5695,20 +5695,20 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(ptr addrspace(1) %ar
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 1.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-VGPR-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; GFX942-VGPR-NEXT:    v_lshlrev_b32_e32 v8, 4, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0x42f60000
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3]
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 2
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -5804,19 +5804,19 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(ptr addrspa
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 1.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0x42f60000
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3]
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 2
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index f0205a3a788ed..a8d2f64c3c4d9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -5093,43 +5093,42 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
-; SDAG-NEXT:    s_nop 14
-; SDAG-NEXT:    v_mov_b32_e32 v16, s20
-; SDAG-NEXT:    v_mov_b32_e32 v17, s21
-; SDAG-NEXT:    v_mov_b32_e32 v18, s22
-; SDAG-NEXT:    v_mov_b32_e32 v19, s23
-; SDAG-NEXT:    v_mov_b64_e32 v[20:21], 48
-; SDAG-NEXT:    global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s20
+; SDAG-NEXT:    v_mov_b32_e32 v33, s21
+; SDAG-NEXT:    v_mov_b32_e32 v34, s22
+; SDAG-NEXT:    v_mov_b32_e32 v35, s23
+; SDAG-NEXT:    v_mov_b64_e32 v[36:37], 48
+; SDAG-NEXT:    global_store_dwordx4 v[36:37], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[22:23], 32
-; SDAG-NEXT:    v_mov_b64_e32 v[24:25], 16
-; SDAG-NEXT:    v_mov_b32_e32 v16, s16
-; SDAG-NEXT:    v_mov_b32_e32 v17, s17
-; SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
+; SDAG-NEXT:    v_mov_b64_e32 v[38:39], 32
+; SDAG-NEXT:    v_mov_b64_e32 v[40:41], 16
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
+; SDAG-NEXT:    global_store_dwordx4 v[38:39], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[26:27], 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
+; SDAG-NEXT:    v_mov_b64_e32 v[42:43], 0
+; SDAG-NEXT:    v_mov_b32_e32 v32, s12
+; SDAG-NEXT:    v_mov_b32_e32 v33, s13
+; SDAG-NEXT:    v_mov_b32_e32 v34, s14
+; SDAG-NEXT:    v_mov_b32_e32 v35, s15
+; SDAG-NEXT:    global_store_dwordx4 v[40:41], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s8
+; SDAG-NEXT:    v_mov_b32_e32 v33, s9
+; SDAG-NEXT:    v_mov_b32_e32 v34, s10
+; SDAG-NEXT:    v_mov_b32_e32 v35, s11
+; SDAG-NEXT:    global_store_dwordx4 v[42:43], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[38:39], v[8:11], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[36:37], v[12:15], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[40:41], v[4:7], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -5137,6 +5136,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x0
 ; GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GISEL-NEXT:    v_mov_b64_e32 v[48:49], 0
+; GISEL-NEXT:    v_mov_b64_e32 v[50:51], 16
+; GISEL-NEXT:    v_mov_b64_e32 v[52:53], 32
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[36:37]
 ; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[38:39]
@@ -5154,28 +5156,33 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
 ; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
 ; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
 ; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
-; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mov_b64_e32 v[54:55], 48
+; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
-; GISEL-NEXT:    v_mov_b64_e32 v[32:33], 0
-; GISEL-NEXT:    v_mov_b64_e32 v[34:35], 16
-; GISEL-NEXT:    v_mov_b64_e32 v[36:37], 32
-; GISEL-NEXT:    v_mov_b64_e32 v[38:39], 48
-; GISEL-NEXT:    global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
+; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[10:11]
+; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[8:9]
+; GISEL-NEXT:    v_mov_b64_e32 v[38:39], s[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[42:43], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[46:47], s[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[40:41], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[44:45], s[20:21]
+; GISEL-NEXT:    global_store_dwordx4 v[48:49], v[32:35], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[50:51], v[36:39], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[52:53], v[40:43], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[54:55], v[44:47], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[50:51], v[4:7], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[52:53], v[8:11], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[54:55], v[12:15], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_endpgm
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0)
@@ -5190,71 +5197,71 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
 ; SDAG-NEXT:    s_load_dwordx16 s[12:27], s[4:5], 0x0
 ; SDAG-NEXT:    v_mov_b32_e32 v32, 42
 ; SDAG-NEXT:    v_mov_b32_e32 v33, 25
+; SDAG-NEXT:    v_mov_b64_e32 v[36:37], 48
+; SDAG-NEXT:    v_mov_b64_e32 v[38:39], 32
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    v_mov_b32_e32 v20, s16
-; SDAG-NEXT:    v_mov_b32_e32 v21, s17
-; SDAG-NEXT:    v_mov_b32_e32 v22, s18
-; SDAG-NEXT:    v_mov_b32_e32 v23, s19
-; SDAG-NEXT:    v_mov_b32_e32 v24, s20
-; SDAG-NEXT:    v_mov_b32_e32 v25, s21
-; SDAG-NEXT:    v_mov_b32_e32 v26, s22
-; SDAG-NEXT:    v_mov_b32_e32 v27, s23
+; SDAG-NEXT:    v_mov_b32_e32 v0, s12
+; SDAG-NEXT:    v_mov_b32_e32 v1, s13
+; SDAG-NEXT:    v_mov_b32_e32 v2, s14
+; SDAG-NEXT:    v_mov_b32_e32 v3, s15
+; SDAG-NEXT:    v_mov_b32_e32 v4, s16
+; SDAG-NEXT:    v_mov_b32_e32 v5, s17
+; SDAG-NEXT:    v_mov_b32_e32 v6, s18
+; SDAG-NEXT:    v_mov_b32_e32 v7, s19
+; SDAG-NEXT:    v_mov_b32_e32 v8, s20
+; SDAG-NEXT:    v_mov_b32_e32 v9, s21
+; SDAG-NEXT:    v_mov_b32_e32 v10, s22
+; SDAG-NEXT:    v_mov_b32_e32 v11, s23
 ; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
-; SDAG-NEXT:    v_mov_b32_e32 v28, s24
-; SDAG-NEXT:    v_mov_b32_e32 v29, s25
-; SDAG-NEXT:    v_mov_b32_e32 v30, s26
-; SDAG-NEXT:    v_mov_b32_e32 v31, s27
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
+; SDAG-NEXT:    v_mov_b32_e32 v13, s25
+; SDAG-NEXT:    v_mov_b32_e32 v14, s26
+; SDAG-NEXT:    v_mov_b32_e32 v15, s27
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[16:17]
-; SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[18:19]
-; SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[20:21]
-; SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[22:23]
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel_hi:[0,0,0] blgp:2
-; SDAG-NEXT:    v_mov_b32_e32 v16, s20
-; SDAG-NEXT:    v_mov_b32_e32 v17, s21
-; SDAG-NEXT:    v_mov_b32_e32 v18, s22
-; SDAG-NEXT:    v_mov_b32_e32 v19, s23
-; SDAG-NEXT:    v_mov_b64_e32 v[20:21], 48
-; SDAG-NEXT:    global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
+; SDAG-NEXT:    v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT:    v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT:    v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT:    v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
+; SDAG-NEXT:    v_mov_b32_e32 v34, s22
+; SDAG-NEXT:    v_mov_b32_e32 v35, s23
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT:    v_mov_b32_e32 v32, s20
+; SDAG-NEXT:    v_mov_b32_e32 v33, s21
+; SDAG-NEXT:    global_store_dwordx4 v[36:37], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[22:23], 32
-; SDAG-NEXT:    v_mov_b64_e32 v[24:25], 16
-; SDAG-NEXT:    v_mov_b32_e32 v16, s16
-; SDAG-NEXT:    v_mov_b32_e32 v17, s17
-; SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
+; SDAG-NEXT:    v_mov_b64_e32 v[40:41], 16
+; SDAG-NEXT:    v_mov_b64_e32 v[42:43], 0
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
+; SDAG-NEXT:    global_store_dwordx4 v[38:39], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[26:27], 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    v_mov_b32_e32 v32, s12
+; SDAG-NEXT:    v_mov_b32_e32 v33, s13
+; SDAG-NEXT:    v_mov_b32_e32 v34, s14
+; SDAG-NEXT:    v_mov_b32_e32 v35, s15
+; SDAG-NEXT:    global_store_dwordx4 v[40:41], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s8
+; SDAG-NEXT:    v_mov_b32_e32 v33, s9
+; SDAG-NEXT:    v_mov_b32_e32 v34, s10
+; SDAG-NEXT:    v_mov_b32_e32 v35, s11
+; SDAG-NEXT:    global_store_dwordx4 v[42:43], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[38:39], v[8:11], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[36:37], v[12:15], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[40:41], v[4:7], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -5264,52 +5271,52 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
 ; GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
 ; GISEL-NEXT:    v_mov_b32_e32 v32, 25
 ; GISEL-NEXT:    v_mov_b32_e32 v33, 42
-; GISEL-NEXT:    v_mov_b64_e32 v[34:35], 16
+; GISEL-NEXT:    v_mov_b64_e32 v[48:49], 0
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[36:37]
-; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[38:39]
-; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[40:41]
-; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[42:43]
-; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[44:45]
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[46:47]
-; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[48:49]
-; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[50:51]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[36:37], 32
-; GISEL-NEXT:    v_mov_b64_e32 v[38:39], 48
-; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel_hi:[0,0,0] blgp:2
-; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
-; GISEL-NEXT:    v_mov_b64_e32 v[32:33], 0
-; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
-; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[36:37]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[38:39]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[40:41]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[42:43]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[44:45]
 ; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[46:47]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[48:49]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[50:51]
 ; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
-; GISEL-NEXT:    global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
+; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT:    v_mov_b64_e32 v[38:39], s[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[42:43], s[18:19]
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[10:11]
+; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[8:9]
+; GISEL-NEXT:    v_mov_b64_e32 v[46:47], s[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[50:51], 16
+; GISEL-NEXT:    v_mov_b64_e32 v[52:53], 32
+; GISEL-NEXT:    v_mov_b64_e32 v[54:55], 48
+; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[40:41], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[44:45], s[20:21]
+; GISEL-NEXT:    global_store_dwordx4 v[48:49], v[32:35], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[50:51], v[36:39], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[52:53], v[40:43], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[54:55], v[44:47], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 2
-; GISEL-NEXT:    global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[50:51], v[4:7], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[52:53], v[8:11], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[54:55], v[12:15], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_endpgm
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
index 5475fa2ae5c6e..ef3bb0cb5f4f1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
@@ -71,9 +71,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32_vgprcd(ptr addrspace(1) %ar
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-SDAG-NEXT:    s_nop 1
-; GFX942-SDAG-NEXT:    v_mfma_f32_16x16x8_xf32 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT:    v_mfma_f32_16x16x8_xf32 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-SDAG-NEXT:    s_nop 6
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v8, v[0:3], s[6:7]
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x8xf32_vgprcd:
@@ -87,14 +87,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32_vgprcd(ptr addrspace(1) %ar
 ; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-GISEL-NEXT:    s_mov_b32 s5, 4.0
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-GISEL-NEXT:    s_nop 1
-; GFX942-GISEL-NEXT:    v_mfma_f32_16x16x8_xf32 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-GISEL-NEXT:    s_nop 5
-; GFX942-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX942-GISEL-NEXT:    v_mfma_f32_16x16x8_xf32 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT:    s_nop 6
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX942-GISEL-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 0c1448a0b8fb6..da46ade4401f2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -245,24 +245,41 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half>
 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16:
@@ -307,24 +324,41 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16
 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
@@ -369,24 +403,41 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16
 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
@@ -672,24 +723,41 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfl
 ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28
 ; GCN-NEXT:    s_nop 11
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    v_mov_b32_e32 v4, v16
-; GCN-NEXT:    v_mov_b32_e32 v5, v17
-; GCN-NEXT:    v_mov_b32_e32 v6, v18
-; GCN-NEXT:    v_mov_b32_e32 v7, v19
-; GCN-NEXT:    v_mov_b32_e32 v8, v20
-; GCN-NEXT:    v_mov_b32_e32 v9, v21
-; GCN-NEXT:    v_mov_b32_e32 v10, v22
-; GCN-NEXT:    v_mov_b32_e32 v11, v23
-; GCN-NEXT:    v_mov_b32_e32 v12, v24
-; GCN-NEXT:    v_mov_b32_e32 v13, v25
-; GCN-NEXT:    v_mov_b32_e32 v14, v26
-; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
+; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
+; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
+; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
+; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
+; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
+; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
+; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
+; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
+; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
+; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
+; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
@@ -699,24 +767,41 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, <
 ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; GCN-NEXT:    s_nop 11
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    v_mov_b32_e32 v4, v16
-; GCN-NEXT:    v_mov_b32_e32 v5, v17
-; GCN-NEXT:    v_mov_b32_e32 v6, v18
-; GCN-NEXT:    v_mov_b32_e32 v7, v19
-; GCN-NEXT:    v_mov_b32_e32 v8, v20
-; GCN-NEXT:    v_mov_b32_e32 v9, v21
-; GCN-NEXT:    v_mov_b32_e32 v10, v22
-; GCN-NEXT:    v_mov_b32_e32 v11, v23
-; GCN-NEXT:    v_mov_b32_e32 v12, v24
-; GCN-NEXT:    v_mov_b32_e32 v13, v25
-; GCN-NEXT:    v_mov_b32_e32 v14, v26
-; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
+; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
+; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
+; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
+; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
+; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
+; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
+; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
+; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
+; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
+; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
+; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <16 x float> %result
@@ -726,24 +811,41 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, <
 ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; GCN-NEXT:    s_nop 11
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    v_mov_b32_e32 v4, v16
-; GCN-NEXT:    v_mov_b32_e32 v5, v17
-; GCN-NEXT:    v_mov_b32_e32 v6, v18
-; GCN-NEXT:    v_mov_b32_e32 v7, v19
-; GCN-NEXT:    v_mov_b32_e32 v8, v20
-; GCN-NEXT:    v_mov_b32_e32 v9, v21
-; GCN-NEXT:    v_mov_b32_e32 v10, v22
-; GCN-NEXT:    v_mov_b32_e32 v11, v23
-; GCN-NEXT:    v_mov_b32_e32 v12, v24
-; GCN-NEXT:    v_mov_b32_e32 v13, v25
-; GCN-NEXT:    v_mov_b32_e32 v14, v26
-; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
+; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
+; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
+; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
+; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
+; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
+; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
+; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
+; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
+; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
+; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
+; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <16 x float> %result
@@ -1042,24 +1144,41 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1,
 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8:
@@ -1104,24 +1223,41 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32
 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
@@ -1166,24 +1302,41 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32
 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
@@ -2049,24 +2202,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32>
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
@@ -2111,24 +2281,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
@@ -2173,24 +2360,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
@@ -2400,24 +2604,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32>
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
@@ -2462,24 +2683,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
@@ -2524,24 +2762,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
@@ -2751,24 +3006,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32>
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
@@ -2813,24 +3085,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
@@ -2875,24 +3164,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
@@ -3102,24 +3408,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32>
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
@@ -3164,24 +3487,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
@@ -3226,24 +3566,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
index 8803f3ae4906f..6383bfb65d364 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
@@ -506,13 +506,13 @@ define void @test_rewrite_mfma_subreg_insert1(float %arg0, float %arg1, ptr addr
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[2:5]
+; CHECK-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[6:9], v0, v1, v[2:5]
 ; CHECK-NEXT:    s_nop 3
-; CHECK-NEXT:    v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; CHECK-NEXT:    v_pk_mov_b32 v[0:1], v[6:7], v[8:9] op_sel:[1,0]
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    v_accvgpr_write_b32 a0, v0
 ; CHECK-NEXT:    v_accvgpr_write_b32 a1, v1
-; CHECK-NEXT:    v_accvgpr_write_b32 a2, v3
+; CHECK-NEXT:    v_accvgpr_write_b32 a2, v9
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use a[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
@@ -711,15 +711,15 @@ define void @test_rewrite_mfma_copy_from_agpr_class_f64_4x4x4f64_chain(double %a
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_and_b32_e32 v12, 0x3ff, v31
 ; CHECK-NEXT:    v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], v[2:3], a[0:1]
-; CHECK-NEXT:    v_and_b32_e32 v2, 0x3ff, v31
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0
-; CHECK-NEXT:    v_lshl_add_u64 v[2:3], v[8:9], 0, v[2:3]
+; CHECK-NEXT:    s_nop 3
 ; CHECK-NEXT:    v_mfma_f64_4x4x4_4b_f64 a[0:1], v[4:5], v[6:7], a[0:1]
-; CHECK-NEXT:    s_nop 8
-; CHECK-NEXT:    global_store_dwordx2 v[2:3], a[0:1], off
+; CHECK-NEXT:    v_lshlrev_b32_e32 v4, 3, v12
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0
+; CHECK-NEXT:    v_lshl_add_u64 v[4:5], v[8:9], 0, v[4:5]
+; CHECK-NEXT:    s_nop 5
+; CHECK-NEXT:    global_store_dwordx2 v[4:5], a[0:1], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %src2 = call double asm sideeffect "; def $0", "=a"()
diff --git a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
index a81d9a458e23a..08f89b32edb20 100644
--- a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
@@ -101,8 +101,13 @@ define void @eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg,
 ; CHECK-NEXT:    v_accvgpr_read_b32 v2, a2
 ; CHECK-NEXT:    v_accvgpr_read_b32 v3, a3
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def v[6:9]
+; CHECK-NEXT:    ; def v[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:31]
@@ -112,37 +117,75 @@ define void @eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg,
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    global_store_dwordx4 v0, v[60:63], s[16:17] offset:112
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[56:59], s[16:17] offset:96
+; CHECK-NEXT:    v_accvgpr_read_b32 v0, a32
+; CHECK-NEXT:    v_mov_b32_e32 v60, 0
+; CHECK-NEXT:    v_accvgpr_read_b32 v24, a56
+; CHECK-NEXT:    v_accvgpr_read_b32 v25, a57
+; CHECK-NEXT:    v_accvgpr_read_b32 v26, a58
+; CHECK-NEXT:    v_accvgpr_read_b32 v27, a59
+; CHECK-NEXT:    global_store_dwordx4 v60, v[56:59], s[16:17] offset:96
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[52:55], s[16:17] offset:80
+; CHECK-NEXT:    global_store_dwordx4 v60, v[52:55], s[16:17] offset:80
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[48:51], s[16:17] offset:64
+; CHECK-NEXT:    global_store_dwordx4 v60, v[48:51], s[16:17] offset:64
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[44:47], s[16:17] offset:48
+; CHECK-NEXT:    global_store_dwordx4 v60, v[44:47], s[16:17] offset:48
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[40:43], s[16:17] offset:32
+; CHECK-NEXT:    global_store_dwordx4 v60, v[40:43], s[16:17] offset:32
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[36:39], s[16:17] offset:16
+; CHECK-NEXT:    global_store_dwordx4 v60, v[36:39], s[16:17] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[32:35], s[16:17]
+; CHECK-NEXT:    global_store_dwordx4 v60, v[32:35], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[56:59], s[16:17] offset:96
+; CHECK-NEXT:    v_accvgpr_read_b32 v1, a33
+; CHECK-NEXT:    v_accvgpr_read_b32 v2, a34
+; CHECK-NEXT:    v_accvgpr_read_b32 v3, a35
+; CHECK-NEXT:    v_accvgpr_read_b32 v4, a36
+; CHECK-NEXT:    v_accvgpr_read_b32 v5, a37
+; CHECK-NEXT:    v_accvgpr_read_b32 v6, a38
+; CHECK-NEXT:    v_accvgpr_read_b32 v7, a39
+; CHECK-NEXT:    v_accvgpr_read_b32 v8, a40
+; CHECK-NEXT:    v_accvgpr_read_b32 v9, a41
+; CHECK-NEXT:    v_accvgpr_read_b32 v10, a42
+; CHECK-NEXT:    v_accvgpr_read_b32 v11, a43
+; CHECK-NEXT:    v_accvgpr_read_b32 v12, a44
+; CHECK-NEXT:    v_accvgpr_read_b32 v13, a45
+; CHECK-NEXT:    v_accvgpr_read_b32 v14, a46
+; CHECK-NEXT:    v_accvgpr_read_b32 v15, a47
+; CHECK-NEXT:    v_accvgpr_read_b32 v16, a48
+; CHECK-NEXT:    v_accvgpr_read_b32 v17, a49
+; CHECK-NEXT:    v_accvgpr_read_b32 v18, a50
+; CHECK-NEXT:    v_accvgpr_read_b32 v19, a51
+; CHECK-NEXT:    v_accvgpr_read_b32 v20, a52
+; CHECK-NEXT:    v_accvgpr_read_b32 v21, a53
+; CHECK-NEXT:    v_accvgpr_read_b32 v22, a54
+; CHECK-NEXT:    v_accvgpr_read_b32 v23, a55
+; CHECK-NEXT:    v_accvgpr_read_b32 v28, a60
+; CHECK-NEXT:    v_accvgpr_read_b32 v29, a61
+; CHECK-NEXT:    v_accvgpr_read_b32 v30, a62
+; CHECK-NEXT:    v_accvgpr_read_b32 v31, a63
+; CHECK-NEXT:    global_store_dwordx4 v60, v[24:27], s[16:17] offset:96
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[60:63], s[16:17] offset:112
+; CHECK-NEXT:    global_store_dwordx4 v60, v[28:31], s[16:17] offset:112
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[48:51], s[16:17] offset:64
+; CHECK-NEXT:    global_store_dwordx4 v60, v[16:19], s[16:17] offset:64
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[52:55], s[16:17] offset:80
+; CHECK-NEXT:    global_store_dwordx4 v60, v[20:23], s[16:17] offset:80
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[40:43], s[16:17] offset:32
+; CHECK-NEXT:    global_store_dwordx4 v60, v[8:11], s[16:17] offset:32
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[44:47], s[16:17] offset:48
+; CHECK-NEXT:    global_store_dwordx4 v60, v[12:15], s[16:17] offset:48
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[32:35], s[16:17]
+; CHECK-NEXT:    global_store_dwordx4 v60, v[0:3], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[36:39], s[16:17] offset:16
+; CHECK-NEXT:    global_store_dwordx4 v60, v[4:7], s[16:17] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[6:9], s[16:17]
+; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v60, v[0:3], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_load_dword a63, off, s[0:3], s32 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -301,16 +344,26 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar
 ; CHECK-NEXT:    v_accvgpr_write_b32 a33, v1
 ; CHECK-NEXT:    v_accvgpr_write_b32 a32, v0
 ; CHECK-NEXT:    v_accvgpr_read_b32 v7, a3
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_accvgpr_read_b32 v6, a2
 ; CHECK-NEXT:    v_accvgpr_read_b32 v5, a1
 ; CHECK-NEXT:    v_accvgpr_read_b32 v4, a0
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def v[8:11]
+; CHECK-NEXT:    ; def v[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def v[12:15]
+; CHECK-NEXT:    ; def v[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:31]
 ; CHECK-NEXT:    ;;#ASMEND
@@ -319,39 +372,82 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    global_store_dwordx4 v0, v[60:63], s[16:17] offset:112
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[56:59], s[16:17] offset:96
+; CHECK-NEXT:    v_accvgpr_read_b32 v0, a32
+; CHECK-NEXT:    v_mov_b32_e32 v60, 0
+; CHECK-NEXT:    v_accvgpr_read_b32 v24, a56
+; CHECK-NEXT:    v_accvgpr_read_b32 v25, a57
+; CHECK-NEXT:    v_accvgpr_read_b32 v26, a58
+; CHECK-NEXT:    v_accvgpr_read_b32 v27, a59
+; CHECK-NEXT:    global_store_dwordx4 v60, v[56:59], s[16:17] offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v60, v[52:55], s[16:17] offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v60, v[48:51], s[16:17] offset:64
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[52:55], s[16:17] offset:80
+; CHECK-NEXT:    global_store_dwordx4 v60, v[44:47], s[16:17] offset:48
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[48:51], s[16:17] offset:64
+; CHECK-NEXT:    global_store_dwordx4 v60, v[40:43], s[16:17] offset:32
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[44:47], s[16:17] offset:48
+; CHECK-NEXT:    global_store_dwordx4 v60, v[36:39], s[16:17] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[40:43], s[16:17] offset:32
+; CHECK-NEXT:    global_store_dwordx4 v60, v[32:35], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[36:39], s[16:17] offset:16
+; CHECK-NEXT:    v_accvgpr_read_b32 v1, a33
+; CHECK-NEXT:    v_accvgpr_read_b32 v2, a34
+; CHECK-NEXT:    v_accvgpr_read_b32 v3, a35
+; CHECK-NEXT:    v_accvgpr_read_b32 v4, a36
+; CHECK-NEXT:    v_accvgpr_read_b32 v5, a37
+; CHECK-NEXT:    v_accvgpr_read_b32 v6, a38
+; CHECK-NEXT:    v_accvgpr_read_b32 v7, a39
+; CHECK-NEXT:    v_accvgpr_read_b32 v8, a40
+; CHECK-NEXT:    v_accvgpr_read_b32 v9, a41
+; CHECK-NEXT:    v_accvgpr_read_b32 v10, a42
+; CHECK-NEXT:    v_accvgpr_read_b32 v11, a43
+; CHECK-NEXT:    v_accvgpr_read_b32 v12, a44
+; CHECK-NEXT:    v_accvgpr_read_b32 v13, a45
+; CHECK-NEXT:    v_accvgpr_read_b32 v14, a46
+; CHECK-NEXT:    v_accvgpr_read_b32 v15, a47
+; CHECK-NEXT:    v_accvgpr_read_b32 v16, a48
+; CHECK-NEXT:    v_accvgpr_read_b32 v17, a49
+; CHECK-NEXT:    v_accvgpr_read_b32 v18, a50
+; CHECK-NEXT:    v_accvgpr_read_b32 v19, a51
+; CHECK-NEXT:    v_accvgpr_read_b32 v20, a52
+; CHECK-NEXT:    v_accvgpr_read_b32 v21, a53
+; CHECK-NEXT:    v_accvgpr_read_b32 v22, a54
+; CHECK-NEXT:    v_accvgpr_read_b32 v23, a55
+; CHECK-NEXT:    v_accvgpr_read_b32 v28, a60
+; CHECK-NEXT:    v_accvgpr_read_b32 v29, a61
+; CHECK-NEXT:    v_accvgpr_read_b32 v30, a62
+; CHECK-NEXT:    v_accvgpr_read_b32 v31, a63
+; CHECK-NEXT:    global_store_dwordx4 v60, v[24:27], s[16:17] offset:96
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[32:35], s[16:17]
+; CHECK-NEXT:    global_store_dwordx4 v60, v[28:31], s[16:17] offset:112
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[56:59], s[16:17] offset:96
+; CHECK-NEXT:    global_store_dwordx4 v60, v[16:19], s[16:17] offset:64
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[60:63], s[16:17] offset:112
+; CHECK-NEXT:    global_store_dwordx4 v60, v[20:23], s[16:17] offset:80
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[48:51], s[16:17] offset:64
+; CHECK-NEXT:    global_store_dwordx4 v60, v[8:11], s[16:17] offset:32
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[52:55], s[16:17] offset:80
+; CHECK-NEXT:    global_store_dwordx4 v60, v[12:15], s[16:17] offset:48
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[40:43], s[16:17] offset:32
+; CHECK-NEXT:    global_store_dwordx4 v60, v[0:3], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[44:47], s[16:17] offset:48
+; CHECK-NEXT:    global_store_dwordx4 v60, v[4:7], s[16:17] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[32:35], s[16:17]
+; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[36:39], s[16:17] offset:16
+; CHECK-NEXT:    global_store_dwordx4 v60, v[0:3], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[8:11], s[16:17]
+; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[12:15], s[16:17]
+; CHECK-NEXT:    global_store_dwordx4 v60, v[0:3], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_load_dword a63, off, s[0:3], s32 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload

>From 71f05ffb2bccfcb44c2b207118e5ad5343724473 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Sun, 21 Sep 2025 02:25:04 -0400
Subject: [PATCH 09/20] Updated mir test

---
 ...amdgcn.mfma.hint.hazard.barrier.gfx942.mir | 1443 +++--------------
 1 file changed, 195 insertions(+), 1248 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir
index 271b36fad2bb4..97305f2c8a8f0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -stop-after=virtregrewriter,2 -amdgpu-avoid-hazard-hint-for-mfma=false %s -o - | FileCheck -check-prefix=GFX942_WITHOUT %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -stop-after=virtregrewriter,2 -amdgpu-avoid-hazard-hint-for-mfma=true %s -o - | FileCheck -check-prefix=GFX942_WITH %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter %s -o - | FileCheck -check-prefix=CHECK %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter -amdgpu-avoid-hazard-hint-for-mfma=false %s -o - | FileCheck -check-prefix=CHECK-NO-ANTIHINT %s
 
 --- |
   target triple = "amdgcn-amd-amdhsa"
@@ -17,855 +17,153 @@
 name:            test_software_pipelining
 body:             |
   bb.0:
-    ; GFX942_WITHOUT-LABEL: name: test_software_pipelining
-    ; GFX942_WITHOUT: renamable $vgpr115 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr109 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr110 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr76 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr108 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr52 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr100 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr111 = V_ADD_U32_e32 4096, $vgpr100, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr101 = V_ADD_U32_e32 $vgpr76, killed $vgpr52, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr112 = V_ADD_U32_e32 4096, $vgpr101, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr112, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr111, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr52_vgpr53, $vgpr80_vgpr81, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = DS_READ_B128_gfx9 renamable $vgpr108, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr54_vgpr55, $vgpr82_vgpr83, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr52_vgpr53, $vgpr92_vgpr93, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr54_vgpr55, $vgpr94_vgpr95, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr52_vgpr53, $vgpr80_vgpr81, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = DS_READ_B128_gfx9 renamable $vgpr108, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr54_vgpr55, $vgpr82_vgpr83, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr52_vgpr53, $vgpr92_vgpr93, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr54_vgpr55, $vgpr94_vgpr95, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr68_vgpr69, $vgpr80_vgpr81, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr0 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr76, killed $vgpr0, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr100, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr70_vgpr71, $vgpr82_vgpr83, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr68_vgpr69, $vgpr92_vgpr93, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr70_vgpr71, $vgpr94_vgpr95, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = DS_READ_B128_gfx9 renamable $vgpr108, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr72_vgpr73, $vgpr80_vgpr81, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr101, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr74_vgpr75, $vgpr82_vgpr83, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr72_vgpr73, $vgpr92_vgpr93, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr74_vgpr75, $vgpr94_vgpr95, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr108, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr80_vgpr81, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr72_vgpr73_vgpr74_vgpr75, 16384, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr66_vgpr67, $vgpr82_vgpr83, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr92_vgpr93, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr66_vgpr67, $vgpr94_vgpr95, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = DS_READ_B128_gfx9 renamable $vgpr108, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr80_vgpr81, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr56_vgpr57_vgpr58_vgpr59, 20480, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr82_vgpr83, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr92_vgpr93, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr94_vgpr95, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128_gfx9 renamable $vgpr108, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr48_vgpr49, $vgpr80_vgpr81, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr100, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr50_vgpr51, $vgpr82_vgpr83, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr48_vgpr49, $vgpr92_vgpr93, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr50_vgpr51, $vgpr94_vgpr95, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128_gfx9 renamable $vgpr110, 0, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr80_vgpr81, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr101, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr26_vgpr27, killed $vgpr82_vgpr83, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr92_vgpr93, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr26_vgpr27, killed $vgpr94_vgpr95, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr120 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = DS_READ_B128_gfx9 renamable $vgpr120, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr104_vgpr105_vgpr106_vgpr107 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr104_vgpr105, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr116_vgpr117_vgpr118_vgpr119 = DS_READ_B128_gfx9 renamable $vgpr120, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr22_vgpr23, $vgpr106_vgpr107, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr100_vgpr101, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr22_vgpr23, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr104_vgpr105, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128_gfx9 renamable $vgpr120, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr18_vgpr19, $vgpr106_vgpr107, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr100_vgpr101, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr18_vgpr19, $vgpr102_vgpr103, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr116_vgpr117, $vgpr104_vgpr105, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr16 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr114 = V_ADD_U32_e32 $vgpr115, killed $vgpr16, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr114, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr118_vgpr119, $vgpr106_vgpr107, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr116_vgpr117, $vgpr100_vgpr101, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr118_vgpr119, $vgpr102_vgpr103, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = DS_READ_B128_gfx9 renamable $vgpr120, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr104_vgpr105, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr20 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr113 = V_ADD_U32_e32 $vgpr115, killed $vgpr20, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr113, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr26_vgpr27, $vgpr106_vgpr107, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr100_vgpr101, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr26_vgpr27, $vgpr102_vgpr103, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr116_vgpr117_vgpr118_vgpr119 = DS_READ_B128_gfx9 renamable $vgpr120, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr76_vgpr77, $vgpr104_vgpr105, killed $vgpr96_vgpr97_vgpr98_vgpr99, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr84_vgpr85_vgpr86_vgpr87, 24576, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr78_vgpr79, $vgpr106_vgpr107, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr76_vgpr77, $vgpr100_vgpr101, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr78_vgpr79, $vgpr102_vgpr103, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr120, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr116_vgpr117, $vgpr104_vgpr105, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr84_vgpr85_vgpr86_vgpr87, 28672, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr118_vgpr119, $vgpr106_vgpr107, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr116_vgpr117, $vgpr100_vgpr101, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr118_vgpr119, $vgpr102_vgpr103, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr118_vgpr119_vgpr120_vgpr121 = DS_READ_B128_gfx9 killed renamable $vgpr120, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr104_vgpr105, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr56 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr116 = V_ADD_U32_e32 $vgpr115, killed $vgpr56, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr116, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr106_vgpr107, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr100_vgpr101, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr72 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr115 = V_ADD_U32_e32 killed $vgpr115, killed $vgpr72, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr102_vgpr103, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr115, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: S_WAITCNT 49279
-    ; GFX942_WITHOUT-NEXT: S_BARRIER
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr108, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr118_vgpr119, $vgpr104_vgpr105, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr120_vgpr121, killed $vgpr106_vgpr107, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr118_vgpr119, $vgpr100_vgpr101, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr120_vgpr121, killed $vgpr102_vgpr103, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr108, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_BARRIER 0
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr104_vgpr105_vgpr106_vgpr107 = DS_READ_B128_gfx9 renamable $vgpr108, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr108, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr104_vgpr105, $vgpr0_vgpr1, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr96_vgpr97_vgpr98_vgpr99 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr111, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr2_vgpr3, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr104_vgpr105, $vgpr4_vgpr5, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr106_vgpr107, $vgpr6_vgpr7, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr108, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr104_vgpr105_vgpr106_vgpr107 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr112, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr108, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = DS_READ_B128_gfx9 renamable $vgpr108, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr20_vgpr21_vgpr22_vgpr23, 4096, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = DS_READ_B128_gfx9 renamable $vgpr108, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr68_vgpr69, $vgpr0_vgpr1, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr96_vgpr97_vgpr98_vgpr99 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr114, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr70_vgpr71, $vgpr2_vgpr3, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr68_vgpr69, $vgpr4_vgpr5, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr70_vgpr71, $vgpr6_vgpr7, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = DS_READ_B128_gfx9 killed renamable $vgpr110, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr0_vgpr1, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr96_vgpr97_vgpr98_vgpr99 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr113, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr86_vgpr87, killed $vgpr2_vgpr3, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr4_vgpr5, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr86_vgpr87, killed $vgpr6_vgpr7, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr92 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = DS_READ_B128_gfx9 renamable $vgpr92, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr72_vgpr73, $vgpr8_vgpr9, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = DS_READ_B128_gfx9 renamable $vgpr92, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr74_vgpr75, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr72_vgpr73, $vgpr12_vgpr13, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr74_vgpr75, $vgpr14_vgpr15, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr80_vgpr81, $vgpr8_vgpr9, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = DS_READ_B128_gfx9 renamable $vgpr92, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr82_vgpr83, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr80_vgpr81, $vgpr12_vgpr13, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr82_vgpr83, $vgpr14_vgpr15, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr8_vgpr9, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr44_vgpr45_vgpr46_vgpr47 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr116, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr86_vgpr87, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr12_vgpr13, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr86_vgpr87, $vgpr14_vgpr15, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128_gfx9 renamable $vgpr92, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr40_vgpr41, $vgpr8_vgpr9, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr115, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr42_vgpr43, $vgpr10_vgpr11, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr40_vgpr41, $vgpr12_vgpr13, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr42_vgpr43, $vgpr14_vgpr15, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128_gfx9 renamable $vgpr92, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr8_vgpr9, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr56_vgpr57_vgpr58_vgpr59, 8192, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr30_vgpr31, $vgpr10_vgpr11, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr12_vgpr13, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr30_vgpr31, $vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = DS_READ_B128_gfx9 renamable $vgpr92, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr8_vgpr9, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 killed renamable $vgpr109, killed renamable $vgpr60_vgpr61_vgpr62_vgpr63, 12288, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr26_vgpr27, $vgpr10_vgpr11, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr12_vgpr13, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr26_vgpr27, $vgpr14_vgpr15, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128_gfx9 killed renamable $vgpr92, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr8_vgpr9, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr18_vgpr19, $vgpr10_vgpr11, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr12_vgpr13, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr18_vgpr19, $vgpr14_vgpr15, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: S_WAITCNT 49279
-    ; GFX942_WITHOUT-NEXT: S_BARRIER
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = DS_READ_B128_gfx9 renamable $vgpr108, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr8_vgpr9, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr22_vgpr23, killed $vgpr10_vgpr11, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr12_vgpr13, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr22_vgpr23, killed $vgpr14_vgpr15, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = DS_READ_B128_gfx9 killed renamable $vgpr108, 0, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_BARRIER 0
-    ; GFX942_WITHOUT-NEXT: S_ENDPGM 0
+    ; CHECK-LABEL: name: test_software_pipelining
+    ; CHECK: dead renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr72 = IMPLICIT_DEF
+    ; CHECK-NEXT: dead renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr68 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr73 = IMPLICIT_DEF
+    ; CHECK-NEXT: dead renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr52 = IMPLICIT_DEF
+    ; CHECK-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; CHECK-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; CHECK-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr74 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr56 = V_ADD_U32_e32 4096, $vgpr74, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr75 = V_ADD_U32_e32 $vgpr68, killed $vgpr52, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr52 = V_ADD_U32_e32 4096, $vgpr75, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr52, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; CHECK-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr56, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; CHECK-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr56_vgpr57, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = DS_READ_B128_gfx9 renamable $vgpr73, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT: dead renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr58_vgpr59, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr52_vgpr53, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: dead renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr54_vgpr55, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr44_vgpr45, $vgpr56_vgpr57, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr73, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT: dead renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr46_vgpr47, $vgpr58_vgpr59, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr44_vgpr45, $vgpr52_vgpr53, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr46_vgpr47, $vgpr54_vgpr55, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr56_vgpr57, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr69 = IMPLICIT_DEF
+    ; CHECK-NEXT: dead renamable $vgpr68 = V_ADD_U32_e32 killed $vgpr68, killed $vgpr69, implicit $exec
+    ; CHECK-NEXT: dead renamable $vgpr68_vgpr69_vgpr70_vgpr71 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr74, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; CHECK-NEXT: dead renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr66_vgpr67, $vgpr58_vgpr59, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr52_vgpr53, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr66_vgpr67, $vgpr54_vgpr55, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = DS_READ_B128_gfx9 renamable $vgpr73, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr56_vgpr57, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: dead renamable $vgpr68_vgpr69_vgpr70_vgpr71 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr75, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; CHECK-NEXT: dead renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr58_vgpr59, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr52_vgpr53, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: dead renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr54_vgpr55, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr73, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr56_vgpr57, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = IMPLICIT_DEF
+    ; CHECK-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr72, killed renamable $vgpr68_vgpr69_vgpr70_vgpr71, 16384, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; CHECK-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr66_vgpr67, $vgpr58_vgpr59, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr52_vgpr53, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: dead renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr66_vgpr67, $vgpr54_vgpr55, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = DS_READ_B128_gfx9 renamable $vgpr73, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr56_vgpr57, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = IMPLICIT_DEF
+    ; CHECK-NEXT: DS_WRITE_B128_gfx9 killed renamable $vgpr72, killed renamable $vgpr48_vgpr49_vgpr50_vgpr51, 20480, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; CHECK-NEXT: dead renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr58_vgpr59, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr52_vgpr53, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: dead renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, killed $vgpr54_vgpr55, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: dead renamable $vgpr44_vgpr45_vgpr46_vgpr47 = DS_READ_B128_gfx9 killed renamable $vgpr73, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr64_vgpr65, killed $vgpr56_vgpr57, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: dead renamable $vgpr40_vgpr41_vgpr42_vgpr43 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr74, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
     ;
-    ; GFX942_WITH-LABEL: name: test_software_pipelining
-    ; GFX942_WITH: renamable $vgpr96 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr121 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr122 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr52 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr120 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr0 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr97 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr123 = V_ADD_U32_e32 4096, $vgpr97, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr102 = V_ADD_U32_e32 $vgpr52, killed $vgpr0, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr124 = V_ADD_U32_e32 4096, $vgpr102, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr124, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr123, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr0_vgpr1, $vgpr80_vgpr81, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = DS_READ_B128_gfx9 renamable $vgpr120, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr2_vgpr3, $vgpr82_vgpr83, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr0_vgpr1, $vgpr92_vgpr93, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr2_vgpr3, $vgpr94_vgpr95, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr0_vgpr1, $vgpr80_vgpr81, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = DS_READ_B128_gfx9 renamable $vgpr120, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr2_vgpr3, $vgpr82_vgpr83, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr0_vgpr1, $vgpr92_vgpr93, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr2_vgpr3, $vgpr94_vgpr95, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr4_vgpr5, $vgpr80_vgpr81, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr0 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: dead renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr52, killed $vgpr0, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr97, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr6_vgpr7, $vgpr82_vgpr83, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr4_vgpr5, $vgpr92_vgpr93, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr6_vgpr7, $vgpr94_vgpr95, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr98_vgpr99_vgpr100_vgpr101 = DS_READ_B128_gfx9 renamable $vgpr120, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr80_vgpr81, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr102, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: renamable $vgpr104_vgpr105_vgpr106_vgpr107 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr90_vgpr91, $vgpr82_vgpr83, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr92_vgpr93, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr90_vgpr91, $vgpr94_vgpr95, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = DS_READ_B128_gfx9 renamable $vgpr120, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr80_vgpr81, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr108_vgpr109_vgpr110_vgpr111 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr108_vgpr109_vgpr110_vgpr111, 16384, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr116_vgpr117_vgpr118_vgpr119 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr82_vgpr83, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr92_vgpr93, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr100_vgpr101, $vgpr94_vgpr95, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr98_vgpr99_vgpr100_vgpr101 = DS_READ_B128_gfx9 renamable $vgpr120, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr80_vgpr81, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr108_vgpr109_vgpr110_vgpr111 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr108_vgpr109_vgpr110_vgpr111, 20480, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr108_vgpr109_vgpr110_vgpr111 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr90_vgpr91, $vgpr82_vgpr83, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr92_vgpr93, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr90_vgpr91, $vgpr94_vgpr95, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr112_vgpr113_vgpr114_vgpr115 = DS_READ_B128_gfx9 renamable $vgpr120, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr80_vgpr81, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr97, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr82_vgpr83, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr92_vgpr93, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr100_vgpr101, $vgpr94_vgpr95, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = DS_READ_B128_gfx9 renamable $vgpr122, 0, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr112_vgpr113, $vgpr80_vgpr81, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr102, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr114_vgpr115, killed $vgpr82_vgpr83, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr112_vgpr113, $vgpr92_vgpr93, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr114_vgpr115, killed $vgpr94_vgpr95, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr97 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128_gfx9 renamable $vgpr97, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr112_vgpr113_vgpr114_vgpr115 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr112_vgpr113, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128_gfx9 renamable $vgpr97, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr18_vgpr19, $vgpr114_vgpr115, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr100_vgpr101, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr18_vgpr19, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr112_vgpr113, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = DS_READ_B128_gfx9 renamable $vgpr97, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr22_vgpr23, $vgpr114_vgpr115, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr100_vgpr101, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr22_vgpr23, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr112_vgpr113, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr16 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr126 = V_ADD_U32_e32 $vgpr96, killed $vgpr16, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr126, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr30_vgpr31, $vgpr114_vgpr115, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr100_vgpr101, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr30_vgpr31, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr97, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr112_vgpr113, killed $vgpr104_vgpr105_vgpr106_vgpr107, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr20 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr125 = V_ADD_U32_e32 $vgpr96, killed $vgpr20, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr125, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr58_vgpr59, $vgpr114_vgpr115, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr100_vgpr101, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr58_vgpr59, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = DS_READ_B128_gfx9 renamable $vgpr97, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr112_vgpr113, killed $vgpr116_vgpr117_vgpr118_vgpr119, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr84_vgpr85_vgpr86_vgpr87, 24576, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr114_vgpr115, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr100_vgpr101, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr102_vgpr103, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr97, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr112_vgpr113, killed $vgpr108_vgpr109_vgpr110_vgpr111, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr104_vgpr105_vgpr106_vgpr107 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr104_vgpr105_vgpr106_vgpr107, 28672, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr58_vgpr59, $vgpr114_vgpr115, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr100_vgpr101, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr58_vgpr59, $vgpr102_vgpr103, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr106_vgpr107_vgpr108_vgpr109 = DS_READ_B128_gfx9 killed renamable $vgpr97, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr112_vgpr113, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr56 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr104 = V_ADD_U32_e32 $vgpr96, killed $vgpr56, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr104, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr114_vgpr115, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr100_vgpr101, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr60 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr127 = V_ADD_U32_e32 killed $vgpr96, killed $vgpr60, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr102_vgpr103, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr127, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: S_WAITCNT 49279
-    ; GFX942_WITH-NEXT: S_BARRIER
-    ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr120, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr112_vgpr113, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr108_vgpr109, killed $vgpr114_vgpr115, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr100_vgpr101, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr108_vgpr109, killed $vgpr102_vgpr103, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr120, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_BARRIER 0
-    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr112_vgpr113_vgpr114_vgpr115 = DS_READ_B128_gfx9 renamable $vgpr120, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr120, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr112_vgpr113, $vgpr0_vgpr1, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr96_vgpr97_vgpr98_vgpr99 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr123, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr114_vgpr115, $vgpr2_vgpr3, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr112_vgpr113, $vgpr4_vgpr5, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr114_vgpr115, $vgpr6_vgpr7, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr120, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr106_vgpr107_vgpr108_vgpr109 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr124, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr120, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr120, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr20_vgpr21_vgpr22_vgpr23, 4096, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr120, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr106_vgpr107_vgpr108_vgpr109 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr126, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 killed renamable $vgpr122, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr106_vgpr107_vgpr108_vgpr109 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr125, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, killed $vgpr2_vgpr3, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, killed $vgpr6_vgpr7, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr105 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr105, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr8_vgpr9, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr106_vgpr107_vgpr108_vgpr109 = DS_READ_B128_gfx9 renamable $vgpr105, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr12_vgpr13, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr14_vgpr15, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr8_vgpr9, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr105, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr12_vgpr13, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr14_vgpr15, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr8_vgpr9, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr84_vgpr85_vgpr86_vgpr87 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr104, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr108_vgpr109, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr12_vgpr13, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr108_vgpr109, $vgpr14_vgpr15, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = DS_READ_B128_gfx9 renamable $vgpr105, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr8_vgpr9, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr88_vgpr89_vgpr90_vgpr91 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr127, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr10_vgpr11, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr12_vgpr13, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr14_vgpr15, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = DS_READ_B128_gfx9 renamable $vgpr105, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr8_vgpr9, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr56_vgpr57_vgpr58_vgpr59, 8192, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr86_vgpr87, $vgpr10_vgpr11, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr12_vgpr13, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr86_vgpr87, $vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = DS_READ_B128_gfx9 renamable $vgpr105, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr8_vgpr9, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 killed renamable $vgpr121, killed renamable $vgpr60_vgpr61_vgpr62_vgpr63, 12288, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr90_vgpr91, $vgpr10_vgpr11, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr12_vgpr13, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr90_vgpr91, $vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = DS_READ_B128_gfx9 killed renamable $vgpr105, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr8_vgpr9, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr58_vgpr59, $vgpr10_vgpr11, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr12_vgpr13, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr58_vgpr59, $vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: S_WAITCNT 49279
-    ; GFX942_WITH-NEXT: S_BARRIER
-    ; GFX942_WITH-NEXT: dead renamable $vgpr44_vgpr45_vgpr46_vgpr47 = DS_READ_B128_gfx9 renamable $vgpr120, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr40_vgpr41, $vgpr8_vgpr9, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr42_vgpr43, killed $vgpr10_vgpr11, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr40_vgpr41, $vgpr12_vgpr13, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr42_vgpr43, killed $vgpr14_vgpr15, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr8_vgpr9_vgpr10_vgpr11 = DS_READ_B128_gfx9 killed renamable $vgpr120, 0, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_BARRIER 0
-    ; GFX942_WITH-NEXT: S_ENDPGM 0
+    ; CHECK-NO-ANTIHINT-LABEL: name: test_software_pipelining
+    ; CHECK-NO-ANTIHINT: dead renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr68 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr69 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr70 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr52 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr71 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr56 = V_ADD_U32_e32 4096, $vgpr71, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr72 = V_ADD_U32_e32 $vgpr69, killed $vgpr52, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr52 = V_ADD_U32_e32 4096, $vgpr72, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr52, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr56, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr56_vgpr57, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = DS_READ_B128_gfx9 renamable $vgpr70, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr58_vgpr59, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr52_vgpr53, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr54_vgpr55, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr44_vgpr45, $vgpr56_vgpr57, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = DS_READ_B128_gfx9 renamable $vgpr70, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr46_vgpr47, $vgpr58_vgpr59, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr44_vgpr45, $vgpr52_vgpr53, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr46_vgpr47, $vgpr54_vgpr55, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr56_vgpr57, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr36 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr36 = V_ADD_U32_e32 killed $vgpr69, killed $vgpr36, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr71, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr66_vgpr67, $vgpr58_vgpr59, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr52_vgpr53, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr66_vgpr67, $vgpr54_vgpr55, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128_gfx9 renamable $vgpr70, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr48_vgpr49, $vgpr56_vgpr57, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr32_vgpr33_vgpr34_vgpr35 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr72, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr50_vgpr51, $vgpr58_vgpr59, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr48_vgpr49, $vgpr52_vgpr53, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr50_vgpr51, $vgpr54_vgpr55, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128_gfx9 renamable $vgpr70, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr56_vgpr57, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr68, killed renamable $vgpr24_vgpr25_vgpr26_vgpr27, 16384, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr30_vgpr31, $vgpr58_vgpr59, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr52_vgpr53, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr30_vgpr31, $vgpr54_vgpr55, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = DS_READ_B128_gfx9 renamable $vgpr70, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr56_vgpr57, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr14_vgpr15_vgpr16_vgpr17 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: DS_WRITE_B128_gfx9 killed renamable $vgpr68, killed renamable $vgpr14_vgpr15_vgpr16_vgpr17, 20480, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr22_vgpr23, $vgpr58_vgpr59, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr52_vgpr53, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr22_vgpr23, killed $vgpr54_vgpr55, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr4_vgpr5_vgpr6_vgpr7 = DS_READ_B128_gfx9 killed renamable $vgpr70, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr12_vgpr13, killed $vgpr56_vgpr57, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr71, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
     %0:vgpr_32 = IMPLICIT_DEF
     %1:vgpr_32 = IMPLICIT_DEF
     %2:vgpr_32 = IMPLICIT_DEF
@@ -890,403 +188,52 @@ body:             |
     %21:vreg_128_align2 = IMPLICIT_DEF
     %22:vreg_128_align2 = IMPLICIT_DEF
     %23:vreg_128_align2 = IMPLICIT_DEF
-    %25:vgpr_32 = IMPLICIT_DEF
-    %24:vgpr_32 = V_ADD_U32_e32 4096, %25:vgpr_32, implicit $exec
-    %27:vgpr_32 = V_ADD_U32_e32 %3:vgpr_32, %7:vgpr_32, implicit $exec
-    %26:vgpr_32 = V_ADD_U32_e32 4096, %27:vgpr_32, implicit $exec
-    %28:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %26:vgpr_32, %6:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %29:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %24:vgpr_32, %6:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %31:vreg_128_align2 = IMPLICIT_DEF
-    %30:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %23:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %32:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
-    %33:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %30:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %34:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %22:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %35:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %34:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %37:vreg_128_align2 = IMPLICIT_DEF
-    %36:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %21:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %38:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
-    %39:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %36:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %40:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %20:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %41:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %40:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %42:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %19:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %24:vgpr_32 = IMPLICIT_DEF
+    %25:vgpr_32 = V_ADD_U32_e32 4096, %24, implicit $exec
+    %26:vgpr_32 = V_ADD_U32_e32 %3, %7, implicit $exec
+    %27:vgpr_32 = V_ADD_U32_e32 4096, %26, implicit $exec
+    %28:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %27, %6, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %29:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %25, %6, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %30:vreg_128_align2 = IMPLICIT_DEF
+    %31:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %30.sub0_sub1, %29.sub0_sub1, %23, 0, 0, 0, implicit $mode, implicit $exec
+    %32:vreg_128_align2 = DS_READ_B128_gfx9 %4, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    %33:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %30.sub2_sub3, %29.sub2_sub3, %31, 0, 0, 0, implicit $mode, implicit $exec
+    %34:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %30.sub0_sub1, %28.sub0_sub1, %22, 0, 0, 0, implicit $mode, implicit $exec
+    %35:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %30.sub2_sub3, %28.sub2_sub3, %34, 0, 0, 0, implicit $mode, implicit $exec
+    %36:vreg_128_align2 = IMPLICIT_DEF
+    %37:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %36.sub0_sub1, %29.sub0_sub1, %21, 0, 0, 0, implicit $mode, implicit $exec
+    %38:vreg_128_align2 = DS_READ_B128_gfx9 %4, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    %39:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %36.sub2_sub3, %29.sub2_sub3, %37, 0, 0, 0, implicit $mode, implicit $exec
+    %40:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %36.sub0_sub1, %28.sub0_sub1, %20, 0, 0, 0, implicit $mode, implicit $exec
+    %41:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %36.sub2_sub3, %28.sub2_sub3, %40, 0, 0, 0, implicit $mode, implicit $exec
+    %42:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub0_sub1, %29.sub0_sub1, %19, 0, 0, 0, implicit $mode, implicit $exec
     %43:vgpr_32 = IMPLICIT_DEF
-    %925:vgpr_32 = V_ADD_U32_e32 %3:vgpr_32, %43:vgpr_32, implicit $exec
-    %44:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %25:vgpr_32, %6:sgpr_128, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %45:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %42:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %46:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %18:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %47:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %46:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %48:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
-    %49:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %17:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %50:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %27:vgpr_32, %6:sgpr_128, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %51:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %49:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %52:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %16:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %53:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %52:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %54:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
-    %55:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %48.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %15:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %56:vreg_128_align2 = IMPLICIT_DEF
-    DS_WRITE_B128_gfx9 %1:vgpr_32, %56:vreg_128_align2, 16384, 0, implicit $exec :: (store (s128), addrspace 3)
-    %57:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %48.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %55:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %58:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %48.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %14:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %59:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %48.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %58:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %60:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
-    %61:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %54.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %13:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %62:vreg_128_align2 = IMPLICIT_DEF
-    DS_WRITE_B128_gfx9 %1:vgpr_32, %62:vreg_128_align2, 20480, 0, implicit $exec :: (store (s128), addrspace 3)
-    %63:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %54.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %61:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %64:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %54.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %12:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %65:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %54.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %64:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %66:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
-    %67:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %60.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %11:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %68:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %25:vgpr_32, %6:sgpr_128, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %69:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %60.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %67:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %70:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %60.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %10:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %71:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %60.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %70:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %72:vreg_128_align2 = DS_READ_B128_gfx9 %2:vgpr_32, 0, 0, implicit $exec :: (load (s128), addrspace 3)
-    %73:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %66.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %9:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %74:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %27:vgpr_32, %6:sgpr_128, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %75:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %66.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %73:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %76:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %66.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %8:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %77:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %66.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %76:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %79:vgpr_32 = IMPLICIT_DEF
-    %78:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
-    %81:vreg_128_align2 = IMPLICIT_DEF
-    %80:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %72.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %33:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %82:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
-    %83:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %72.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %80:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %85:vreg_128_align2 = IMPLICIT_DEF
-    %84:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %72.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %35:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %86:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %72.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %84:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %87:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %78.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %39:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %88:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
-    %89:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %78.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %87:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %90:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %78.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %41:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %91:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %78.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %90:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %92:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %82.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %45:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %94:vgpr_32 = IMPLICIT_DEF
-    %93:vgpr_32 = V_ADD_U32_e32 %0:vgpr_32, %94:vgpr_32, implicit $exec
-    %95:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %93:vgpr_32, %5:sgpr_128, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %96:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %82.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %92:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %97:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %82.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %47:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %98:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %82.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %97:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %99:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
-    %100:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %88.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %51:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %102:vgpr_32 = IMPLICIT_DEF
-    %101:vgpr_32 = V_ADD_U32_e32 %0:vgpr_32, %102:vgpr_32, implicit $exec
-    %103:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %101:vgpr_32, %5:sgpr_128, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %104:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %88.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %100:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %105:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %88.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %53:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %106:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %88.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %105:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %107:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
-    %108:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %99.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %57:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %109:vreg_128_align2 = IMPLICIT_DEF
-    DS_WRITE_B128_gfx9 %1:vgpr_32, %109:vreg_128_align2, 24576, 0, implicit $exec :: (store (s128), addrspace 3)
-    %110:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %99.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %108:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %111:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %99.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %59:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %112:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %99.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %111:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %113:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
-    %114:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %107.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %63:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %115:vreg_128_align2 = IMPLICIT_DEF
-    DS_WRITE_B128_gfx9 %1:vgpr_32, %115:vreg_128_align2, 28672, 0, implicit $exec :: (store (s128), addrspace 3)
-    %116:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %107.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %114:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %117:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %107.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %65:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %118:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %107.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %117:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %119:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
-    %120:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %113.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %69:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %122:vgpr_32 = IMPLICIT_DEF
-    %121:vgpr_32 = V_ADD_U32_e32 %0:vgpr_32, %122:vgpr_32, implicit $exec
-    %123:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %121:vgpr_32, %5:sgpr_128, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %124:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %113.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %120:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %125:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %113.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %71:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %127:vgpr_32 = IMPLICIT_DEF
-    %126:vgpr_32 = V_ADD_U32_e32 %0:vgpr_32, %127:vgpr_32, implicit $exec
-    %128:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %113.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %125:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %129:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %126:vgpr_32, %5:sgpr_128, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    S_WAITCNT 49279
-    S_BARRIER
-    %130:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
-    %131:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %119.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %75:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %132:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %119.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %131:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %133:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %119.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %77:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %134:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %119.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %133:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %135:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 512, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 512, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 512, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 512, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_BARRIER 0
-    %136:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %135.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %83:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %137:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
-    %138:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %135.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %136:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %139:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %135.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %86:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %140:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %135.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %139:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %141:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %130.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %89:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %142:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
-    %143:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %130.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %141:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %144:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %130.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %91:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %145:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %130.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %144:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %146:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %137.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %96:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %147:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %137.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %146:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %148:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %137.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %98:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %149:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %137.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %148:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %150:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
-    %151:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %142.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %104:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %152:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %142.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %151:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %153:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %142.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %106:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %154:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %142.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %153:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %155:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
-    %156:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %150.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %110:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    DS_WRITE_B128_gfx9 %1:vgpr_32, %95:vreg_128_align2, 0, 0, implicit $exec :: (store (s128), addrspace 3)
-    %157:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %150.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %156:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %158:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %150.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %112:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %159:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %150.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %158:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %160:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
-    %161:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %155.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %116:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    DS_WRITE_B128_gfx9 %1:vgpr_32, %103:vreg_128_align2, 4096, 0, implicit $exec :: (store (s128), addrspace 3)
-    %162:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %155.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %161:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %163:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %155.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %118:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %164:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %155.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %163:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %165:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
-    %166:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %160.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %124:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %981:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %24:vgpr_32, %6:sgpr_128, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %167:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %160.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %166:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %168:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %160.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %128:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %169:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %160.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %168:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %170:vreg_128_align2 = DS_READ_B128_gfx9 %2:vgpr_32, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
-    %171:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %165.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %132:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %985:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %26:vgpr_32, %6:sgpr_128, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %172:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %165.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %171:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %173:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %165.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %134:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %174:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %165.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %173:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %176:vgpr_32 = IMPLICIT_DEF
-    %175:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
-    %177:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %170.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %138:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %178:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
-    %179:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %170.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %177:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %180:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %170.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %140:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %962:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %170.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %180:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %182:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %175.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %143:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %183:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
-    %961:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %175.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %182:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %185:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %175.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %145:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %960:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %175.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %185:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %187:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %178.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %147:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %956:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %93:vgpr_32, %5:sgpr_128, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %959:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %178.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %187:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %189:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %178.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %149:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %958:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %178.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %189:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %191:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
-    %192:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %183.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %152:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %962:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %101:vgpr_32, %5:sgpr_128, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %957:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %183.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %192:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %194:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %183.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %154:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %956:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %183.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %194:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %196:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
-    %197:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %191.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %157:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    DS_WRITE_B128_gfx9 %1:vgpr_32, %123:vreg_128_align2, 8192, 0, implicit $exec :: (store (s128), addrspace 3)
-    %955:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %191.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %197:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %199:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %191.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %159:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %954:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %191.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %199:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %201:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
-    %202:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %196.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %162:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    DS_WRITE_B128_gfx9 %1:vgpr_32, %129:vreg_128_align2, 12288, 0, implicit $exec :: (store (s128), addrspace 3)
-    %953:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %196.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %202:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %204:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %196.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %164:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %952:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %196.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %204:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %206:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
-    %207:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %201.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %167:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %910:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %121:vgpr_32, %5:sgpr_128, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %951:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %201.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %207:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %209:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %201.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %169:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %950:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %201.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %209:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %911:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %126:vgpr_32, %5:sgpr_128, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    S_WAITCNT 49279
-    S_BARRIER
-    %937:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
-    %211:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %206.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %172:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %949:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %206.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %211:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %213:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %206.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %174:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %948:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %206.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %213:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %931:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 0, 0, implicit $exec :: (load (s128), addrspace 3)
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 512, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 512, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 512, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 512, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_BARRIER 0
-    S_ENDPGM 0
+    %44:vgpr_32 = V_ADD_U32_e32 %3, %43, implicit $exec
+    %45:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %24, %6, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %46:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub2_sub3, %29.sub2_sub3, %42, 0, 0, 0, implicit $mode, implicit $exec
+    %47:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub0_sub1, %28.sub0_sub1, %18, 0, 0, 0, implicit $mode, implicit $exec
+    %48:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub2_sub3, %28.sub2_sub3, %47, 0, 0, 0, implicit $mode, implicit $exec
+    %49:vreg_128_align2 = DS_READ_B128_gfx9 %4, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
+    %50:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub0_sub1, %29.sub0_sub1, %17, 0, 0, 0, implicit $mode, implicit $exec
+    %51:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %26, %6, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %52:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub2_sub3, %29.sub2_sub3, %50, 0, 0, 0, implicit $mode, implicit $exec
+    %53:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub0_sub1, %28.sub0_sub1, %16, 0, 0, 0, implicit $mode, implicit $exec
+    %54:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub2_sub3, %28.sub2_sub3, %53, 0, 0, 0, implicit $mode, implicit $exec
+    %55:vreg_128_align2 = DS_READ_B128_gfx9 %4, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
+    %56:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %49.sub0_sub1, %29.sub0_sub1, %15, 0, 0, 0, implicit $mode, implicit $exec
+    %57:vreg_128_align2 = IMPLICIT_DEF
+    DS_WRITE_B128_gfx9 %1, %57, 16384, 0, implicit $exec :: (store (s128), addrspace 3)
+    %58:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %49.sub2_sub3, %29.sub2_sub3, %56, 0, 0, 0, implicit $mode, implicit $exec
+    %59:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %49.sub0_sub1, %28.sub0_sub1, %14, 0, 0, 0, implicit $mode, implicit $exec
+    %60:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %49.sub2_sub3, %28.sub2_sub3, %59, 0, 0, 0, implicit $mode, implicit $exec
+    %61:vreg_128_align2 = DS_READ_B128_gfx9 %4, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
+    %62:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %55.sub0_sub1, %29.sub0_sub1, %13, 0, 0, 0, implicit $mode, implicit $exec
+    %63:vreg_128_align2 = IMPLICIT_DEF
+    DS_WRITE_B128_gfx9 %1, %63, 20480, 0, implicit $exec :: (store (s128), addrspace 3)
+    %64:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %55.sub2_sub3, %29.sub2_sub3, %62, 0, 0, 0, implicit $mode, implicit $exec
+    %65:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %55.sub0_sub1, %28.sub0_sub1, %12, 0, 0, 0, implicit $mode, implicit $exec
+    %66:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %55.sub2_sub3, %28.sub2_sub3, %65, 0, 0, 0, implicit $mode, implicit $exec
+    %67:vreg_128_align2 = DS_READ_B128_gfx9 %4, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
+    %68:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %61.sub0_sub1, %29.sub0_sub1, %11, 0, 0, 0, implicit $mode, implicit $exec
+    %69:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %24, %6, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
 ...

>From ff7f1d8a50bad8995c90851a8424e8b70f346243 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Sun, 21 Sep 2025 02:33:00 -0400
Subject: [PATCH 10/20] Renamed test file

---
 ....barrier.gfx942.mir => llvm.amdgcn.mfma.anti-hints.gfx942.mir} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename llvm/test/CodeGen/AMDGPU/{llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir => llvm.amdgcn.mfma.anti-hints.gfx942.mir} (100%)

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir
similarity index 100%
rename from llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir
rename to llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir

>From db81fe4bb17890ba880e243d5305522835953335 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Mon, 22 Sep 2025 14:42:41 -0400
Subject: [PATCH 11/20] Added print and parse tests

---
 ...vm.amdgcn.mfma.anti-hints-parse.gfx942.mir | 195 ++++++++++++++++++
 ...vm.amdgcn.mfma.anti-hints-print.gfx942.mir | 126 +++++++++++
 .../llvm.amdgcn.mfma.anti-hints.gfx942.mir    |   4 +-
 3 files changed, 323 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir
new file mode 100644
index 0000000000000..905fff8b642cc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir
@@ -0,0 +1,195 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=greedy,machineverifier,virtregrewriter %s -o - | FileCheck -check-prefix=CHECK %s
+--- |
+  ; ModuleID = '/work/mdssefat/FullTimeWork/MLSCHED/composable_kernel/noopexample/llvm.amdgcn.mfma.hint.haard.barrier.gfx942_short.mir'
+  source_filename = "/work/mdssefat/FullTimeWork/MLSCHED/composable_kernel/noopexample/llvm.amdgcn.mfma.hint.haard.barrier.gfx942_short.mir"
+  target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+  target triple = "amdgcn-amd-amdhsa"
+
+  ; Function Attrs: nounwind
+  define amdgpu_kernel void @test_software_pipelining() #0 {
+  bb.0:
+    ret void
+  }
+
+  attributes #0 = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-waves-per-eu"="2" "frame-pointer"="none" "target-cpu"="gfx942" }
+...
+---
+name:            test_software_pipelining
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
+                                                                                '%27',
+                                                                                '%4',
+                                                                                '%26',
+                                                                                '%25',
+                                                                                '%5',
+                                                                                '%24',
+                                                                                '%22',
+                                                                                '%6',
+                                                                                '%20',
+                                                                                '%19',
+                                                                                '%7',
+                                                                                '%18',
+                                                                                '%16',
+                                                                                '%8' ] }
+  - { id: 1, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
+                                                                                '%16',
+                                                                                '%8',
+                                                                                '%22',
+                                                                                '%6',
+                                                                                '%20',
+                                                                                '%19',
+                                                                                '%7',
+                                                                                '%18' ] }
+  - { id: 2, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 3, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 4, class: vreg_128_align2, preferred-register: '', flags: [  ],
+      anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] }
+  - { id: 5, class: vreg_128_align2, preferred-register: '', flags: [  ],
+      anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] }
+  - { id: 6, class: vreg_128_align2, preferred-register: '', flags: [  ],
+      anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] }
+  - { id: 7, class: vreg_128_align2, preferred-register: '', flags: [  ],
+      anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] }
+  - { id: 8, class: vreg_128_align2, preferred-register: '', flags: [  ],
+      anti-hints: [ '%17', '%1', '%23', '%29', '%0', '%28', '%30', '%9' ] }
+  - { id: 9, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
+                                                                                '%27',
+                                                                                '%4',
+                                                                                '%26',
+                                                                                '%25',
+                                                                                '%5',
+                                                                                '%24',
+                                                                                '%22',
+                                                                                '%6',
+                                                                                '%20',
+                                                                                '%19',
+                                                                                '%7',
+                                                                                '%18',
+                                                                                '%16',
+                                                                                '%8' ] }
+  - { id: 10, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 11, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 12, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 13, class: vreg_128_align2, preferred-register: '', flags: [  ] }
+  - { id: 14, class: vreg_128_align2, preferred-register: '', flags: [  ] }
+  - { id: 15, class: vreg_128_align2, preferred-register: '', flags: [  ] }
+  - { id: 16, class: vreg_128_align2, preferred-register: '', flags: [  ],
+      anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] }
+  - { id: 17, class: vreg_128_align2, preferred-register: '', flags: [  ],
+      anti-hints: [ '%16', '%8' ] }
+  - { id: 18, class: vreg_128_align2, preferred-register: '', flags: [  ],
+      anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] }
+  - { id: 19, class: vreg_128_align2, preferred-register: '', flags: [  ],
+      anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] }
+  - { id: 20, class: vreg_128_align2, preferred-register: '', flags: [  ] }
+  - { id: 21, class: vreg_128_align2, preferred-register: '', flags: [  ] }
+  - { id: 22, class: vreg_128_align2, preferred-register: '', flags: [  ],
+      anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] }
+  - { id: 23, class: vreg_128_align2, preferred-register: '', flags: [  ],
+      anti-hints: [ '%22', '%6', '%20', '%19', '%7', '%18', '%16', '%8' ] }
+  - { id: 24, class: vreg_128_align2, preferred-register: '', flags: [  ] }
+  - { id: 25, class: vreg_128_align2, preferred-register: '', flags: [  ],
+      anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] }
+  - { id: 26, class: vreg_128_align2, preferred-register: '', flags: [  ] }
+  - { id: 27, class: vreg_128_align2, preferred-register: '', flags: [  ] }
+  - { id: 28, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
+                                                                                 '%27',
+                                                                                 '%4',
+                                                                                 '%26',
+                                                                                 '%25',
+                                                                                 '%5',
+                                                                                 '%24',
+                                                                                 '%22',
+                                                                                 '%6',
+                                                                                 '%20',
+                                                                                 '%19',
+                                                                                 '%7',
+                                                                                 '%18',
+                                                                                 '%16',
+                                                                                 '%8' ] }
+  - { id: 29, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
+                                                                                 '%27',
+                                                                                 '%4',
+                                                                                 '%26',
+                                                                                 '%25',
+                                                                                 '%5',
+                                                                                 '%24',
+                                                                                 '%22',
+                                                                                 '%6',
+                                                                                 '%20',
+                                                                                 '%19',
+                                                                                 '%7',
+                                                                                 '%18',
+                                                                                 '%16',
+                                                                                 '%8' ] }
+  - { id: 30, class: vreg_128_align2, preferred-register: '', flags: [  ],
+      anti-hints: [ '%27', '%4', '%26', '%25', '%5', '%24', '%22', '%6',
+                    '%20', '%19', '%7', '%18', '%16', '%8' ] }
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: test_software_pipelining
+    ; CHECK: renamable $vgpr36 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr37 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr20 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr38 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr24 = V_ADD_U32_e32 4096, $vgpr38, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr20 = V_ADD_U32_e32 $vgpr36, killed $vgpr20, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr20 = V_ADD_U32_e32 4096, killed $vgpr20, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr20, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; CHECK-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr24, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; CHECK-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr24_vgpr25, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = DS_READ_B128_gfx9 renamable $vgpr37, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr30_vgpr31, $vgpr26_vgpr27, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr20_vgpr21, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr30_vgpr31, $vgpr22_vgpr23, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr24_vgpr25, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128_gfx9 killed renamable $vgpr37, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr18_vgpr19, $vgpr26_vgpr27, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr20_vgpr21, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr18_vgpr19, killed $vgpr22_vgpr23, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr32_vgpr33, killed $vgpr24_vgpr25, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr20 = IMPLICIT_DEF
+    ; CHECK-NEXT: dead renamable $vgpr20 = V_ADD_U32_e32 killed $vgpr36, killed $vgpr20, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr38, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; CHECK-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr28_vgpr29_vgpr30_vgpr31, implicit killed renamable $vgpr8_vgpr9_vgpr10_vgpr11, implicit killed renamable $vgpr12_vgpr13_vgpr14_vgpr15, implicit killed renamable $vgpr4_vgpr5_vgpr6_vgpr7, implicit killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit killed renamable $vgpr20_vgpr21_vgpr22_vgpr23
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:sgpr_128 = IMPLICIT_DEF
+    %3:vgpr_32 = IMPLICIT_DEF
+    %4:vreg_128_align2 = IMPLICIT_DEF
+    %5:vreg_128_align2 = IMPLICIT_DEF
+    %6:vreg_128_align2 = IMPLICIT_DEF
+    %7:vreg_128_align2 = IMPLICIT_DEF
+    %8:vreg_128_align2 = IMPLICIT_DEF
+    %9:vgpr_32 = IMPLICIT_DEF
+    %10:vgpr_32 = V_ADD_U32_e32 4096, %9, implicit $exec
+    %11:vgpr_32 = V_ADD_U32_e32 %0, %3, implicit $exec
+    %12:vgpr_32 = V_ADD_U32_e32 4096, %11, implicit $exec
+    %13:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %12, %2, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %14:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %10, %2, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %15:vreg_128_align2 = IMPLICIT_DEF
+    %16:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %15.sub0_sub1, %14.sub0_sub1, %8, 0, 0, 0, implicit $mode, implicit $exec
+    %17:vreg_128_align2 = DS_READ_B128_gfx9 %1, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    dead %18:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %15.sub2_sub3, %14.sub2_sub3, %16, 0, 0, 0, implicit $mode, implicit $exec
+    %19:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %15.sub0_sub1, %13.sub0_sub1, %7, 0, 0, 0, implicit $mode, implicit $exec
+    %20:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %15.sub2_sub3, %13.sub2_sub3, %19, 0, 0, 0, implicit $mode, implicit $exec
+    %21:vreg_128_align2 = IMPLICIT_DEF
+    %22:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %21.sub0_sub1, %14.sub0_sub1, %6, 0, 0, 0, implicit $mode, implicit $exec
+    %23:vreg_128_align2 = DS_READ_B128_gfx9 %1, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    %24:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %21.sub2_sub3, %14.sub2_sub3, %22, 0, 0, 0, implicit $mode, implicit $exec
+    %25:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %21.sub0_sub1, %13.sub0_sub1, %5, 0, 0, 0, implicit $mode, implicit $exec
+    %26:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %21.sub2_sub3, %13.sub2_sub3, %25, 0, 0, 0, implicit $mode, implicit $exec
+    %27:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %17.sub0_sub1, %14.sub0_sub1, %4, 0, 0, 0, implicit $mode, implicit $exec
+    %28:vgpr_32 = IMPLICIT_DEF
+    dead %29:vgpr_32 = V_ADD_U32_e32 %0, %28, implicit $exec
+    %30:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %9, %2, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    S_ENDPGM 0, implicit %23, implicit %24, implicit %20, implicit %26, implicit %27, implicit %30
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir
new file mode 100644
index 0000000000000..d55dbb4ea0e5f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir
@@ -0,0 +1,126 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations %s -o - | FileCheck -check-prefix=CHECK %s
+
+--- |
+  target triple = "amdgcn-amd-amdhsa"
+
+  define amdgpu_kernel void @test_software_pipelining() #0 {
+    bb.0:
+      ret void
+  }
+
+  attributes #0 = {nounwind "amdgpu-waves-per-eu"="2"  "amdgpu-agpr-alloc"="0" "frame-pointer"="none"}
+
+...
+---
+name:            test_software_pipelining
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: test_software_pipelining
+    ; CHECK: registers:
+    ; CHECK:  - { id: 0, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
+    ; CHECK-NEXT:{{\s*}}'%27',
+    ; CHECK:{{\s*}}'%8' ] }
+    ; CHECK:  - { id: 1, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
+    ; CHECK-NEXT:{{\s*}}'%16',
+    ; CHECK:{{\s*}}'%18' ] }
+    ; CHECK:  - { id: 4, class: vreg_128_align2, preferred-register: '', flags: [  ],
+    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] }
+    ; CHECK:  - { id: 5, class: vreg_128_align2, preferred-register:  '', flags: [  ],
+    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] }
+    ; CHECK:  - { id: 6, class: vreg_128_align2, preferred-register:  '', flags: [  ],
+    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] }
+    ; CHECK:  - { id: 7, class: vreg_128_align2, preferred-register:  '', flags: [  ],
+    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] }
+    ; CHECK:  - { id: 8, class: vreg_128_align2, preferred-register:  '', flags: [  ],
+    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%17'{{.*}}'%9' ] }
+    ; CHECK:  - { id: 9, class: vgpr_32, preferred-register:  '', flags: [  ], anti-hints: [
+    ; CHECK-NEXT:{{\s*}}'%27',
+    ; CHECK:{{\s*}}'%8' ] }
+    ; CHECK:  - { id: 16, class: vreg_128_align2, preferred-register:  '', flags: [  ],
+    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] }
+    ; CHECK:  - { id: 17, class: vreg_128_align2, preferred-register:  '', flags: [  ],
+    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%16'{{.*}}'%8' ] }
+    ; CHECK:  - { id: 18, class: vreg_128_align2, preferred-register:  '', flags: [  ],
+    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] }
+    ; CHECK:  - { id: 19, class: vreg_128_align2, preferred-register:  '', flags: [  ],
+    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] }
+    ; CHECK:  - { id: 22, class: vreg_128_align2, preferred-register:  '', flags: [  ],
+    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] }
+    ; CHECK:  - { id: 23, class: vreg_128_align2, preferred-register:  '', flags: [  ],
+    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%22'{{.*}}'%8' ] }
+    ; CHECK:  - { id: 25, class: vreg_128_align2, preferred-register:  '', flags: [  ],
+    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] }
+    ; CHECK:  - { id: 28, class: vgpr_32, preferred-register:  '', flags: [  ], anti-hints: [
+    ; CHECK-NEXT:{{\s*}}'%27',
+    ; CHECK:{{\s*}}'%8' ] }
+    ; CHECK:  - { id: 29, class: vgpr_32, preferred-register:  '', flags: [ ], anti-hints: [
+    ; CHECK-NEXT:{{\s*}}'%27',
+    ; CHECK:{{\s*}}'%8' ] }
+    ; CHECK:  - { id: 30, class: vreg_128_align2
+    ; CHECK-NEXT: {{.*}}anti-hints: [ '%27'
+    ; CHECK: {{.*}}'%8' ] }
+    ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF5:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 4096, [[DEF9]], implicit $exec
+    ; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF]], [[DEF3]], implicit $exec
+    ; CHECK-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 4096, [[V_ADD_U32_e32_1]], implicit $exec
+    ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_U32_e32_2]], [[DEF2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_U32_e32_]], [[DEF2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; CHECK-NEXT: [[DEF10:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF10]].sub0_sub1, [[BUFFER_LOAD_DWORDX4_OFFEN1]].sub0_sub1, [[DEF8]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF1]], 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT: dead [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF10]].sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF10]].sub0_sub1, [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0_sub1, [[DEF7]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF10]].sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[DEF11:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF11]].sub0_sub1, [[BUFFER_LOAD_DWORDX4_OFFEN1]].sub0_sub1, [[DEF6]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF1]], 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF11]].sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_4]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_6:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF11]].sub0_sub1, [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0_sub1, [[DEF5]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_7:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF11]].sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_6]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_8:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, [[BUFFER_LOAD_DWORDX4_OFFEN1]].sub0_sub1, [[DEF4]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: dead [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF]], [[DEF12]], implicit $exec
+    ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[DEF9]], [[DEF2]], 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; CHECK-NEXT: S_ENDPGM 0, implicit [[DS_READ_B128_gfx9_1]], implicit [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_5]], implicit [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_3]], implicit [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_7]], implicit [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_8]], implicit [[BUFFER_LOAD_DWORDX4_OFFEN2]]
+    %3:vgpr_32 = IMPLICIT_DEF
+    %4:vgpr_32 = IMPLICIT_DEF
+    %6:sgpr_128 = IMPLICIT_DEF
+    %7:vgpr_32 = IMPLICIT_DEF
+    %19:vreg_128_align2 = IMPLICIT_DEF
+    %20:vreg_128_align2 = IMPLICIT_DEF
+    %21:vreg_128_align2 = IMPLICIT_DEF
+    %22:vreg_128_align2 = IMPLICIT_DEF
+    %23:vreg_128_align2 = IMPLICIT_DEF
+    %25:vgpr_32 = IMPLICIT_DEF
+    %24:vgpr_32 = V_ADD_U32_e32 4096, %25:vgpr_32, implicit $exec
+    %27:vgpr_32 = V_ADD_U32_e32 %3:vgpr_32, %7:vgpr_32, implicit $exec
+    %26:vgpr_32 = V_ADD_U32_e32 4096, %27:vgpr_32, implicit $exec
+    %28:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %26:vgpr_32, %6:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %29:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %24:vgpr_32, %6:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %31:vreg_128_align2 = IMPLICIT_DEF
+    %30:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %23:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %32:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    %33:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %30:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %34:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %22:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %35:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %34:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %37:vreg_128_align2 = IMPLICIT_DEF
+    %36:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %21:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %38:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    %39:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %36:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %40:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %20:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %41:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %40:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %42:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %19:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %43:vgpr_32 = IMPLICIT_DEF
+    %925:vgpr_32 = V_ADD_U32_e32 %3:vgpr_32, %43:vgpr_32, implicit $exec
+    %44:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %25:vgpr_32, %6:sgpr_128, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    S_ENDPGM 0, implicit %38, implicit %39, implicit %35, implicit %41, implicit %42, implicit %44
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir
index 97305f2c8a8f0..d360eccaeb773 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter %s -o - | FileCheck -check-prefix=CHECK %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter -amdgpu-avoid-hazard-hint-for-mfma=false - %s -o - | FileCheck -check-prefix=CHECK-NO-ANTIHINT %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter %s -o - | FileCheck %s --check-prefix=CHECK
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter -amdgpu-avoid-hazard-hint-for-mfma=false %s -o - | FileCheck %s --check-prefix=CHECK-NO-ANTIHINT
 
 --- |
   target triple = "amdgcn-amd-amdhsa"

>From 51d34ed9dc5beb527507aec3cb1903b00f1b2dc8 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Mon, 22 Sep 2025 15:34:06 -0400
Subject: [PATCH 12/20] Fixed typo

---
 llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index f63eea716d68b..e0eecb06e2d32 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -323,7 +323,7 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
               // Check if MFMA register is dead at current instruction
               const LiveInterval &MFMAInterval = LIS->getInterval(MFMAReg);
               if (!MFMAInterval.liveAt(CurrentSlot)) {
-                // Add bidirectional antihints
+                // Add bi-directional anti-hints
                 MRI->addRegAllocationAntiHints(CandidateReg, MFMARegs);
                 MRI->addRegAllocationAntiHints(MFMAReg, CandidateReg);
               }

>From afb24b12e9d8a3d93dfe0cd2e7cbec5e11cb03f5 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Mon, 22 Sep 2025 18:02:20 -0400
Subject: [PATCH 13/20] Fixed typo

---
 llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index e0eecb06e2d32..098ca1120c85c 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -324,7 +324,7 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
               const LiveInterval &MFMAInterval = LIS->getInterval(MFMAReg);
               if (!MFMAInterval.liveAt(CurrentSlot)) {
                 // Add bi-directional anti-hints
-                MRI->addRegAllocationAntiHints(CandidateReg, MFMARegs);
+                MRI->addRegAllocationAntiHints(CandidateReg, MFMAReg);
                 MRI->addRegAllocationAntiHints(MFMAReg, CandidateReg);
               }
             }

>From a1a7833373387f6b699b674623061a4e95d853f1 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Mon, 22 Sep 2025 18:28:30 -0400
Subject: [PATCH 14/20] Fixed test!

---
 ...lvm.amdgcn.mfma.anti-hints-print.gfx942.mir | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir
index d55dbb4ea0e5f..c6de026d447fd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir
@@ -18,11 +18,11 @@ body:             |
     ; CHECK-LABEL: name: test_software_pipelining
     ; CHECK: registers:
     ; CHECK:  - { id: 0, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
-    ; CHECK-NEXT:{{\s*}}'%27',
+    ; CHECK-NEXT:{{\s*}}'%4',
     ; CHECK:{{\s*}}'%8' ] }
     ; CHECK:  - { id: 1, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
-    ; CHECK-NEXT:{{\s*}}'%16',
-    ; CHECK:{{\s*}}'%18' ] }
+    ; CHECK-NEXT:{{\s*}}'%8',
+    ; CHECK:{{\s*}}'%16' ] }
     ; CHECK:  - { id: 4, class: vreg_128_align2, preferred-register: '', flags: [  ],
     ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] }
     ; CHECK:  - { id: 5, class: vreg_128_align2, preferred-register:  '', flags: [  ],
@@ -34,12 +34,12 @@ body:             |
     ; CHECK:  - { id: 8, class: vreg_128_align2, preferred-register:  '', flags: [  ],
     ; CHECK-NEXT:{{\s*}}anti-hints: [ '%17'{{.*}}'%9' ] }
     ; CHECK:  - { id: 9, class: vgpr_32, preferred-register:  '', flags: [  ], anti-hints: [
-    ; CHECK-NEXT:{{\s*}}'%27',
+    ; CHECK-NEXT:{{\s*}}'%4',
     ; CHECK:{{\s*}}'%8' ] }
     ; CHECK:  - { id: 16, class: vreg_128_align2, preferred-register:  '', flags: [  ],
     ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] }
     ; CHECK:  - { id: 17, class: vreg_128_align2, preferred-register:  '', flags: [  ],
-    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%16'{{.*}}'%8' ] }
+    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%8' ] }
     ; CHECK:  - { id: 18, class: vreg_128_align2, preferred-register:  '', flags: [  ],
     ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] }
     ; CHECK:  - { id: 19, class: vreg_128_align2, preferred-register:  '', flags: [  ],
@@ -47,17 +47,17 @@ body:             |
     ; CHECK:  - { id: 22, class: vreg_128_align2, preferred-register:  '', flags: [  ],
     ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] }
     ; CHECK:  - { id: 23, class: vreg_128_align2, preferred-register:  '', flags: [  ],
-    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%22'{{.*}}'%8' ] }
+    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%6'{{.*}}'%8' ] }
     ; CHECK:  - { id: 25, class: vreg_128_align2, preferred-register:  '', flags: [  ],
     ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] }
     ; CHECK:  - { id: 28, class: vgpr_32, preferred-register:  '', flags: [  ], anti-hints: [
-    ; CHECK-NEXT:{{\s*}}'%27',
+    ; CHECK-NEXT:{{\s*}}'%4',
     ; CHECK:{{\s*}}'%8' ] }
     ; CHECK:  - { id: 29, class: vgpr_32, preferred-register:  '', flags: [ ], anti-hints: [
-    ; CHECK-NEXT:{{\s*}}'%27',
+    ; CHECK-NEXT:{{\s*}}'%4',
     ; CHECK:{{\s*}}'%8' ] }
     ; CHECK:  - { id: 30, class: vreg_128_align2
-    ; CHECK-NEXT: {{.*}}anti-hints: [ '%27'
+    ; CHECK-NEXT: {{.*}}anti-hints: [ '%4'
     ; CHECK: {{.*}}'%8' ] }
     ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF

>From f7d920f1bf4bb6590ae1593fb3e2a23ef1086656 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Mon, 22 Sep 2025 18:33:12 -0400
Subject: [PATCH 15/20] Fixed test

---
 ...vm.amdgcn.mfma.anti-hints-parse.gfx942.mir | 146 ++++++++----------
 1 file changed, 64 insertions(+), 82 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir
index 905fff8b642cc..89ac0978a0f72 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir
@@ -16,56 +16,46 @@
 ---
 name:            test_software_pipelining
 registers:
-  - { id: 0, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
-                                                                                '%27',
-                                                                                '%4',
-                                                                                '%26',
-                                                                                '%25',
-                                                                                '%5',
-                                                                                '%24',
-                                                                                '%22',
-                                                                                '%6',
-                                                                                '%20',
-                                                                                '%19',
-                                                                                '%7',
-                                                                                '%18',
-                                                                                '%16',
+  - { id: 0, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [ 
+                                                                                '%4', 
+                                                                                '%25', 
+                                                                                '%5', 
+                                                                                '%22', 
+                                                                                '%6', 
+                                                                                '%19', 
+                                                                                '%7', 
+                                                                                '%18', 
+                                                                                '%16', 
                                                                                 '%8' ] }
-  - { id: 1, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
-                                                                                '%16',
-                                                                                '%8',
-                                                                                '%22',
-                                                                                '%6',
-                                                                                '%20',
-                                                                                '%19',
-                                                                                '%7',
-                                                                                '%18' ] }
+  - { id: 1, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [ 
+                                                                                '%8', 
+                                                                                '%6', 
+                                                                                '%19', 
+                                                                                '%7', 
+                                                                                '%18', 
+                                                                                '%16' ] }
   - { id: 2, class: sgpr_128, preferred-register: '', flags: [  ] }
   - { id: 3, class: vgpr_32, preferred-register: '', flags: [  ] }
-  - { id: 4, class: vreg_128_align2, preferred-register: '', flags: [  ],
+  - { id: 4, class: vreg_128_align2, preferred-register: '', flags: [  ], 
       anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] }
-  - { id: 5, class: vreg_128_align2, preferred-register: '', flags: [  ],
+  - { id: 5, class: vreg_128_align2, preferred-register: '', flags: [  ], 
       anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] }
-  - { id: 6, class: vreg_128_align2, preferred-register: '', flags: [  ],
+  - { id: 6, class: vreg_128_align2, preferred-register: '', flags: [  ], 
       anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] }
-  - { id: 7, class: vreg_128_align2, preferred-register: '', flags: [  ],
+  - { id: 7, class: vreg_128_align2, preferred-register: '', flags: [  ], 
       anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] }
-  - { id: 8, class: vreg_128_align2, preferred-register: '', flags: [  ],
+  - { id: 8, class: vreg_128_align2, preferred-register: '', flags: [  ], 
       anti-hints: [ '%17', '%1', '%23', '%29', '%0', '%28', '%30', '%9' ] }
-  - { id: 9, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
-                                                                                '%27',
-                                                                                '%4',
-                                                                                '%26',
-                                                                                '%25',
-                                                                                '%5',
-                                                                                '%24',
-                                                                                '%22',
-                                                                                '%6',
-                                                                                '%20',
-                                                                                '%19',
-                                                                                '%7',
-                                                                                '%18',
-                                                                                '%16',
+  - { id: 9, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [ 
+                                                                                '%4', 
+                                                                                '%25', 
+                                                                                '%5', 
+                                                                                '%22', 
+                                                                                '%6', 
+                                                                                '%19', 
+                                                                                '%7', 
+                                                                                '%18', 
+                                                                                '%16', 
                                                                                 '%8' ] }
   - { id: 10, class: vgpr_32, preferred-register: '', flags: [  ] }
   - { id: 11, class: vgpr_32, preferred-register: '', flags: [  ] }
@@ -73,58 +63,50 @@ registers:
   - { id: 13, class: vreg_128_align2, preferred-register: '', flags: [  ] }
   - { id: 14, class: vreg_128_align2, preferred-register: '', flags: [  ] }
   - { id: 15, class: vreg_128_align2, preferred-register: '', flags: [  ] }
-  - { id: 16, class: vreg_128_align2, preferred-register: '', flags: [  ],
+  - { id: 16, class: vreg_128_align2, preferred-register: '', flags: [  ], 
       anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] }
-  - { id: 17, class: vreg_128_align2, preferred-register: '', flags: [  ],
-      anti-hints: [ '%16', '%8' ] }
-  - { id: 18, class: vreg_128_align2, preferred-register: '', flags: [  ],
+  - { id: 17, class: vreg_128_align2, preferred-register: '', flags: [  ], 
+      anti-hints: [ '%8' ] }
+  - { id: 18, class: vreg_128_align2, preferred-register: '', flags: [  ], 
       anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] }
-  - { id: 19, class: vreg_128_align2, preferred-register: '', flags: [  ],
+  - { id: 19, class: vreg_128_align2, preferred-register: '', flags: [  ], 
       anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] }
   - { id: 20, class: vreg_128_align2, preferred-register: '', flags: [  ] }
   - { id: 21, class: vreg_128_align2, preferred-register: '', flags: [  ] }
-  - { id: 22, class: vreg_128_align2, preferred-register: '', flags: [  ],
+  - { id: 22, class: vreg_128_align2, preferred-register: '', flags: [  ], 
       anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] }
-  - { id: 23, class: vreg_128_align2, preferred-register: '', flags: [  ],
-      anti-hints: [ '%22', '%6', '%20', '%19', '%7', '%18', '%16', '%8' ] }
+  - { id: 23, class: vreg_128_align2, preferred-register: '', flags: [  ], 
+      anti-hints: [ '%6', '%19', '%7', '%18', '%16', '%8' ] }
   - { id: 24, class: vreg_128_align2, preferred-register: '', flags: [  ] }
-  - { id: 25, class: vreg_128_align2, preferred-register: '', flags: [  ],
+  - { id: 25, class: vreg_128_align2, preferred-register: '', flags: [  ], 
       anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] }
   - { id: 26, class: vreg_128_align2, preferred-register: '', flags: [  ] }
   - { id: 27, class: vreg_128_align2, preferred-register: '', flags: [  ] }
-  - { id: 28, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
-                                                                                 '%27',
-                                                                                 '%4',
-                                                                                 '%26',
-                                                                                 '%25',
-                                                                                 '%5',
-                                                                                 '%24',
-                                                                                 '%22',
-                                                                                 '%6',
-                                                                                 '%20',
-                                                                                 '%19',
-                                                                                 '%7',
-                                                                                 '%18',
-                                                                                 '%16',
+  - { id: 28, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [ 
+                                                                                 '%4', 
+                                                                                 '%25', 
+                                                                                 '%5', 
+                                                                                 '%22', 
+                                                                                 '%6', 
+                                                                                 '%19', 
+                                                                                 '%7', 
+                                                                                 '%18', 
+                                                                                 '%16', 
                                                                                  '%8' ] }
-  - { id: 29, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
-                                                                                 '%27',
-                                                                                 '%4',
-                                                                                 '%26',
-                                                                                 '%25',
-                                                                                 '%5',
-                                                                                 '%24',
-                                                                                 '%22',
-                                                                                 '%6',
-                                                                                 '%20',
-                                                                                 '%19',
-                                                                                 '%7',
-                                                                                 '%18',
-                                                                                 '%16',
+  - { id: 29, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [ 
+                                                                                 '%4', 
+                                                                                 '%25', 
+                                                                                 '%5', 
+                                                                                 '%22', 
+                                                                                 '%6', 
+                                                                                 '%19', 
+                                                                                 '%7', 
+                                                                                 '%18', 
+                                                                                 '%16', 
                                                                                  '%8' ] }
-  - { id: 30, class: vreg_128_align2, preferred-register: '', flags: [  ],
-      anti-hints: [ '%27', '%4', '%26', '%25', '%5', '%24', '%22', '%6',
-                    '%20', '%19', '%7', '%18', '%16', '%8' ] }
+  - { id: 30, class: vreg_128_align2, preferred-register: '', flags: [  ], 
+      anti-hints: [ '%4', '%25', '%5', '%22', '%6', '%19', '%7', '%18', 
+                    '%16', '%8' ] }
 body:             |
   bb.0:
     ; CHECK-LABEL: name: test_software_pipelining

>From 69d5b6e8de87af4c78c8e09fad85c1f7b8da40b5 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Mon, 22 Sep 2025 19:10:29 -0400
Subject: [PATCH 16/20] Fixed typo

---
 llvm/lib/CodeGen/AllocationOrder.cpp | 4 ++--
 llvm/lib/CodeGen/AllocationOrder.h   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/AllocationOrder.cpp b/llvm/lib/CodeGen/AllocationOrder.cpp
index f57df79128c64..f420c96e212d0 100644
--- a/llvm/lib/CodeGen/AllocationOrder.cpp
+++ b/llvm/lib/CodeGen/AllocationOrder.cpp
@@ -62,7 +62,7 @@ AllocationOrder AllocationOrder::create(Register VirtReg, const VirtRegMap &VRM,
   // Create allocation order object
   AllocationOrder AO(std::move(Hints), Order, HardHints);
   
-  // Apply anti-hint filtering if needed
+  // Apply anti-hints filtering if needed
   if (!AntiHintedPhysRegs.empty()) {
     AO.applyAntiHints(AntiHintedPhysRegs, TRI);
     
@@ -103,7 +103,7 @@ void AllocationOrder::applyAntiHints(ArrayRef<MCPhysReg> AntiHintedPhysRegs,
     }
   }
   
-  // Update Order to point to our filtered storage
+  // Update Order
   Order = FilteredOrderStorage;
   
   LLVM_DEBUG({
diff --git a/llvm/lib/CodeGen/AllocationOrder.h b/llvm/lib/CodeGen/AllocationOrder.h
index 842f83d957a6d..029d9c83baf35 100644
--- a/llvm/lib/CodeGen/AllocationOrder.h
+++ b/llvm/lib/CodeGen/AllocationOrder.h
@@ -120,7 +120,7 @@ class LLVM_LIBRARY_VISIBILITY AllocationOrder {
     return Reg.isPhysical() && is_contained(Hints, Reg.id());
   }
   
-  /// Apply antihint to the allocation order.
+  /// Apply anti-hints to the allocation order.
   void applyAntiHints(ArrayRef<MCPhysReg> AntiHintedPhysRegs, 
                       const TargetRegisterInfo *TRI);
 

>From 3ff89f1037604cd76e4b67fb8c3d3ff2a6e457e9 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Mon, 22 Sep 2025 19:31:15 -0400
Subject: [PATCH 17/20] [AMDGPU] Anti-hints in register allocation

---
 .../include/llvm/CodeGen/MIRParser/MIParser.h |  2 +-
 llvm/include/llvm/CodeGen/MIRYamlMapping.h    |  5 +++--
 .../llvm/CodeGen/MachineRegisterInfo.h        | 19 +++++++++--------
 llvm/lib/CodeGen/AllocationOrder.cpp          | 21 +++++++++----------
 llvm/lib/CodeGen/AllocationOrder.h            |  5 ++---
 llvm/lib/CodeGen/MIRParser/MIRParser.cpp      |  7 +++----
 llvm/lib/CodeGen/MachineRegisterInfo.cpp      | 12 +++++------
 7 files changed, 35 insertions(+), 36 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h
index 1d0a745d5f983..cf7a56587397d 100644
--- a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h
+++ b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h
@@ -45,7 +45,7 @@ struct VRegInfo {
   } D;
   Register VReg;
   Register PreferredReg;
-  SmallVector<Register, 4> AntiHints;  // Anti-hints
+  SmallVector<Register, 4> AntiHints; // Anti-hints
   uint8_t Flags = 0;
 };
 
diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
index 20cc3c370dc66..9c0056fc03376 100644
--- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h
+++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
@@ -210,9 +210,10 @@ template <> struct MappingTraits<VirtualRegisterDefinition> {
                        StringValue()); // Don't print out when it's empty.
     YamlIO.mapOptional("flags", Reg.RegisterFlags,
                        std::vector<FlowStringValue>());
-    if(!YamlIO.outputting() || !Reg.AntiHints.empty()) {  // Only map when parsing or anti-hints present
+    if (!YamlIO.outputting() ||
+        !Reg.AntiHints.empty()) { // Only map when parsing or anti-hints present
       YamlIO.mapOptional("anti-hints", Reg.AntiHints,
-                       std::vector<FlowStringValue>());  // for anti-hints
+                         std::vector<FlowStringValue>()); // for anti-hints
     }
   }
 
diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index bcee5d6b30439..5f00aeebb46fe 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -880,20 +880,21 @@ class MachineRegisterInfo {
       AntiHints.push_back(AntiHintVReg);
   }
 
-  /// addRegAllocationAntiHint - Add multiple anti-hints at once
-  void addRegAllocationAntiHints(Register VReg, ArrayRef<Register> AntiHintVRegs) {
+  /// addRegAllocationAntiHints - Add multiple anti-hints at once.
+  void addRegAllocationAntiHints(Register VReg,
+                                 ArrayRef<Register> AntiHintVRegs) {
     for (Register AntiHint : AntiHintVRegs)
       setRegAllocationAntiHint(VReg, AntiHint);
   }
 
-  /// clearRegAllocationAntiHints - Clear all anti-hints for a register
+  /// clearRegAllocationAntiHints - Clear all anti-hints for a register.
   void clearRegAllocationAntiHints(Register VReg) {
     assert(VReg.isVirtual());
     if (AntiHintRegs.inBounds(VReg))
       AntiHintRegs[VReg].clear();
   }
 
-  /// getRegAllocationAntiHints - Return the vector of anti-hints for VReg
+  /// getRegAllocationAntiHints - Return the vector of anti-hints for VReg.
   ArrayRef<Register> getRegAllocationAntiHints(Register VReg) const {
     assert(VReg.isVirtual());
     if (!AntiHintRegs.inBounds(VReg))
@@ -901,7 +902,7 @@ class MachineRegisterInfo {
     return AntiHintRegs[VReg];
   }
 
-  /// hasRegAllocationAntiHint - Check if VReg has AntiHintVReg as an anti-hint
+  /// hasRegAllocationAntiHint - Check if VReg has AntiHintVReg as an anti-hint.
   bool hasRegAllocationAntiHint(Register VReg, Register AntiHintVReg) const {
     assert(VReg.isVirtual() && AntiHintVReg.isVirtual());
     if (!AntiHintRegs.inBounds(VReg))
@@ -910,11 +911,11 @@ class MachineRegisterInfo {
     return llvm::find(AntiHints, AntiHintVReg) != AntiHints.end();
   }
 
-  /// getPhysRegAntiHints - Get the set of physical registers to avoid based on
-  /// anti-hints and current allocations. This is called during allocation.
+  /// getPhysRegAntiHints - Get the set of physical registers to avoid.
   /// VRM is the current virtual register map showing allocations made so far.
-  void getPhysRegAntiHints(Register VReg, SmallVectorImpl<MCPhysReg> &PhysAntiHints,
-                          const VirtRegMap *VRM) const;
+  void getPhysRegAntiHints(Register VReg,
+                           SmallVectorImpl<MCPhysReg> &PhysAntiHints,
+                           const VirtRegMap *VRM) const;
 
   /// markUsesInDebugValueAsUndef - Mark every DBG_VALUE referencing the
   /// specified register as undefined which causes the DBG_VALUE to be
diff --git a/llvm/lib/CodeGen/AllocationOrder.cpp b/llvm/lib/CodeGen/AllocationOrder.cpp
index f420c96e212d0..8550759f97e8a 100644
--- a/llvm/lib/CodeGen/AllocationOrder.cpp
+++ b/llvm/lib/CodeGen/AllocationOrder.cpp
@@ -49,7 +49,7 @@ AllocationOrder AllocationOrder::create(Register VirtReg, const VirtRegMap &VRM,
   // Get anti-hints
   SmallVector<MCPhysReg, 16> AntiHintedPhysRegs;
   MRI.getPhysRegAntiHints(VirtReg, AntiHintedPhysRegs, &VRM);
-  
+
   LLVM_DEBUG({
     if (!AntiHintedPhysRegs.empty()) {
       dbgs() << "anti-hints:";
@@ -58,14 +58,14 @@ AllocationOrder AllocationOrder::create(Register VirtReg, const VirtRegMap &VRM,
       dbgs() << '\n';
     }
   });
-  
+
   // Create allocation order object
   AllocationOrder AO(std::move(Hints), Order, HardHints);
-  
+
   // Apply anti-hints filtering if needed
   if (!AntiHintedPhysRegs.empty()) {
     AO.applyAntiHints(AntiHintedPhysRegs, TRI);
-    
+
     LLVM_DEBUG({
       if (!AO.Hints.empty()) {
         dbgs() << "filtered hints:";
@@ -76,38 +76,37 @@ AllocationOrder AllocationOrder::create(Register VirtReg, const VirtRegMap &VRM,
     });
   }
 
-
   assert(all_of(AO.Hints,
                 [&](MCPhysReg Hint) { return is_contained(AO.Order, Hint); }) &&
          "Target hint is outside allocation order.");
   return AO;
 }
 
-void AllocationOrder::applyAntiHints(ArrayRef<MCPhysReg> AntiHintedPhysRegs, 
+void AllocationOrder::applyAntiHints(ArrayRef<MCPhysReg> AntiHintedPhysRegs,
                                      const TargetRegisterInfo *TRI) {
   // Create filtered order
   FilteredOrderStorage.clear();
   FilteredOrderStorage.reserve(Order.size());
-  
+
   // Add non-anti-hinted registers first
   for (MCPhysReg PhysReg : Order) {
     if (!is_contained(AntiHintedPhysRegs, PhysReg)) {
       FilteredOrderStorage.push_back(PhysReg);
     }
   }
-  
+
   // Add anti-hinted registers at the end as last resort
   for (MCPhysReg PhysReg : Order) {
     if (is_contained(AntiHintedPhysRegs, PhysReg)) {
       FilteredOrderStorage.push_back(PhysReg);
     }
   }
-  
+
   // Update Order
   Order = FilteredOrderStorage;
-  
+
   LLVM_DEBUG({
-    dbgs() << "moved " << AntiHintedPhysRegs.size() 
+    dbgs() << "moved " << AntiHintedPhysRegs.size()
            << " anti-hinted registers to end of allocation order\n";
   });
 }
diff --git a/llvm/lib/CodeGen/AllocationOrder.h b/llvm/lib/CodeGen/AllocationOrder.h
index 029d9c83baf35..cda5fd08e0af6 100644
--- a/llvm/lib/CodeGen/AllocationOrder.h
+++ b/llvm/lib/CodeGen/AllocationOrder.h
@@ -119,11 +119,10 @@ class LLVM_LIBRARY_VISIBILITY AllocationOrder {
                static_cast<uint32_t>(std::numeric_limits<MCPhysReg>::max()));
     return Reg.isPhysical() && is_contained(Hints, Reg.id());
   }
-  
+
   /// Apply anti-hints to the allocation order.
-  void applyAntiHints(ArrayRef<MCPhysReg> AntiHintedPhysRegs, 
+  void applyAntiHints(ArrayRef<MCPhysReg> AntiHintedPhysRegs,
                       const TargetRegisterInfo *TRI);
-
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
index d63f8040de331..f1c89d03a3281 100644
--- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -739,13 +739,12 @@ bool MIRParserImpl::parseRegisterInfo(PerFunctionMIParsingState &PFS,
     for (const auto &AntiHintValue : VReg.AntiHints) {
       if (Info.Kind != VRegInfo::NORMAL)
         return error(VReg.Class.SourceRange.Start,
-              Twine("anti-hints can only be set for normal vregs"));
+                     Twine("anti-hints can only be set for normal vregs"));
 
       Register AntiHintReg;
-      if (parseRegisterReference(PFS, AntiHintReg,
-                                 AntiHintValue.Value, Error))
+      if (parseRegisterReference(PFS, AntiHintReg, AntiHintValue.Value, Error))
         return error(Error, AntiHintValue.SourceRange);
-      
+
       Info.AntiHints.push_back(AntiHintReg);
     }
 
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index bbf03830b3bd5..6ecc2119840d1 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -676,16 +676,16 @@ bool MachineRegisterInfo::isReservedRegUnit(unsigned Unit) const {
   return false;
 }
 
-void MachineRegisterInfo::getPhysRegAntiHints(Register VReg, 
-                                             SmallVectorImpl<MCPhysReg> &PhysAntiHints,
-                                             const VirtRegMap *VRM) const {
+void MachineRegisterInfo::getPhysRegAntiHints(
+    Register VReg, SmallVectorImpl<MCPhysReg> &PhysAntiHints,
+    const VirtRegMap *VRM) const {
   assert(VReg.isVirtual());
   if (!AntiHintRegs.inBounds(VReg) || !VRM)
     return;
-  
+
   const auto &AntiHints = AntiHintRegs[VReg];
   const TargetRegisterInfo *TRI = getTargetRegisterInfo();
-  
+
   for (Register AntiHintVReg : AntiHints) {
     // Check if the anti-hinted register has been allocated
     if (VRM->hasPhys(AntiHintVReg)) {
@@ -696,7 +696,7 @@ void MachineRegisterInfo::getPhysRegAntiHints(Register VReg,
       }
     }
   }
-  
+
   // Remove duplicates
   llvm::sort(PhysAntiHints);
   PhysAntiHints.erase(llvm::unique(PhysAntiHints), PhysAntiHints.end());

>From 80287fb35f761e436a71f1264845eff5dac67732 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Tue, 23 Sep 2025 11:48:50 -0400
Subject: [PATCH 18/20] Modified flag name to reflect anti-hints

---
 llvm/include/llvm/CodeGen/MachineRegisterInfo.h    |  4 ++--
 llvm/lib/CodeGen/MachineRegisterInfo.cpp           |  6 ++++--
 llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp   | 14 ++++++--------
 .../llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir   |  6 ------
 .../AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir  |  2 +-
 5 files changed, 13 insertions(+), 19 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index 5f00aeebb46fe..6c3d4c9b2515b 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -874,7 +874,7 @@ class MachineRegisterInfo {
     assert(VReg.isVirtual() && "Anti-hints are only for virtual registers");
     assert(AntiHintVReg.isVirtual() && "Anti-hint target must be virtual");
     AntiHintRegs.grow(Register::index2VirtReg(getNumVirtRegs()));
-    auto &AntiHints = AntiHintRegs[VReg];
+    SmallVector<Register, 4> &AntiHints = AntiHintRegs[VReg];
     // Avoid duplicates
     if (llvm::find(AntiHints, AntiHintVReg) == AntiHints.end())
       AntiHints.push_back(AntiHintVReg);
@@ -907,7 +907,7 @@ class MachineRegisterInfo {
     assert(VReg.isVirtual() && AntiHintVReg.isVirtual());
     if (!AntiHintRegs.inBounds(VReg))
       return false;
-    const auto &AntiHints = AntiHintRegs[VReg];
+    const SmallVector<Register, 4> &AntiHints = AntiHintRegs[VReg];
     return llvm::find(AntiHints, AntiHintVReg) != AntiHints.end();
   }
 
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index 6ecc2119840d1..0b49fb0a08b94 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -11,12 +11,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/Register.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -683,7 +685,7 @@ void MachineRegisterInfo::getPhysRegAntiHints(
   if (!AntiHintRegs.inBounds(VReg) || !VRM)
     return;
 
-  const auto &AntiHints = AntiHintRegs[VReg];
+  const SmallVector<Register, 4> &AntiHints = AntiHintRegs[VReg];
   const TargetRegisterInfo *TRI = getTargetRegisterInfo();
 
   for (Register AntiHintVReg : AntiHints) {
@@ -700,4 +702,4 @@ void MachineRegisterInfo::getPhysRegAntiHints(
   // Remove duplicates
   llvm::sort(PhysAntiHints);
   PhysAntiHints.erase(llvm::unique(PhysAntiHints), PhysAntiHints.end());
-}
\ No newline at end of file
+}
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 098ca1120c85c..0a08cbbdbf2dc 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -34,7 +34,6 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -44,11 +43,11 @@ using namespace llvm;
 
 #define DEBUG_TYPE "amdgpu-pre-ra-optimizations"
 
-static cl::opt<bool> EnableRegisterAvoidListForMFMARegs(
-    "amdgpu-avoid-hazard-hint-for-mfma", cl::Hidden,
-    cl::desc("Enable Register Avoidance for "
-             "MFMA in GCNPreRAOptimizations stage."),
-    cl::init(true));
+static cl::opt<bool>
+    EnableAntiHintsForMFMARegs("amdgpu-anti-hints-for-mfma", cl::Hidden,
+                               cl::desc("Enable Anti-Hints for "
+                                        "MFMA in GCNPreRAOptimizations stage."),
+                               cl::init(true));
 
 namespace {
 
@@ -256,10 +255,9 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
   bool Changed = false;
 
   // Single pass implementation
-  if (EnableRegisterAvoidListForMFMARegs && ST.hasMAIInsts()) {
+  if (EnableAntiHintsForMFMARegs && ST.hasMAIInsts()) {
     // Max lookback window for RAW or WAW hazard
     constexpr unsigned MaxLookbackWindow = 19;
-    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     for (const MachineBasicBlock &MBB : MF) {
 
       SmallVector<std::pair<SlotIndex, SmallVector<Register, 4>>, 16>
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir
index 89ac0978a0f72..58c7d71089b1b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir
@@ -1,11 +1,5 @@
 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -debug -run-pass=greedy,machineverifier,virtregrewriter %s -o - | FileCheck -check-prefix=CHECK %s
 --- |
-  ; ModuleID = '/work/mdssefat/FullTimeWork/MLSCHED/composable_kernel/noopexample/llvm.amdgcn.mfma.hint.haard.barrier.gfx942_short.mir'
-  source_filename = "/work/mdssefat/FullTimeWork/MLSCHED/composable_kernel/noopexample/llvm.amdgcn.mfma.hint.haard.barrier.gfx942_short.mir"
-  target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
-  target triple = "amdgcn-amd-amdhsa"
-
-  ; Function Attrs: nounwind
   define amdgpu_kernel void @test_software_pipelining() #0 {
   bb.0:
     ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir
index d360eccaeb773..ba89b09539113 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter %s -o - | FileCheck %s --check-prefix=CHECK
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter -amdgpu-avoid-hazard-hint-for-mfma=false %s -o - | FileCheck %s --check-prefix=CHECK-NO-ANTIHINT
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter -amdgpu-anti-hints-for-mfma=false %s -o - | FileCheck %s --check-prefix=CHECK-NO-ANTIHINT
 
 --- |
   target triple = "amdgcn-amd-amdhsa"

>From 3e745751f91df92d0e233cf3f5817bc48ffad638 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Thu, 25 Sep 2025 17:58:12 -0400
Subject: [PATCH 19/20] [NFC] Restore and remove to move MIR serialization
 changes to separate PR

(cherry picked from commit 7732ae8ae1080ab030db1939141350abc7aa265d)
---
 .../include/llvm/CodeGen/MIRParser/MIParser.h |   1 -
 llvm/include/llvm/CodeGen/MIRYamlMapping.h    |   6 -
 llvm/lib/CodeGen/MIRParser/MIRParser.cpp      |  18 --
 llvm/lib/CodeGen/MIRPrinter.cpp               |  11 --
 ...vm.amdgcn.mfma.anti-hints-parse.gfx942.mir | 171 ------------------
 ...vm.amdgcn.mfma.anti-hints-print.gfx942.mir | 126 -------------
 6 files changed, 333 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir
 delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir

diff --git a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h
index cf7a56587397d..0f2898d3554d0 100644
--- a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h
+++ b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h
@@ -45,7 +45,6 @@ struct VRegInfo {
   } D;
   Register VReg;
   Register PreferredReg;
-  SmallVector<Register, 4> AntiHints; // Anti-hints
   uint8_t Flags = 0;
 };
 
diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
index 9c0056fc03376..e80c13885805b 100644
--- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h
+++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
@@ -192,7 +192,6 @@ struct VirtualRegisterDefinition {
   StringValue Class;
   StringValue PreferredRegister;
   std::vector<FlowStringValue> RegisterFlags;
-  std::vector<FlowStringValue> AntiHints;
 
   // TODO: Serialize the target specific register hints.
 
@@ -210,11 +209,6 @@ template <> struct MappingTraits<VirtualRegisterDefinition> {
                        StringValue()); // Don't print out when it's empty.
     YamlIO.mapOptional("flags", Reg.RegisterFlags,
                        std::vector<FlowStringValue>());
-    if (!YamlIO.outputting() ||
-        !Reg.AntiHints.empty()) { // Only map when parsing or anti-hints present
-      YamlIO.mapOptional("anti-hints", Reg.AntiHints,
-                         std::vector<FlowStringValue>()); // for anti-hints
-    }
   }
 
   static const bool flow = true;
diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
index f1c89d03a3281..0f792b0ef206c 100644
--- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -735,19 +735,6 @@ bool MIRParserImpl::parseRegisterInfo(PerFunctionMIParsingState &PFS,
                          FlagStringValue.Value + "'");
       Info.Flags |= FlagValue;
     }
-
-    for (const auto &AntiHintValue : VReg.AntiHints) {
-      if (Info.Kind != VRegInfo::NORMAL)
-        return error(VReg.Class.SourceRange.Start,
-                     Twine("anti-hints can only be set for normal vregs"));
-
-      Register AntiHintReg;
-      if (parseRegisterReference(PFS, AntiHintReg, AntiHintValue.Value, Error))
-        return error(Error, AntiHintValue.SourceRange);
-
-      Info.AntiHints.push_back(AntiHintReg);
-    }
-
     RegInfo.noteNewVirtualRegister(Info.VReg);
   }
 
@@ -814,11 +801,6 @@ bool MIRParserImpl::setupRegisterInfo(const PerFunctionMIParsingState &PFS,
       MRI.setRegClass(Reg, Info.D.RC);
       if (Info.PreferredReg != 0)
         MRI.setSimpleHint(Reg, Info.PreferredReg);
-
-      for (Register AntiHint : Info.AntiHints) {
-        if (AntiHint != 0)
-          MRI.setRegAllocationAntiHint(Reg, AntiHint);
-      }
       break;
     case VRegInfo::GENERIC:
       break;
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index f263bb22e800b..1d54d72336860 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -312,17 +312,6 @@ static void convertMRI(yaml::MachineFunction &YamlMF, const MachineFunction &MF,
     if (PreferredReg)
       printRegMIR(PreferredReg, VReg.PreferredRegister, TRI);
     printRegFlags(Reg, VReg.RegisterFlags, MF, TRI);
-    // Print the anti-hints.
-    const auto &AntiHints = RegInfo.getRegAllocationAntiHints(Reg);
-    if (!AntiHints.empty()) {
-      std::vector<yaml::FlowStringValue> AntiHintStrings;
-      for (Register AntiHint : AntiHints) {
-        yaml::FlowStringValue AntiHintStr;
-        printRegMIR(AntiHint, AntiHintStr, TRI);
-        AntiHintStrings.push_back(std::move(AntiHintStr));
-      }
-      VReg.AntiHints = std::move(AntiHintStrings);
-    }
     YamlMF.VirtualRegisters.push_back(std::move(VReg));
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir
deleted file mode 100644
index 58c7d71089b1b..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir
+++ /dev/null
@@ -1,171 +0,0 @@
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -debug -run-pass=greedy,machineverifier,virtregrewriter %s -o - | FileCheck -check-prefix=CHECK %s
---- |
-  define amdgpu_kernel void @test_software_pipelining() #0 {
-  bb.0:
-    ret void
-  }
-
-  attributes #0 = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-waves-per-eu"="2" "frame-pointer"="none" "target-cpu"="gfx942" }
-...
----
-name:            test_software_pipelining
-registers:
-  - { id: 0, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [ 
-                                                                                '%4', 
-                                                                                '%25', 
-                                                                                '%5', 
-                                                                                '%22', 
-                                                                                '%6', 
-                                                                                '%19', 
-                                                                                '%7', 
-                                                                                '%18', 
-                                                                                '%16', 
-                                                                                '%8' ] }
-  - { id: 1, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [ 
-                                                                                '%8', 
-                                                                                '%6', 
-                                                                                '%19', 
-                                                                                '%7', 
-                                                                                '%18', 
-                                                                                '%16' ] }
-  - { id: 2, class: sgpr_128, preferred-register: '', flags: [  ] }
-  - { id: 3, class: vgpr_32, preferred-register: '', flags: [  ] }
-  - { id: 4, class: vreg_128_align2, preferred-register: '', flags: [  ], 
-      anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] }
-  - { id: 5, class: vreg_128_align2, preferred-register: '', flags: [  ], 
-      anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] }
-  - { id: 6, class: vreg_128_align2, preferred-register: '', flags: [  ], 
-      anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] }
-  - { id: 7, class: vreg_128_align2, preferred-register: '', flags: [  ], 
-      anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] }
-  - { id: 8, class: vreg_128_align2, preferred-register: '', flags: [  ], 
-      anti-hints: [ '%17', '%1', '%23', '%29', '%0', '%28', '%30', '%9' ] }
-  - { id: 9, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [ 
-                                                                                '%4', 
-                                                                                '%25', 
-                                                                                '%5', 
-                                                                                '%22', 
-                                                                                '%6', 
-                                                                                '%19', 
-                                                                                '%7', 
-                                                                                '%18', 
-                                                                                '%16', 
-                                                                                '%8' ] }
-  - { id: 10, class: vgpr_32, preferred-register: '', flags: [  ] }
-  - { id: 11, class: vgpr_32, preferred-register: '', flags: [  ] }
-  - { id: 12, class: vgpr_32, preferred-register: '', flags: [  ] }
-  - { id: 13, class: vreg_128_align2, preferred-register: '', flags: [  ] }
-  - { id: 14, class: vreg_128_align2, preferred-register: '', flags: [  ] }
-  - { id: 15, class: vreg_128_align2, preferred-register: '', flags: [  ] }
-  - { id: 16, class: vreg_128_align2, preferred-register: '', flags: [  ], 
-      anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] }
-  - { id: 17, class: vreg_128_align2, preferred-register: '', flags: [  ], 
-      anti-hints: [ '%8' ] }
-  - { id: 18, class: vreg_128_align2, preferred-register: '', flags: [  ], 
-      anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] }
-  - { id: 19, class: vreg_128_align2, preferred-register: '', flags: [  ], 
-      anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] }
-  - { id: 20, class: vreg_128_align2, preferred-register: '', flags: [  ] }
-  - { id: 21, class: vreg_128_align2, preferred-register: '', flags: [  ] }
-  - { id: 22, class: vreg_128_align2, preferred-register: '', flags: [  ], 
-      anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] }
-  - { id: 23, class: vreg_128_align2, preferred-register: '', flags: [  ], 
-      anti-hints: [ '%6', '%19', '%7', '%18', '%16', '%8' ] }
-  - { id: 24, class: vreg_128_align2, preferred-register: '', flags: [  ] }
-  - { id: 25, class: vreg_128_align2, preferred-register: '', flags: [  ], 
-      anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] }
-  - { id: 26, class: vreg_128_align2, preferred-register: '', flags: [  ] }
-  - { id: 27, class: vreg_128_align2, preferred-register: '', flags: [  ] }
-  - { id: 28, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [ 
-                                                                                 '%4', 
-                                                                                 '%25', 
-                                                                                 '%5', 
-                                                                                 '%22', 
-                                                                                 '%6', 
-                                                                                 '%19', 
-                                                                                 '%7', 
-                                                                                 '%18', 
-                                                                                 '%16', 
-                                                                                 '%8' ] }
-  - { id: 29, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [ 
-                                                                                 '%4', 
-                                                                                 '%25', 
-                                                                                 '%5', 
-                                                                                 '%22', 
-                                                                                 '%6', 
-                                                                                 '%19', 
-                                                                                 '%7', 
-                                                                                 '%18', 
-                                                                                 '%16', 
-                                                                                 '%8' ] }
-  - { id: 30, class: vreg_128_align2, preferred-register: '', flags: [  ], 
-      anti-hints: [ '%4', '%25', '%5', '%22', '%6', '%19', '%7', '%18', 
-                    '%16', '%8' ] }
-body:             |
-  bb.0:
-    ; CHECK-LABEL: name: test_software_pipelining
-    ; CHECK: renamable $vgpr36 = IMPLICIT_DEF
-    ; CHECK-NEXT: renamable $vgpr37 = IMPLICIT_DEF
-    ; CHECK-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
-    ; CHECK-NEXT: renamable $vgpr20 = IMPLICIT_DEF
-    ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
-    ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF
-    ; CHECK-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF
-    ; CHECK-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
-    ; CHECK-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF
-    ; CHECK-NEXT: renamable $vgpr38 = IMPLICIT_DEF
-    ; CHECK-NEXT: renamable $vgpr24 = V_ADD_U32_e32 4096, $vgpr38, implicit $exec
-    ; CHECK-NEXT: renamable $vgpr20 = V_ADD_U32_e32 $vgpr36, killed $vgpr20, implicit $exec
-    ; CHECK-NEXT: renamable $vgpr20 = V_ADD_U32_e32 4096, killed $vgpr20, implicit $exec
-    ; CHECK-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr20, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; CHECK-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr24, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; CHECK-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
-    ; CHECK-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr24_vgpr25, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = DS_READ_B128_gfx9 renamable $vgpr37, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; CHECK-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr30_vgpr31, $vgpr26_vgpr27, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr20_vgpr21, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr30_vgpr31, $vgpr22_vgpr23, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF
-    ; CHECK-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr24_vgpr25, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128_gfx9 killed renamable $vgpr37, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; CHECK-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr18_vgpr19, $vgpr26_vgpr27, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr20_vgpr21, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr18_vgpr19, killed $vgpr22_vgpr23, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr32_vgpr33, killed $vgpr24_vgpr25, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: renamable $vgpr20 = IMPLICIT_DEF
-    ; CHECK-NEXT: dead renamable $vgpr20 = V_ADD_U32_e32 killed $vgpr36, killed $vgpr20, implicit $exec
-    ; CHECK-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr38, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; CHECK-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr28_vgpr29_vgpr30_vgpr31, implicit killed renamable $vgpr8_vgpr9_vgpr10_vgpr11, implicit killed renamable $vgpr12_vgpr13_vgpr14_vgpr15, implicit killed renamable $vgpr4_vgpr5_vgpr6_vgpr7, implicit killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit killed renamable $vgpr20_vgpr21_vgpr22_vgpr23
-    %0:vgpr_32 = IMPLICIT_DEF
-    %1:vgpr_32 = IMPLICIT_DEF
-    %2:sgpr_128 = IMPLICIT_DEF
-    %3:vgpr_32 = IMPLICIT_DEF
-    %4:vreg_128_align2 = IMPLICIT_DEF
-    %5:vreg_128_align2 = IMPLICIT_DEF
-    %6:vreg_128_align2 = IMPLICIT_DEF
-    %7:vreg_128_align2 = IMPLICIT_DEF
-    %8:vreg_128_align2 = IMPLICIT_DEF
-    %9:vgpr_32 = IMPLICIT_DEF
-    %10:vgpr_32 = V_ADD_U32_e32 4096, %9, implicit $exec
-    %11:vgpr_32 = V_ADD_U32_e32 %0, %3, implicit $exec
-    %12:vgpr_32 = V_ADD_U32_e32 4096, %11, implicit $exec
-    %13:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %12, %2, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %14:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %10, %2, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %15:vreg_128_align2 = IMPLICIT_DEF
-    %16:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %15.sub0_sub1, %14.sub0_sub1, %8, 0, 0, 0, implicit $mode, implicit $exec
-    %17:vreg_128_align2 = DS_READ_B128_gfx9 %1, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
-    dead %18:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %15.sub2_sub3, %14.sub2_sub3, %16, 0, 0, 0, implicit $mode, implicit $exec
-    %19:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %15.sub0_sub1, %13.sub0_sub1, %7, 0, 0, 0, implicit $mode, implicit $exec
-    %20:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %15.sub2_sub3, %13.sub2_sub3, %19, 0, 0, 0, implicit $mode, implicit $exec
-    %21:vreg_128_align2 = IMPLICIT_DEF
-    %22:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %21.sub0_sub1, %14.sub0_sub1, %6, 0, 0, 0, implicit $mode, implicit $exec
-    %23:vreg_128_align2 = DS_READ_B128_gfx9 %1, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
-    %24:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %21.sub2_sub3, %14.sub2_sub3, %22, 0, 0, 0, implicit $mode, implicit $exec
-    %25:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %21.sub0_sub1, %13.sub0_sub1, %5, 0, 0, 0, implicit $mode, implicit $exec
-    %26:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %21.sub2_sub3, %13.sub2_sub3, %25, 0, 0, 0, implicit $mode, implicit $exec
-    %27:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %17.sub0_sub1, %14.sub0_sub1, %4, 0, 0, 0, implicit $mode, implicit $exec
-    %28:vgpr_32 = IMPLICIT_DEF
-    dead %29:vgpr_32 = V_ADD_U32_e32 %0, %28, implicit $exec
-    %30:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %9, %2, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    S_ENDPGM 0, implicit %23, implicit %24, implicit %20, implicit %26, implicit %27, implicit %30
-...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir
deleted file mode 100644
index c6de026d447fd..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir
+++ /dev/null
@@ -1,126 +0,0 @@
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations %s -o - | FileCheck -check-prefix=CHECK %s
-
---- |
-  target triple = "amdgcn-amd-amdhsa"
-
-  define amdgpu_kernel void @test_software_pipelining() #0 {
-    bb.0:
-      ret void
-  }
-
-  attributes #0 = {nounwind "amdgpu-waves-per-eu"="2"  "amdgpu-agpr-alloc"="0" "frame-pointer"="none"}
-
-...
----
-name:            test_software_pipelining
-body:             |
-  bb.0:
-    ; CHECK-LABEL: name: test_software_pipelining
-    ; CHECK: registers:
-    ; CHECK:  - { id: 0, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
-    ; CHECK-NEXT:{{\s*}}'%4',
-    ; CHECK:{{\s*}}'%8' ] }
-    ; CHECK:  - { id: 1, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
-    ; CHECK-NEXT:{{\s*}}'%8',
-    ; CHECK:{{\s*}}'%16' ] }
-    ; CHECK:  - { id: 4, class: vreg_128_align2, preferred-register: '', flags: [  ],
-    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] }
-    ; CHECK:  - { id: 5, class: vreg_128_align2, preferred-register:  '', flags: [  ],
-    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] }
-    ; CHECK:  - { id: 6, class: vreg_128_align2, preferred-register:  '', flags: [  ],
-    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] }
-    ; CHECK:  - { id: 7, class: vreg_128_align2, preferred-register:  '', flags: [  ],
-    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] }
-    ; CHECK:  - { id: 8, class: vreg_128_align2, preferred-register:  '', flags: [  ],
-    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%17'{{.*}}'%9' ] }
-    ; CHECK:  - { id: 9, class: vgpr_32, preferred-register:  '', flags: [  ], anti-hints: [
-    ; CHECK-NEXT:{{\s*}}'%4',
-    ; CHECK:{{\s*}}'%8' ] }
-    ; CHECK:  - { id: 16, class: vreg_128_align2, preferred-register:  '', flags: [  ],
-    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] }
-    ; CHECK:  - { id: 17, class: vreg_128_align2, preferred-register:  '', flags: [  ],
-    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%8' ] }
-    ; CHECK:  - { id: 18, class: vreg_128_align2, preferred-register:  '', flags: [  ],
-    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] }
-    ; CHECK:  - { id: 19, class: vreg_128_align2, preferred-register:  '', flags: [  ],
-    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] }
-    ; CHECK:  - { id: 22, class: vreg_128_align2, preferred-register:  '', flags: [  ],
-    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] }
-    ; CHECK:  - { id: 23, class: vreg_128_align2, preferred-register:  '', flags: [  ],
-    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%6'{{.*}}'%8' ] }
-    ; CHECK:  - { id: 25, class: vreg_128_align2, preferred-register:  '', flags: [  ],
-    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] }
-    ; CHECK:  - { id: 28, class: vgpr_32, preferred-register:  '', flags: [  ], anti-hints: [
-    ; CHECK-NEXT:{{\s*}}'%4',
-    ; CHECK:{{\s*}}'%8' ] }
-    ; CHECK:  - { id: 29, class: vgpr_32, preferred-register:  '', flags: [ ], anti-hints: [
-    ; CHECK-NEXT:{{\s*}}'%4',
-    ; CHECK:{{\s*}}'%8' ] }
-    ; CHECK:  - { id: 30, class: vreg_128_align2
-    ; CHECK-NEXT: {{.*}}anti-hints: [ '%4'
-    ; CHECK: {{.*}}'%8' ] }
-    ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[DEF5:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 4096, [[DEF9]], implicit $exec
-    ; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF]], [[DEF3]], implicit $exec
-    ; CHECK-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 4096, [[V_ADD_U32_e32_1]], implicit $exec
-    ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_U32_e32_2]], [[DEF2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_U32_e32_]], [[DEF2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; CHECK-NEXT: [[DEF10:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF10]].sub0_sub1, [[BUFFER_LOAD_DWORDX4_OFFEN1]].sub0_sub1, [[DEF8]], 0, 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF1]], 4096, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; CHECK-NEXT: dead [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF10]].sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_]], 0, 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF10]].sub0_sub1, [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0_sub1, [[DEF7]], 0, 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF10]].sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: [[DEF11:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
-    ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF11]].sub0_sub1, [[BUFFER_LOAD_DWORDX4_OFFEN1]].sub0_sub1, [[DEF6]], 0, 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF1]], 6144, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF11]].sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_4]], 0, 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_6:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF11]].sub0_sub1, [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0_sub1, [[DEF5]], 0, 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_7:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF11]].sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_6]], 0, 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_8:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, [[BUFFER_LOAD_DWORDX4_OFFEN1]].sub0_sub1, [[DEF4]], 0, 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; CHECK-NEXT: dead [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF]], [[DEF12]], implicit $exec
-    ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[DEF9]], [[DEF2]], 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; CHECK-NEXT: S_ENDPGM 0, implicit [[DS_READ_B128_gfx9_1]], implicit [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_5]], implicit [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_3]], implicit [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_7]], implicit [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_8]], implicit [[BUFFER_LOAD_DWORDX4_OFFEN2]]
-    %3:vgpr_32 = IMPLICIT_DEF
-    %4:vgpr_32 = IMPLICIT_DEF
-    %6:sgpr_128 = IMPLICIT_DEF
-    %7:vgpr_32 = IMPLICIT_DEF
-    %19:vreg_128_align2 = IMPLICIT_DEF
-    %20:vreg_128_align2 = IMPLICIT_DEF
-    %21:vreg_128_align2 = IMPLICIT_DEF
-    %22:vreg_128_align2 = IMPLICIT_DEF
-    %23:vreg_128_align2 = IMPLICIT_DEF
-    %25:vgpr_32 = IMPLICIT_DEF
-    %24:vgpr_32 = V_ADD_U32_e32 4096, %25:vgpr_32, implicit $exec
-    %27:vgpr_32 = V_ADD_U32_e32 %3:vgpr_32, %7:vgpr_32, implicit $exec
-    %26:vgpr_32 = V_ADD_U32_e32 4096, %27:vgpr_32, implicit $exec
-    %28:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %26:vgpr_32, %6:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %29:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %24:vgpr_32, %6:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %31:vreg_128_align2 = IMPLICIT_DEF
-    %30:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %23:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %32:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
-    %33:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %30:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %34:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %22:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %35:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %34:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %37:vreg_128_align2 = IMPLICIT_DEF
-    %36:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %21:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %38:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
-    %39:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %36:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %40:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %20:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %41:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %40:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %42:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %19:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %43:vgpr_32 = IMPLICIT_DEF
-    %925:vgpr_32 = V_ADD_U32_e32 %3:vgpr_32, %43:vgpr_32, implicit $exec
-    %44:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %25:vgpr_32, %6:sgpr_128, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    S_ENDPGM 0, implicit %38, implicit %39, implicit %35, implicit %41, implicit %42, implicit %44
-...

>From 837a3a97d6ad16c9bc4ad345b516d78240295680 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Fri, 26 Sep 2025 12:19:21 -0400
Subject: [PATCH 20/20] Named operand and stable partition applied

(cherry picked from commit ee6d876fcc3d84d6ea3a68b3eee1ce97e714b6e6)
---
 .../llvm/CodeGen/MachineRegisterInfo.h        |   10 +-
 llvm/lib/CodeGen/AllocationOrder.cpp          |   39 +-
 llvm/lib/CodeGen/AllocationOrder.h            |    2 +
 llvm/lib/CodeGen/MachineRegisterInfo.cpp      |   15 +-
 .../Target/AMDGPU/GCNPreRAOptimizations.cpp   |   53 +-
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir |  301 ++---
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll |   92 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll |  120 +-
 .../AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll    |   64 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll |  312 ++---
 .../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll       | 1071 ++++++-----------
 .../AMDGPU/rewrite-vgpr-mfma-to-agpr.ll       |   91 +-
 12 files changed, 890 insertions(+), 1280 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index 6c3d4c9b2515b..0cfb8454dcd99 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -867,13 +867,13 @@ class MachineRegisterInfo {
     return RegAllocHints.inBounds(VReg) ? &RegAllocHints[VReg] : nullptr;
   }
 
-  /// setRegAllocationAntiHint - Add a register allocation anti-hint for the
+  /// addRegAllocAntiHint - Add a register allocation anti-hint for the
   /// specified virtual register. This tells the allocator to avoid allocating
   /// VReg to the same physical register as AntiHintVReg (or overlapping ones).
-  void setRegAllocationAntiHint(Register VReg, Register AntiHintVReg) {
+  void addRegAllocAntiHint(Register VReg, Register AntiHintVReg) {
     assert(VReg.isVirtual() && "Anti-hints are only for virtual registers");
     assert(AntiHintVReg.isVirtual() && "Anti-hint target must be virtual");
-    AntiHintRegs.grow(Register::index2VirtReg(getNumVirtRegs()));
+    AntiHintRegs.grow(VReg);
     SmallVector<Register, 4> &AntiHints = AntiHintRegs[VReg];
     // Avoid duplicates
     if (llvm::find(AntiHints, AntiHintVReg) == AntiHints.end())
@@ -884,7 +884,7 @@ class MachineRegisterInfo {
   void addRegAllocationAntiHints(Register VReg,
                                  ArrayRef<Register> AntiHintVRegs) {
     for (Register AntiHint : AntiHintVRegs)
-      setRegAllocationAntiHint(VReg, AntiHint);
+      addRegAllocAntiHint(VReg, AntiHint);
   }
 
   /// clearRegAllocationAntiHints - Clear all anti-hints for a register.
@@ -915,7 +915,7 @@ class MachineRegisterInfo {
   /// VRM is the current virtual register map showing allocations made so far.
   void getPhysRegAntiHints(Register VReg,
                            SmallVectorImpl<MCPhysReg> &PhysAntiHints,
-                           const VirtRegMap *VRM) const;
+                           const VirtRegMap &VRM) const;
 
   /// markUsesInDebugValueAsUndef - Mark every DBG_VALUE referencing the
   /// specified register as undefined which causes the DBG_VALUE to be
diff --git a/llvm/lib/CodeGen/AllocationOrder.cpp b/llvm/lib/CodeGen/AllocationOrder.cpp
index 8550759f97e8a..32005fd6ff837 100644
--- a/llvm/lib/CodeGen/AllocationOrder.cpp
+++ b/llvm/lib/CodeGen/AllocationOrder.cpp
@@ -48,7 +48,7 @@ AllocationOrder AllocationOrder::create(Register VirtReg, const VirtRegMap &VRM,
 
   // Get anti-hints
   SmallVector<MCPhysReg, 16> AntiHintedPhysRegs;
-  MRI.getPhysRegAntiHints(VirtReg, AntiHintedPhysRegs, &VRM);
+  MRI.getPhysRegAntiHints(VirtReg, AntiHintedPhysRegs, VRM);
 
   LLVM_DEBUG({
     if (!AntiHintedPhysRegs.empty()) {
@@ -84,29 +84,34 @@ AllocationOrder AllocationOrder::create(Register VirtReg, const VirtRegMap &VRM,
 
 void AllocationOrder::applyAntiHints(ArrayRef<MCPhysReg> AntiHintedPhysRegs,
                                      const TargetRegisterInfo *TRI) {
+  // Helper to check if a register overlaps with any anti-hint
+  auto isAntiHinted = [&](MCPhysReg Reg) {
+    return std::any_of(
+        AntiHintedPhysRegs.begin(), AntiHintedPhysRegs.end(),
+        [&](MCPhysReg AntiHint) { return TRI->regsOverlap(Reg, AntiHint); });
+  };
+
   // Create filtered order
   FilteredOrderStorage.clear();
-  FilteredOrderStorage.reserve(Order.size());
-
-  // Add non-anti-hinted registers first
-  for (MCPhysReg PhysReg : Order) {
-    if (!is_contained(AntiHintedPhysRegs, PhysReg)) {
-      FilteredOrderStorage.push_back(PhysReg);
-    }
-  }
+  FilteredOrderStorage.assign(Order.begin(), Order.end());
 
-  // Add anti-hinted registers at the end as last resort
-  for (MCPhysReg PhysReg : Order) {
-    if (is_contained(AntiHintedPhysRegs, PhysReg)) {
-      FilteredOrderStorage.push_back(PhysReg);
-    }
-  }
+  // Partition: non-anti-hinted registers go first
+  auto PartitionPoint = std::stable_partition(
+      FilteredOrderStorage.begin(), FilteredOrderStorage.end(),
+      [&](MCPhysReg Reg) { return !isAntiHinted(Reg); });
 
   // Update Order
   Order = FilteredOrderStorage;
 
   LLVM_DEBUG({
-    dbgs() << "moved " << AntiHintedPhysRegs.size()
-           << " anti-hinted registers to end of allocation order\n";
+    size_t NonAntiHintedCount =
+        std::distance(FilteredOrderStorage.begin(), PartitionPoint);
+    size_t AntiHintedCount =
+        std::distance(PartitionPoint, FilteredOrderStorage.end());
+    dbgs() << "    Added " << NonAntiHintedCount
+           << " non-anti-hinted registers first\n"
+           << "    Added " << AntiHintedCount
+           << " anti-hinted registers at the end\n"
+           << "  Anti-hint filtering complete\n";
   });
 }
diff --git a/llvm/lib/CodeGen/AllocationOrder.h b/llvm/lib/CodeGen/AllocationOrder.h
index cda5fd08e0af6..0b10272731444 100644
--- a/llvm/lib/CodeGen/AllocationOrder.h
+++ b/llvm/lib/CodeGen/AllocationOrder.h
@@ -30,6 +30,8 @@ class LiveRegMatrix;
 
 class LLVM_LIBRARY_VISIBILITY AllocationOrder {
   const SmallVector<MCPhysReg, 16> Hints;
+  // Used as storage if the Order received in the constructor needs to be
+  // altered.
   SmallVector<MCPhysReg, 16> FilteredOrderStorage;
   ArrayRef<MCPhysReg> Order;
   // How far into the Order we can iterate. This is 0 if the AllocationOrder is
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index 0b49fb0a08b94..1cd74d3561b2b 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -680,22 +680,19 @@ bool MachineRegisterInfo::isReservedRegUnit(unsigned Unit) const {
 
 void MachineRegisterInfo::getPhysRegAntiHints(
     Register VReg, SmallVectorImpl<MCPhysReg> &PhysAntiHints,
-    const VirtRegMap *VRM) const {
+    const VirtRegMap &VRM) const {
   assert(VReg.isVirtual());
-  if (!AntiHintRegs.inBounds(VReg) || !VRM)
+  if (!AntiHintRegs.inBounds(VReg))
     return;
 
   const SmallVector<Register, 4> &AntiHints = AntiHintRegs[VReg];
-  const TargetRegisterInfo *TRI = getTargetRegisterInfo();
 
   for (Register AntiHintVReg : AntiHints) {
     // Check if the anti-hinted register has been allocated
-    if (VRM->hasPhys(AntiHintVReg)) {
-      MCPhysReg PhysReg = VRM->getPhys(AntiHintVReg);
-      // Add the physical register and all its aliases
-      for (MCRegAliasIterator AI(PhysReg, TRI, true); AI.isValid(); ++AI) {
-        PhysAntiHints.push_back(*AI);
-      }
+    if (VRM.hasPhys(AntiHintVReg)) {
+      MCPhysReg PhysReg = VRM.getPhys(AntiHintVReg);
+      // Add the physical register
+      PhysAntiHints.push_back(PhysReg);
     }
   }
 
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 0a08cbbdbf2dc..dde4a84d45680 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -37,6 +37,7 @@
 #include "SIRegisterInfo.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Register.h"
 #include "llvm/InitializePasses.h"
 
 using namespace llvm;
@@ -253,37 +254,45 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
   TRI = ST.getRegisterInfo();
 
   bool Changed = false;
-
-  // Single pass implementation
+  // Add RA anti-hints to reduce MFMA hazard NOPs
   if (EnableAntiHintsForMFMARegs && ST.hasMAIInsts()) {
     // Max lookback window for RAW or WAW hazard
     constexpr unsigned MaxLookbackWindow = 19;
     for (const MachineBasicBlock &MBB : MF) {
-
-      SmallVector<std::pair<SlotIndex, SmallVector<Register, 4>>, 16>
-          RecentMFMAs;
+      SmallVector<SmallVector<Register, 4>, 16> RecentMFMAs;
       for (const MachineInstr &MI : MBB) {
         if (MI.isDebugInstr())
           continue;
-        const SlotIndex CurrentSlot = LIS->getInstructionIndex(MI).getRegSlot();
+
         // Handle MFMA instructions
         if (SIInstrInfo::isMFMA(MI)) {
           SmallVector<Register, 4> MFMARegisters;
-          auto collectMFMARegister = [&](unsigned OpIdx) {
-            if (OpIdx >= MI.getNumOperands())
+          // Helper to get named operand
+          auto collectNamedOperand = [&](AMDGPU::OpName OpName,
+                                         const char *OpNameStr) {
+            const MachineOperand *MO = TII->getNamedOperand(MI, OpName);
+            if (!MO) {
+              LLVM_DEBUG(dbgs() << "    Named operand " << OpNameStr
+                                << " not found\n");
               return;
-
-            const MachineOperand &MO = MI.getOperand(OpIdx);
-            if (MO.isReg() && MO.getReg().isVirtual())
-              MFMARegisters.push_back(MO.getReg());
+            }
+            if (MO->isReg() && MO->getReg().isVirtual()) {
+              Register Reg = MO->getReg();
+              const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+              // Only consider VGPRs
+              if (TRI->hasVGPRs(RC))
+                MFMARegisters.push_back(Reg);
+              LLVM_DEBUG(dbgs() << "    Collected " << OpNameStr << " : "
+                                << printReg(Reg, TRI) << "\n");
+            }
           };
-          // Only collect Matrix C (operand 3) and destination (operand 0)
-          // registers
-          collectMFMARegister(0);
-          collectMFMARegister(3);
 
+          // Collect destination and source C registers
+          collectNamedOperand(AMDGPU::OpName::vdst, "vdst"); // Destination
+          collectNamedOperand(AMDGPU::OpName::src2,
+                              "src2"); // Matrix C (accumulator)
           if (!MFMARegisters.empty()) {
-            RecentMFMAs.emplace_back(CurrentSlot, std::move(MFMARegisters));
+            RecentMFMAs.emplace_back(std::move(MFMARegisters));
             // Maintain window
             if (RecentMFMAs.size() > MaxLookbackWindow)
               RecentMFMAs.erase(RecentMFMAs.begin());
@@ -309,17 +318,13 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
           // Only process VGPR registers
           if (!TRI->isVGPRClass(CandidateRC))
             continue;
-
           for (auto It = RecentMFMAs.rbegin(); It != RecentMFMAs.rend(); ++It) {
-            const SmallVector<Register, 4> &MFMARegs = It->second;
+            const SmallVector<Register, 4> &MFMARegs = *It;
             for (Register MFMAReg : MFMARegs) {
-              // Verify register class compatibility
-              const TargetRegisterClass *MFMARC = MRI->getRegClass(MFMAReg);
-              if (!TRI->hasVGPRs(MFMARC))
-                continue;
-
               // Check if MFMA register is dead at current instruction
               const LiveInterval &MFMAInterval = LIS->getInterval(MFMAReg);
+              const SlotIndex CurrentSlot =
+                  LIS->getInstructionIndex(MI).getRegSlot();
               if (!MFMAInterval.liveAt(CurrentSlot)) {
                 // Add bi-directional anti-hints
                 MRI->addRegAllocationAntiHints(CandidateReg, MFMAReg);
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
index 8fbfe2e591dfe..e13b34a6b1f19 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
@@ -6,27 +6,11 @@
   define amdgpu_kernel void @largeInterleave() #0 { ret void }
   ; GCN-LABEL: largeInterleave:
   ; GCN:       ; %bb.0:
-  ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
-  ; GCN-NEXT:    ; implicit-def: $vgpr0
-  ; GCN-NEXT:    ; implicit-def: $vgpr2
-  ; GCN-NEXT:    ; implicit-def: $vgpr1
-  ; GCN-NEXT:    ; implicit-def: $vgpr8
-  ; GCN-NEXT:    ; implicit-def: $vgpr94
-  ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
-  ; GCN-NEXT:    ; implicit-def: $vgpr106
-  ; GCN-NEXT:    ; implicit-def: $vgpr132
-  ; GCN-NEXT:    ; implicit-def: $vgpr112
-  ; GCN-NEXT:    ; implicit-def: $vgpr113
-  ; GCN-NEXT:    ; implicit-def: $vgpr114
-  ; GCN-NEXT:    ; implicit-def: $vgpr115
-  ; GCN-NEXT:    ; implicit-def: $vgpr133
-  ; GCN-NEXT:    ; implicit-def: $vgpr139
-  ; GCN-NEXT:    ; iglp_opt mask(0x00000002)
-  ; GCN-NEXT:    ; implicit-def: $sgpr0
   ; GCN-NEXT:    ; implicit-def: $vgpr16
-  ; GCN-NEXT:    ; implicit-def: $vgpr25
+  ; GCN-NEXT:    ; implicit-def: $vgpr20
   ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
   ; GCN-NEXT:    v_readfirstlane_b32 s17, v16
+  ; GCN-NEXT:    ; implicit-def: $vgpr48
   ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
   ; GCN-NEXT:    ; implicit-def: $vgpr17
   ; GCN-NEXT:    ; implicit-def: $sgpr15
@@ -34,9 +18,10 @@
   ; GCN-NEXT:    s_lshl_b32 s18, s17, 7
   ; GCN-NEXT:    ; implicit-def: $vgpr18
   ; GCN-NEXT:    v_add_lshl_u32 v230, v18, s18, 1
-  ; GCN-NEXT:    v_lshl_add_u32 v25, s17, 4, v25
-  ; GCN-NEXT:    v_mul_lo_u32 v25, v25, s6
-  ; GCN-NEXT:    v_add_lshl_u32 v226, v25, v17, 1
+  ; GCN-NEXT:    v_add_u32_e32 v16, s17, v20
+  ; GCN-NEXT:    v_lshl_add_u32 v20, s17, 4, v48
+  ; GCN-NEXT:    v_mul_lo_u32 v20, v20, s6
+  ; GCN-NEXT:    v_add_lshl_u32 v226, v20, v17, 1
   ; GCN-NEXT:    v_add_u32_e32 v17, s15, v226
   ; GCN-NEXT:    buffer_load_dwordx4 v[64:67], v226, s[8:11], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -52,43 +37,41 @@
   ; GCN-NEXT:    ; implicit-def: $vgpr156_vgpr157_vgpr158_vgpr159
   ; GCN-NEXT:    ; implicit-def: $vgpr144_vgpr145_vgpr146_vgpr147
   ; GCN-NEXT:    ; implicit-def: $vgpr19
-  ; GCN-NEXT:    ; implicit-def: $vgpr26
-  ; GCN-NEXT:    ; implicit-def: $vgpr27
+  ; GCN-NEXT:    ; implicit-def: $vgpr49
+  ; GCN-NEXT:    ; implicit-def: $vgpr50
   ; GCN-NEXT:    v_add_u32_e32 v227, 0xc0, v17
-  ; GCN-NEXT:    v_add_u32_e32 v231, v19, v26
-  ; GCN-NEXT:    v_add_u32_e32 v232, v19, v27
+  ; GCN-NEXT:    v_add_u32_e32 v231, v19, v49
+  ; GCN-NEXT:    v_add_u32_e32 v232, v19, v50
   ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:    ; implicit-def: $vgpr28
-  ; GCN-NEXT:    ; implicit-def: $vgpr29
-  ; GCN-NEXT:    v_add_u32_e32 v233, v19, v28
-  ; GCN-NEXT:    v_add_u32_e32 v234, v19, v29
+  ; GCN-NEXT:    ; implicit-def: $vgpr51
+  ; GCN-NEXT:    ; implicit-def: $vgpr52
+  ; GCN-NEXT:    v_add_u32_e32 v233, v19, v51
+  ; GCN-NEXT:    v_add_u32_e32 v234, v19, v52
   ; GCN-NEXT:    ; implicit-def: $vgpr140_vgpr141_vgpr142_vgpr143
   ; GCN-NEXT:    ; implicit-def: $sgpr5
   ; GCN-NEXT:    ; implicit-def: $sgpr7
   ; GCN-NEXT:    ; implicit-def: $vgpr148_vgpr149_vgpr150_vgpr151
   ; GCN-NEXT:    ; implicit-def: $vgpr136_vgpr137_vgpr138_vgpr139
   ; GCN-NEXT:    ; implicit-def: $vgpr132_vgpr133_vgpr134_vgpr135
-  ; GCN-NEXT:    ; implicit-def: $vgpr20
-  ; GCN-NEXT:    v_add_u32_e32 v18, s17, v20
-  ; GCN-NEXT:    v_and_b32_e32 v18, 0x1fffffff, v18
+  ; GCN-NEXT:    v_and_b32_e32 v16, 0x1fffffff, v16
   ; GCN-NEXT:    ; implicit-def: $sgpr16
-  ; GCN-NEXT:    v_mul_lo_u32 v18, v18, s16
+  ; GCN-NEXT:    v_mul_lo_u32 v16, v16, s16
   ; GCN-NEXT:    ; implicit-def: $vgpr21
-  ; GCN-NEXT:    v_add_lshl_u32 v199, v21, v18, 1
+  ; GCN-NEXT:    v_add_lshl_u32 v199, v21, v16, 1
   ; GCN-NEXT:    ; implicit-def: $vgpr22
   ; GCN-NEXT:    v_lshl_add_u32 v200, v22, 1, v199
   ; GCN-NEXT:    ; implicit-def: $vgpr23
   ; GCN-NEXT:    v_lshl_add_u32 v201, v23, 1, v200
   ; GCN-NEXT:    ; implicit-def: $vgpr24
   ; GCN-NEXT:    v_lshl_add_u32 v202, v24, 1, v201
+  ; GCN-NEXT:    ; implicit-def: $vgpr53
+  ; GCN-NEXT:    ; implicit-def: $vgpr54
+  ; GCN-NEXT:    ; implicit-def: $vgpr55
   ; GCN-NEXT:    ; implicit-def: $vgpr16
-  ; GCN-NEXT:    ; implicit-def: $vgpr18
-  ; GCN-NEXT:    ; implicit-def: $vgpr20
-  ; GCN-NEXT:    ; implicit-def: $vgpr24
-  ; GCN-NEXT:    v_add_u32_e32 v247, v19, v24
-  ; GCN-NEXT:    v_add_u32_e32 v248, v19, v16
-  ; GCN-NEXT:    v_add_u32_e32 v249, v19, v18
-  ; GCN-NEXT:    v_add_u32_e32 v250, v19, v20
+  ; GCN-NEXT:    v_add_u32_e32 v247, v19, v16
+  ; GCN-NEXT:    v_add_u32_e32 v248, v19, v53
+  ; GCN-NEXT:    v_add_u32_e32 v249, v19, v54
+  ; GCN-NEXT:    v_add_u32_e32 v250, v19, v55
   ; GCN-NEXT:    ; implicit-def: $vgpr128_vgpr129_vgpr130_vgpr131
   ; GCN-NEXT:    ; implicit-def: $sgpr14
   ; GCN-NEXT:    ; implicit-def: $vgpr196
@@ -97,23 +80,23 @@
   ; GCN-NEXT:    v_max_f32_e32 v212, v211, v211
   ; GCN-NEXT:    ; implicit-def: $vgpr198
   ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-  ; GCN-NEXT:    ; implicit-def: $vgpr32
-  ; GCN-NEXT:    ; implicit-def: $vgpr33
-  ; GCN-NEXT:    ; implicit-def: $vgpr34
-  ; GCN-NEXT:    v_add_u32_e32 v210, v19, v34
-  ; GCN-NEXT:    v_add_u32_e32 v206, v19, v33
-  ; GCN-NEXT:    v_add_u32_e32 v205, v19, v32
   ; GCN-NEXT:    ; implicit-def: $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47
-  ; GCN-NEXT:    ; implicit-def: $vgpr21
-  ; GCN-NEXT:    ; implicit-def: $vgpr22
-  ; GCN-NEXT:    ; implicit-def: $vgpr23
-  ; GCN-NEXT:    ; implicit-def: $vgpr30
-  ; GCN-NEXT:    ; implicit-def: $vgpr31
-  ; GCN-NEXT:    v_add_u32_e32 v207, v19, v21
-  ; GCN-NEXT:    v_add_u32_e32 v208, v19, v22
-  ; GCN-NEXT:    v_add_u32_e32 v209, v19, v23
-  ; GCN-NEXT:    v_add_u32_e32 v203, v19, v30
-  ; GCN-NEXT:    v_add_u32_e32 v204, v19, v31
+  ; GCN-NEXT:    ; implicit-def: $vgpr56
+  ; GCN-NEXT:    ; implicit-def: $vgpr57
+  ; GCN-NEXT:    ; implicit-def: $vgpr58
+  ; GCN-NEXT:    ; implicit-def: $vgpr59
+  ; GCN-NEXT:    ; implicit-def: $vgpr60
+  ; GCN-NEXT:    ; implicit-def: $vgpr61
+  ; GCN-NEXT:    ; implicit-def: $vgpr62
+  ; GCN-NEXT:    ; implicit-def: $vgpr18
+  ; GCN-NEXT:    v_add_u32_e32 v210, v19, v18
+  ; GCN-NEXT:    v_add_u32_e32 v207, v19, v56
+  ; GCN-NEXT:    v_add_u32_e32 v208, v19, v57
+  ; GCN-NEXT:    v_add_u32_e32 v209, v19, v58
+  ; GCN-NEXT:    v_add_u32_e32 v206, v19, v62
+  ; GCN-NEXT:    v_add_u32_e32 v203, v19, v59
+  ; GCN-NEXT:    v_add_u32_e32 v204, v19, v60
+  ; GCN-NEXT:    v_add_u32_e32 v205, v19, v61
   ; GCN-NEXT:    ; kill: killed $vgpr17
   ; GCN-NEXT:    ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
   ; GCN-NEXT:    ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
@@ -219,45 +202,30 @@
   ; GCN-NEXT:    ds_write_b128 v230, v[152:155]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b128 v95, v[68:71] offset:1024
   ; GCN-NEXT:    ds_write_b128 v230, v[160:163] offset:1024
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_load_dwordx4 v[152:155], v226, s[8:11], 0 offen offset:192 sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
-  ; GCN-NEXT:    v_add_u32_e32 v72, 0xc0, v93
-  ; GCN-NEXT:    v_add_u32_e32 v73, v132, v112
-  ; GCN-NEXT:    buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[184:185], v[156:157], v[64:79]
   ; GCN-NEXT:    buffer_load_dwordx4 v[226:229], v227, s[8:11], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ; kill: killed $vgpr72
-  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v113
-  ; GCN-NEXT:    buffer_load_dwordx2 v[98:99], v73, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    buffer_load_dwordx2 v[160:161], v231, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    buffer_load_dwordx2 v[162:163], v232, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v114
-  ; GCN-NEXT:    buffer_load_dwordx2 v[100:101], v72, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    buffer_load_dwordx2 v[172:173], v233, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v115
-  ; GCN-NEXT:    buffer_load_dwordx2 v[104:105], v72, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    buffer_load_dwordx2 v[174:175], v234, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
-  ; GCN-NEXT:    ; kill: killed $vgpr73
-  ; GCN-NEXT:    ds_read_b128 v[72:75], v94
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[186:187], v[158:159], v[64:79]
   ; GCN-NEXT:    v_perm_b32 v238, v162, v160, s5
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[218:219], v[140:141], v[112:127]
@@ -267,14 +235,6 @@
   ; GCN-NEXT:    ds_read_b128 v[160:163], v213
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
-  ; GCN-NEXT:    ; implicit-def: $sgpr8
-  ; GCN-NEXT:    ; implicit-def: $vgpr112
-  ; GCN-NEXT:    ; implicit-def: $vgpr113
-  ; GCN-NEXT:    ; implicit-def: $vgpr114
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
-  ; GCN-NEXT:    ds_read_b128 v[72:75], v94 offset:512
   ; GCN-NEXT:    v_perm_b32 v239, v174, v172, s5
   ; GCN-NEXT:    v_perm_b32 v241, v174, v172, s7
   ; GCN-NEXT:    v_perm_b32 v243, v175, v173, s5
@@ -1060,8 +1020,8 @@
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[134:135], v[64:65], v[32:47]
   ; GCN-NEXT:    v_exp_f32_e32 v77, v147
   ; GCN-NEXT:    v_pack_b32_f16 v134, v66, v68
-  ; GCN-NEXT:    v_fma_f32 v68, s4, v78, -v128
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v66, v74
+  ; GCN-NEXT:    v_fma_f32 v68, s4, v78, -v128
   ; GCN-NEXT:    v_mul_f32_e32 v147, 0x3fb8aa3b, v69
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[138:139], v[64:65], v[16:31]
   ; GCN-NEXT:    v_exp_f32_e32 v78, v67
@@ -1090,103 +1050,104 @@
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v140, v78
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[144:145], v[134:135], v[48:63]
   ; GCN-NEXT:    s_nop 10
-  ; GCN-NEXT:    v_exp_f32_e32 v52, v128
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v50, v137
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v48, v137
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v51, v142
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v54, v138
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v53, v52
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v49, v79
-  ; GCN-NEXT:    v_pack_b32_f16 v50, v51, v50
-  ; GCN-NEXT:    v_pack_b32_f16 v48, v139, v136
-  ; GCN-NEXT:    v_pack_b32_f16 v51, v54, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, 0, v113
-  ; GCN-NEXT:    v_add_f32_e32 v53, v114, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v115, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v116, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v117, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v118, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v119, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v120, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v121, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v122, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v123, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v124, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v96, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v97, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v98, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v99, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v100, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v101, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v102, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v103, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v104, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v105, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v106, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v107, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v108, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v109, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v110, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v111, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v80, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v81, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v82, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v83, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v84, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v85, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v86, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v87, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v88, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v89, v53
-  ; GCN-NEXT:    v_pack_b32_f16 v49, v140, v49
-  ; GCN-NEXT:    v_add_f32_e32 v53, v90, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v91, v53
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[48:49], v[0:15]
-  ; GCN-NEXT:    v_add_f32_e32 v53, v92, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v93, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v94, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v95, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v125, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v126, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v127, v53
-  ; GCN-NEXT:    v_add_f32_e32 v53, v129, v53
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[50:51], v[0:15]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[64:65], v[48:49], v[32:47]
+  ; GCN-NEXT:    v_exp_f32_e32 v50, v128
+  ; GCN-NEXT:    v_pack_b32_f16 v52, v139, v136
+  ; GCN-NEXT:    v_pack_b32_f16 v48, v51, v48
+  ; GCN-NEXT:    v_add_f32_e32 v51, 0, v113
+  ; GCN-NEXT:    v_add_f32_e32 v51, v114, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v115, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v116, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v117, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v118, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v119, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v120, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v121, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v122, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v123, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v124, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v96, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v97, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v98, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v99, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v100, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v101, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v102, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v103, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v104, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v105, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v106, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v107, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v108, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v109, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v110, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v111, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v80, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v81, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v82, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v83, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v84, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v85, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v86, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v87, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v88, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v89, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v90, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v91, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v92, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v93, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v94, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v95, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v125, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v126, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v127, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v129, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v130, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v131, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v132, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v133, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v72, v51
+  ; GCN-NEXT:    v_pack_b32_f16 v53, v140, v49
+  ; GCN-NEXT:    v_add_f32_e32 v51, v73, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v74, v51
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[52:53], v[0:15]
+  ; GCN-NEXT:    v_add_f32_e32 v51, v75, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v76, v51
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v54, v50
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v55, v138
+  ; GCN-NEXT:    v_add_f32_e32 v51, v77, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v78, v51
+  ; GCN-NEXT:    v_add_f32_e32 v51, v79, v51
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[64:65], v[52:53], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v51, v142, v51
+  ; GCN-NEXT:    v_pack_b32_f16 v49, v55, v54
+  ; GCN-NEXT:    v_add_f32_e32 v51, v137, v51
+  ; GCN-NEXT:    s_nop 0
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[70:71], v[48:49], v[0:15]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[66:67], v[48:49], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v48, v138, v51
+  ; GCN-NEXT:    v_add_f32_e32 v54, v50, v48
+  ; GCN-NEXT:    ds_bpermute_b32 v55, v196, v54
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    ds_read_b128 v[48:51], v197 offset:1152
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_add_f32_e32 v50, v54, v55
+  ; GCN-NEXT:    ds_bpermute_b32 v51, v196, v50
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[48:49], v[52:53], v[16:31]
+  ; GCN-NEXT:    ; implicit-def: $vgpr54
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    s_nop 9
-  ; GCN-NEXT:    v_add_f32_e32 v0, v130, v53
-  ; GCN-NEXT:    v_add_f32_e32 v0, v131, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v132, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v133, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v72, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v73, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v74, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v75, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v76, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v77, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v78, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v79, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v142, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v137, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v138, v0
-  ; GCN-NEXT:    v_add_f32_e32 v4, v52, v0
-  ; GCN-NEXT:    ds_bpermute_b32 v5, v196, v4
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v197 offset:1152
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[0:1], v[48:49], v[16:31]
-  ; GCN-NEXT:    v_add_f32_e32 v2, v4, v5
-  ; GCN-NEXT:    ds_bpermute_b32 v3, v196, v2
-  ; GCN-NEXT:    ; implicit-def: $vgpr4
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_cndmask_b32_e64 v0, v3, v2, s[12:13]
-  ; GCN-NEXT:    v_fmac_f32_e32 v0, v4, v112
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v197 offset:1728
+  ; GCN-NEXT:    v_cndmask_b32_e64 v16, v51, v50, s[12:13]
+  ; GCN-NEXT:    v_fmac_f32_e32 v16, v54, v112
+  ; GCN-NEXT:    ds_read_b128 v[16:19], v197 offset:1728
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[66:67], v[50:51], v[32:47]
   ; GCN-NEXT:    s_endpgm
 
   attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index 17692a38dfc64..16ea95437881b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -731,10 +731,10 @@ define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double
 ; GFX90A-VGPR-NEXT:    s_nop 1
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_4x4x4f64 v[0:1], v[2:3], v[4:5], 0
 ; GFX90A-VGPR-NEXT:    s_nop 3
-; GFX90A-VGPR-NEXT:    v_mfma_f64_4x4x4f64 v[0:1], v[2:3], v[4:5], v[0:1] cbsz:1 abid:2 blgp:3
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-VGPR-NEXT:    v_mfma_f64_4x4x4f64 v[2:3], v[2:3], v[4:5], v[0:1] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX90A-VGPR-NEXT:    s_nop 7
-; GFX90A-VGPR-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX90A-VGPR-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f64_4x4x4f64:
@@ -747,10 +747,10 @@ define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f64_4x4x4_4b_f64 v[0:1], v[2:3], v[4:5], 0
 ; GFX942-VGPR-NEXT:    s_nop 3
-; GFX942-VGPR-NEXT:    v_mfma_f64_4x4x4_4b_f64 v[0:1], v[2:3], v[4:5], v[0:1] cbsz:1 abid:2 neg:[1,1,0]
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-VGPR-NEXT:    v_mfma_f64_4x4x4_4b_f64 v[2:3], v[2:3], v[4:5], v[0:1] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-VGPR-NEXT:    s_nop 7
-; GFX942-VGPR-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx2 v4, v[2:3], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double 0.0, i32 0, i32 0, i32 0)
@@ -1629,20 +1629,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, 0x3ff00000
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v10, s2
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v11, s3
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v16, s2
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v17, s3
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v1, v0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v1, v0
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[18:19], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[12:13], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
+; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[8:15], v[16:17], v[18:19], v[8:15]
 ; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 1
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
@@ -1657,20 +1657,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, 0x3ff00000
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v10, s2
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v11, s3
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v16, s2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v17, s3
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], s[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[18:19], s[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
+; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[8:15], v[16:17], v[18:19], v[8:15]
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
@@ -1743,20 +1743,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v1, 0x405ec000
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v10, s2
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v11, s3
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v16, s2
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v17, s3
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v1
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v1
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v1
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[18:19], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[12:13], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
+; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[8:15], v[16:17], v[18:19], v[8:15]
 ; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 1
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
@@ -1771,20 +1771,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 0x405ec000
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v10, s2
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v11, s3
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v16, s2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v17, s3
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v1
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], s[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[18:19], s[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
+; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[8:15], v[16:17], v[18:19], v[8:15]
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
index 07a4f33f25b17..2fb677eccc4b3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
@@ -2460,7 +2460,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
 ; GFX942-SDAG:       ; %bb.0: ; %bb
 ; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX942-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -2481,11 +2480,12 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-SDAG-NEXT:    s_nop 10
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-SDAG-NEXT:    s_nop 9
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
 ; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_i8:
@@ -2525,7 +2525,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
 ; GFX950-SDAG:       ; %bb.0: ; %bb
 ; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX950-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -2546,11 +2545,12 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
 ; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-SDAG-NEXT:    s_nop 11
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; GFX950-SDAG-NEXT:    s_nop 10
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_i8:
@@ -3607,7 +3607,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
 ; GFX942-SDAG:       ; %bb.0: ; %bb
 ; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX942-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -3628,11 +3627,12 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-SDAG-NEXT:    s_nop 10
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-SDAG-NEXT:    s_nop 9
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
 ; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
@@ -3672,7 +3672,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
 ; GFX950-SDAG:       ; %bb.0: ; %bb
 ; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX950-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -3693,11 +3692,12 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
 ; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-SDAG-NEXT:    s_nop 11
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; GFX950-SDAG-NEXT:    s_nop 10
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
@@ -3910,7 +3910,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
 ; GFX942-SDAG:       ; %bb.0: ; %bb
 ; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX942-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -3931,11 +3930,12 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-SDAG-NEXT:    s_nop 10
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-SDAG-NEXT:    s_nop 9
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
 ; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
@@ -3975,7 +3975,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
 ; GFX950-SDAG:       ; %bb.0: ; %bb
 ; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX950-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -3996,11 +3995,12 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
 ; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-SDAG-NEXT:    s_nop 11
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; GFX950-SDAG-NEXT:    s_nop 10
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
@@ -4213,7 +4213,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
 ; GFX942-SDAG:       ; %bb.0: ; %bb
 ; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX942-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -4234,11 +4233,12 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-SDAG-NEXT:    s_nop 10
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-SDAG-NEXT:    s_nop 9
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
 ; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
@@ -4278,7 +4278,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
 ; GFX950-SDAG:       ; %bb.0: ; %bb
 ; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX950-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -4299,11 +4298,12 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
 ; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-SDAG-NEXT:    s_nop 11
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; GFX950-SDAG-NEXT:    s_nop 10
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
@@ -4516,7 +4516,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
 ; GFX942-SDAG:       ; %bb.0: ; %bb
 ; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX942-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -4537,11 +4536,12 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-SDAG-NEXT:    s_nop 10
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-SDAG-NEXT:    s_nop 9
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
 ; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
@@ -4581,7 +4581,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
 ; GFX950-SDAG:       ; %bb.0: ; %bb
 ; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX950-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -4602,11 +4601,12 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
 ; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-SDAG-NEXT:    s_nop 11
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; GFX950-SDAG-NEXT:    s_nop 10
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
index 13a96cfa6e650..ceeb00ba55197 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
@@ -269,26 +269,26 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg
 ; GCN-NEXT:    v_mov_b32_e32 v35, s23
 ; GCN-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_nop 2
-; GCN-NEXT:    v_mov_b32_e32 v16, s16
-; GCN-NEXT:    v_mov_b32_e32 v17, s17
-; GCN-NEXT:    v_mov_b32_e32 v18, s18
-; GCN-NEXT:    v_mov_b32_e32 v19, s19
-; GCN-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_mov_b32_e32 v32, s16
+; GCN-NEXT:    v_mov_b32_e32 v33, s17
+; GCN-NEXT:    v_mov_b32_e32 v34, s18
+; GCN-NEXT:    v_mov_b32_e32 v35, s19
+; GCN-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:32 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mov_b32_e32 v16, s12
-; GCN-NEXT:    v_mov_b32_e32 v17, s13
-; GCN-NEXT:    v_mov_b32_e32 v18, s14
-; GCN-NEXT:    v_mov_b32_e32 v19, s15
-; GCN-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT:    v_mov_b32_e32 v32, s12
+; GCN-NEXT:    v_mov_b32_e32 v33, s13
+; GCN-NEXT:    v_mov_b32_e32 v34, s14
+; GCN-NEXT:    v_mov_b32_e32 v35, s15
+; GCN-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:16 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mov_b32_e32 v16, s8
-; GCN-NEXT:    v_mov_b32_e32 v17, s9
-; GCN-NEXT:    v_mov_b32_e32 v18, s10
-; GCN-NEXT:    v_mov_b32_e32 v19, s11
-; GCN-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1
+; GCN-NEXT:    v_mov_b32_e32 v32, s8
+; GCN-NEXT:    v_mov_b32_e32 v33, s9
+; GCN-NEXT:    v_mov_b32_e32 v34, s10
+; GCN-NEXT:    v_mov_b32_e32 v35, s11
+; GCN-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -332,26 +332,26 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa
 ; GCN-NEXT:    v_mov_b32_e32 v35, s23
 ; GCN-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_nop 2
-; GCN-NEXT:    v_mov_b32_e32 v16, s16
-; GCN-NEXT:    v_mov_b32_e32 v17, s17
-; GCN-NEXT:    v_mov_b32_e32 v18, s18
-; GCN-NEXT:    v_mov_b32_e32 v19, s19
-; GCN-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_mov_b32_e32 v32, s16
+; GCN-NEXT:    v_mov_b32_e32 v33, s17
+; GCN-NEXT:    v_mov_b32_e32 v34, s18
+; GCN-NEXT:    v_mov_b32_e32 v35, s19
+; GCN-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:32 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mov_b32_e32 v16, s12
-; GCN-NEXT:    v_mov_b32_e32 v17, s13
-; GCN-NEXT:    v_mov_b32_e32 v18, s14
-; GCN-NEXT:    v_mov_b32_e32 v19, s15
-; GCN-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT:    v_mov_b32_e32 v32, s12
+; GCN-NEXT:    v_mov_b32_e32 v33, s13
+; GCN-NEXT:    v_mov_b32_e32 v34, s14
+; GCN-NEXT:    v_mov_b32_e32 v35, s15
+; GCN-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:16 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mov_b32_e32 v16, s8
-; GCN-NEXT:    v_mov_b32_e32 v17, s9
-; GCN-NEXT:    v_mov_b32_e32 v18, s10
-; GCN-NEXT:    v_mov_b32_e32 v19, s11
-; GCN-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1
+; GCN-NEXT:    v_mov_b32_e32 v32, s8
+; GCN-NEXT:    v_mov_b32_e32 v33, s9
+; GCN-NEXT:    v_mov_b32_e32 v34, s10
+; GCN-NEXT:    v_mov_b32_e32 v35, s11
+; GCN-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index eefd7b5fea63e..3646d81ed435b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -141,18 +141,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
 ; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; SDAG-NEXT:    v_mov_b32_e32 v8, 0
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[8:9]
-; SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[10:11]
+; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
 ; SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[12:13]
 ; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[14:15]
 ; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3]
+; SDAG-NEXT:    v_mfma_f32_16x16x32_f16 v[4:7], v[4:7], v[10:13], v[0:3]
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; SDAG-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
@@ -179,18 +179,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
 ; HEURRC-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; HEURRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; HEURRC-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; HEURRC-NEXT:    v_mov_b32_e32 v4, 0
+; HEURRC-NEXT:    v_mov_b32_e32 v8, 0
 ; HEURRC-NEXT:    s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], s[8:9]
-; HEURRC-NEXT:    v_mov_b64_e32 v[8:9], s[10:11]
+; HEURRC-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[10:11], s[12:13]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[12:13], s[14:15]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; HEURRC-NEXT:    s_nop 1
-; HEURRC-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3]
+; HEURRC-NEXT:    v_mfma_f32_16x16x32_f16 v[4:7], v[4:7], v[10:13], v[0:3]
 ; HEURRC-NEXT:    s_nop 7
-; HEURRC-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; HEURRC-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; HEURRC-NEXT:    s_endpgm
 ;
 ; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
@@ -198,18 +198,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
 ; VGPRRC-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; VGPRRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; VGPRRC-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; VGPRRC-NEXT:    v_mov_b32_e32 v4, 0
+; VGPRRC-NEXT:    v_mov_b32_e32 v8, 0
 ; VGPRRC-NEXT:    s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT:    v_mov_b64_e32 v[6:7], s[8:9]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], s[10:11]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[10:11], s[12:13]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[12:13], s[14:15]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; VGPRRC-NEXT:    s_nop 1
-; VGPRRC-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3]
+; VGPRRC-NEXT:    v_mfma_f32_16x16x32_f16 v[4:7], v[4:7], v[10:13], v[0:3]
 ; VGPRRC-NEXT:    s_nop 7
-; VGPRRC-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; VGPRRC-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; VGPRRC-NEXT:    s_endpgm
 ; AGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
 ; AGPR:       ; %bb.0:
@@ -260,18 +260,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
 ; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; SDAG-NEXT:    v_mov_b32_e32 v8, 0
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[8:9]
-; SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[10:11]
+; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
 ; SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[12:13]
 ; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[14:15]
 ; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT:    v_mfma_f32_16x16x32_f16 v[4:7], v[4:7], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; SDAG-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
@@ -298,18 +298,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
 ; HEURRC-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; HEURRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; HEURRC-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; HEURRC-NEXT:    v_mov_b32_e32 v4, 0
+; HEURRC-NEXT:    v_mov_b32_e32 v8, 0
 ; HEURRC-NEXT:    s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], s[8:9]
-; HEURRC-NEXT:    v_mov_b64_e32 v[8:9], s[10:11]
+; HEURRC-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[10:11], s[12:13]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[12:13], s[14:15]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; HEURRC-NEXT:    s_nop 1
-; HEURRC-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT:    v_mfma_f32_16x16x32_f16 v[4:7], v[4:7], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
 ; HEURRC-NEXT:    s_nop 7
-; HEURRC-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; HEURRC-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; HEURRC-NEXT:    s_endpgm
 ;
 ; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
@@ -317,18 +317,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
 ; VGPRRC-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; VGPRRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; VGPRRC-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; VGPRRC-NEXT:    v_mov_b32_e32 v4, 0
+; VGPRRC-NEXT:    v_mov_b32_e32 v8, 0
 ; VGPRRC-NEXT:    s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT:    v_mov_b64_e32 v[6:7], s[8:9]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], s[10:11]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[10:11], s[12:13]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[12:13], s[14:15]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; VGPRRC-NEXT:    s_nop 1
-; VGPRRC-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
+; VGPRRC-NEXT:    v_mfma_f32_16x16x32_f16 v[4:7], v[4:7], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
 ; VGPRRC-NEXT:    s_nop 7
-; VGPRRC-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; VGPRRC-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; VGPRRC-NEXT:    s_endpgm
 ; AGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
 ; AGPR:       ; %bb.0:
@@ -1506,26 +1506,26 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
 ; SDAG-NEXT:    v_mov_b32_e32 v35, s23
 ; SDAG-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    s_nop 2
-; SDAG-NEXT:    v_mov_b32_e32 v16, s16
-; SDAG-NEXT:    v_mov_b32_e32 v17, s17
-; SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
+; SDAG-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s12
+; SDAG-NEXT:    v_mov_b32_e32 v33, s13
+; SDAG-NEXT:    v_mov_b32_e32 v34, s14
+; SDAG-NEXT:    v_mov_b32_e32 v35, s15
+; SDAG-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:16 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s8
+; SDAG-NEXT:    v_mov_b32_e32 v33, s9
+; SDAG-NEXT:    v_mov_b32_e32 v34, s10
+; SDAG-NEXT:    v_mov_b32_e32 v35, s11
+; SDAG-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -1609,26 +1609,26 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
 ; HEURRC-NEXT:    v_mov_b32_e32 v35, s23
 ; HEURRC-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    s_nop 2
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s16
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s17
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s18
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s19
-; HEURRC-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT:    s_nop 0
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s16
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s17
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s18
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s19
+; HEURRC-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s12
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s13
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s14
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s15
-; HEURRC-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s12
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s13
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s14
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s15
+; HEURRC-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:16 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s8
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s9
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s10
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s11
-; HEURRC-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s8
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s9
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s10
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s11
+; HEURRC-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
@@ -1666,26 +1666,26 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
 ; VGPRRC-NEXT:    v_mov_b32_e32 v35, s23
 ; VGPRRC-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    s_nop 2
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s16
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s17
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s18
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT:    s_nop 0
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s16
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s17
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s18
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s19
+; VGPRRC-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s12
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s13
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s14
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s12
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s13
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s14
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s15
+; VGPRRC-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:16 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s8
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s9
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s10
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s8
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s9
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s10
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s11
+; VGPRRC-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
@@ -1848,26 +1848,26 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
 ; SDAG-NEXT:    v_mov_b32_e32 v35, s23
 ; SDAG-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    s_nop 2
-; SDAG-NEXT:    v_mov_b32_e32 v16, s16
-; SDAG-NEXT:    v_mov_b32_e32 v17, s17
-; SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
+; SDAG-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s12
+; SDAG-NEXT:    v_mov_b32_e32 v33, s13
+; SDAG-NEXT:    v_mov_b32_e32 v34, s14
+; SDAG-NEXT:    v_mov_b32_e32 v35, s15
+; SDAG-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:16 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s8
+; SDAG-NEXT:    v_mov_b32_e32 v33, s9
+; SDAG-NEXT:    v_mov_b32_e32 v34, s10
+; SDAG-NEXT:    v_mov_b32_e32 v35, s11
+; SDAG-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -1951,26 +1951,26 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
 ; HEURRC-NEXT:    v_mov_b32_e32 v35, s23
 ; HEURRC-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    s_nop 2
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s16
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s17
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s18
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s19
-; HEURRC-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT:    s_nop 0
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s16
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s17
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s18
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s19
+; HEURRC-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s12
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s13
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s14
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s15
-; HEURRC-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s12
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s13
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s14
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s15
+; HEURRC-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:16 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s8
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s9
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s10
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s11
-; HEURRC-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s8
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s9
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s10
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s11
+; HEURRC-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
@@ -2008,26 +2008,26 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
 ; VGPRRC-NEXT:    v_mov_b32_e32 v35, s23
 ; VGPRRC-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    s_nop 2
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s16
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s17
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s18
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT:    s_nop 0
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s16
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s17
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s18
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s19
+; VGPRRC-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s12
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s13
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s14
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s12
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s13
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s14
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s15
+; VGPRRC-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:16 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s8
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s9
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s10
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s8
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s9
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s10
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s11
+; VGPRRC-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
@@ -5411,18 +5411,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
 ; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[8:9]
-; GCN-NEXT:    v_mov_b64_e32 v[8:9], s[10:11]
+; GCN-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
 ; GCN-NEXT:    v_mov_b64_e32 v[10:11], s[12:13]
 ; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GCN-NEXT:    v_mov_b64_e32 v[12:13], s[14:15]
 ; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3]
+; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 v[4:7], v[4:7], v[10:13], v[0:3]
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GCN-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GCN-NEXT:    s_endpgm
 ;
 ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
@@ -5430,18 +5430,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
 ; HEURRC-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; HEURRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; HEURRC-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; HEURRC-NEXT:    v_mov_b32_e32 v4, 0
+; HEURRC-NEXT:    v_mov_b32_e32 v8, 0
 ; HEURRC-NEXT:    s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], s[8:9]
-; HEURRC-NEXT:    v_mov_b64_e32 v[8:9], s[10:11]
+; HEURRC-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[10:11], s[12:13]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[12:13], s[14:15]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; HEURRC-NEXT:    s_nop 1
-; HEURRC-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3]
+; HEURRC-NEXT:    v_mfma_f32_16x16x32_bf16 v[4:7], v[4:7], v[10:13], v[0:3]
 ; HEURRC-NEXT:    s_nop 7
-; HEURRC-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; HEURRC-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; HEURRC-NEXT:    s_endpgm
 ;
 ; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
@@ -5449,18 +5449,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
 ; VGPRRC-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; VGPRRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; VGPRRC-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; VGPRRC-NEXT:    v_mov_b32_e32 v4, 0
+; VGPRRC-NEXT:    v_mov_b32_e32 v8, 0
 ; VGPRRC-NEXT:    s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT:    v_mov_b64_e32 v[6:7], s[8:9]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], s[10:11]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[10:11], s[12:13]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[12:13], s[14:15]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; VGPRRC-NEXT:    s_nop 1
-; VGPRRC-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3]
+; VGPRRC-NEXT:    v_mfma_f32_16x16x32_bf16 v[4:7], v[4:7], v[10:13], v[0:3]
 ; VGPRRC-NEXT:    s_nop 7
-; VGPRRC-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; VGPRRC-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; VGPRRC-NEXT:    s_endpgm
 ; AGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
 ; AGPR:       ; %bb.0:
@@ -5511,18 +5511,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
 ; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[8:9]
-; GCN-NEXT:    v_mov_b64_e32 v[8:9], s[10:11]
+; GCN-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
 ; GCN-NEXT:    v_mov_b64_e32 v[10:11], s[12:13]
 ; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GCN-NEXT:    v_mov_b64_e32 v[12:13], s[14:15]
 ; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
+; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 v[4:7], v[4:7], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GCN-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GCN-NEXT:    s_endpgm
 ;
 ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
@@ -5530,18 +5530,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
 ; HEURRC-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; HEURRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; HEURRC-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; HEURRC-NEXT:    v_mov_b32_e32 v4, 0
+; HEURRC-NEXT:    v_mov_b32_e32 v8, 0
 ; HEURRC-NEXT:    s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], s[8:9]
-; HEURRC-NEXT:    v_mov_b64_e32 v[8:9], s[10:11]
+; HEURRC-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[10:11], s[12:13]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[12:13], s[14:15]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; HEURRC-NEXT:    s_nop 1
-; HEURRC-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT:    v_mfma_f32_16x16x32_bf16 v[4:7], v[4:7], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
 ; HEURRC-NEXT:    s_nop 7
-; HEURRC-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; HEURRC-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; HEURRC-NEXT:    s_endpgm
 ;
 ; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
@@ -5549,18 +5549,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
 ; VGPRRC-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; VGPRRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; VGPRRC-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; VGPRRC-NEXT:    v_mov_b32_e32 v4, 0
+; VGPRRC-NEXT:    v_mov_b32_e32 v8, 0
 ; VGPRRC-NEXT:    s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT:    v_mov_b64_e32 v[6:7], s[8:9]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], s[10:11]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[10:11], s[12:13]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[12:13], s[14:15]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; VGPRRC-NEXT:    s_nop 1
-; VGPRRC-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
+; VGPRRC-NEXT:    v_mfma_f32_16x16x32_bf16 v[4:7], v[4:7], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
 ; VGPRRC-NEXT:    s_nop 7
-; VGPRRC-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; VGPRRC-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; VGPRRC-NEXT:    s_endpgm
 ; AGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
 ; AGPR:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index da46ade4401f2..0c1448a0b8fb6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -245,41 +245,24 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half>
 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16:
@@ -324,41 +307,24 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16
 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
@@ -403,41 +369,24 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16
 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
@@ -723,41 +672,24 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfl
 ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28
 ; GCN-NEXT:    s_nop 11
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
@@ -767,41 +699,24 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, <
 ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; GCN-NEXT:    s_nop 11
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <16 x float> %result
@@ -811,41 +726,24 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, <
 ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; GCN-NEXT:    s_nop 11
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <16 x float> %result
@@ -1144,41 +1042,24 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1,
 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8:
@@ -1223,41 +1104,24 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32
 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
@@ -1302,41 +1166,24 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32
 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
@@ -2202,41 +2049,24 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32>
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
@@ -2281,41 +2111,24 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
@@ -2360,41 +2173,24 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
@@ -2604,41 +2400,24 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32>
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
@@ -2683,41 +2462,24 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
@@ -2762,41 +2524,24 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
@@ -3006,41 +2751,24 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32>
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
@@ -3085,41 +2813,24 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
@@ -3164,41 +2875,24 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
@@ -3408,41 +3102,24 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32>
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
@@ -3487,41 +3164,24 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
@@ -3566,41 +3226,24 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
index 6383bfb65d364..8d12971b2d5c0 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
@@ -369,74 +369,73 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 {
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_mov_b32 s4, 0
 ; CHECK-NEXT:    s_mov_b32 s5, s4
-; CHECK-NEXT:    v_mov_b64_e32 v[26:27], s[4:5]
+; CHECK-NEXT:    v_accvgpr_write_b32 a0, s4
+; CHECK-NEXT:    v_accvgpr_write_b32 a1, s5
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def v[16:19]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; CHECK-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; CHECK-NEXT:    v_mov_b32_e32 v18, 0x7fc00000
+; CHECK-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
+; CHECK-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
 ; CHECK-NEXT:    s_mov_b32 s0, 0x3c003c00
 ; CHECK-NEXT:    s_mov_b32 s1, s0
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[4:7], v[26:27], v[26:27], v[0:3]
-; CHECK-NEXT:    v_mov_b64_e32 v[28:29], s[0:1]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[6:9], a[0:1], a[0:1], v[2:5]
+; CHECK-NEXT:    v_accvgpr_write_b32 a3, s1
+; CHECK-NEXT:    v_accvgpr_write_b32 a2, s0
+; CHECK-NEXT:    v_mov_b32_e32 v19, v18
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[10:13], a[0:1], a[0:1], v[6:9]
+; CHECK-NEXT:    v_mov_b32_e32 v20, v18
+; CHECK-NEXT:    v_mov_b32_e32 v21, v18
 ; CHECK-NEXT:    s_mov_b32 s0, 0x7e007e00
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[6:9], a[0:1], a[2:3], v[2:5]
 ; CHECK-NEXT:    s_mov_b32 s1, s0
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[4:7], v[26:27], v[26:27], v[4:7]
-; CHECK-NEXT:    v_mov_b64_e32 v[30:31], s[0:1]
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[28:29], v[0:3]
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[6:9]
-; CHECK-NEXT:    s_nop 3
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v24, v4
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[12:15], v[26:27], v[30:31], v[0:3]
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    v_mov_b32_e32 v8, 0x7fc00000
-; CHECK-NEXT:    v_mov_b32_e32 v9, v8
-; CHECK-NEXT:    v_mov_b32_e32 v10, v8
-; CHECK-NEXT:    v_mov_b32_e32 v11, v8
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v2, v6
+; CHECK-NEXT:    v_accvgpr_write_b32 a5, s1
+; CHECK-NEXT:    v_accvgpr_write_b32 a4, s0
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[14:17], a[0:1], a[0:1], v[6:9]
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def v[6:9]
+; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_mov_b64_e32 v[0:1], 0
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[8:11], v[26:27], v[26:27], v[8:11]
-; CHECK-NEXT:    global_store_short v[0:1], v2, off
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v30, v10
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[16:19], a[0:1], a[0:1], v[18:21]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[2:5], a[0:1], a[4:5], v[2:5]
+; CHECK-NEXT:    s_nop 2
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v11, v14
+; CHECK-NEXT:    global_store_short v[0:1], v11, off
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[12:15], a[0:1], a[0:1], v[6:9]
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[28:29], v[16:19]
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[8:11]
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[20:23], v[26:27], v[26:27], v[16:19]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[6:9], a[0:1], a[2:3], v[6:9]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[22:25], a[0:1], a[0:1], v[16:19]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[26:29], a[0:1], a[0:1], v[2:5]
 ; CHECK-NEXT:    s_nop 5
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v10, v6
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[12:15]
-; CHECK-NEXT:    global_store_short v[0:1], v10, off
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[26:27], v[2:5]
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v31, v22
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[2:5], a[0:1], a[0:1], v[6:9]
+; CHECK-NEXT:    global_store_short v[0:1], v31, off
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[10:13], a[0:1], a[0:1], v[12:15]
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v31, v26
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    s_nop 1
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; CHECK-NEXT:    global_store_short v[0:1], v6, off
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[16:19], v[26:27], v[26:27], v[20:23]
+; CHECK-NEXT:    global_store_short v[0:1], v31, off
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[2:5], a[2:3], a[0:1], v[2:5]
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    global_store_short v[0:1], v24, off
+; CHECK-NEXT:    global_store_short v[0:1], v30, off
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[2:5], v[28:29], v[26:27], v[2:5]
-; CHECK-NEXT:    s_nop 6
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v6, v2
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[2:5], v[30:31], v[26:27], v[16:19]
-; CHECK-NEXT:    global_store_short v[0:1], v6, off
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v30, v2
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[2:5], a[4:5], a[0:1], v[10:13]
+; CHECK-NEXT:    global_store_short v[0:1], v30, off
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
 ; CHECK-NEXT:    s_nop 2
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; CHECK-NEXT:    global_store_short v[0:1], v2, off
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v31, v2
+; CHECK-NEXT:    global_store_short v[0:1], v31, off
 ; CHECK-NEXT:    s_endpgm
 entry:
   %k0 = call <4 x float> asm sideeffect "; def $0", "=s"()
@@ -607,6 +606,7 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add
 ; CHECK-NEXT:    v_mov_b64_e32 v[28:29], v[58:59]
 ; CHECK-NEXT:    v_mov_b64_e32 v[30:31], v[60:61]
 ; CHECK-NEXT:    v_mov_b64_e32 v[32:33], v[62:63]
+; CHECK-NEXT:    v_mov_b32_e32 v34, 0x41800000
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx4 v0, v[30:33], s[0:1] offset:112
 ; CHECK-NEXT:    global_store_dwordx4 v0, v[26:29], s[0:1] offset:96
@@ -616,10 +616,7 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add
 ; CHECK-NEXT:    global_store_dwordx4 v0, v[10:13], s[0:1] offset:32
 ; CHECK-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
 ; CHECK-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
-; CHECK-NEXT:    s_nop 1
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0x41800000
-; CHECK-NEXT:    s_nop 1
-; CHECK-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31]
+; CHECK-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v34, a[0:31]
 ; CHECK-NEXT:    s_nop 15
 ; CHECK-NEXT:    s_nop 1
 ; CHECK-NEXT:    global_store_dwordx4 v0, a[24:27], s[2:3] offset:96



More information about the llvm-commits mailing list