[llvm] [AMDGPU] Improve register allocation to reduce MFMA hazard NOPs (PR #156943)

Syadus Sefat via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 23 08:49:40 PDT 2025


https://github.com/mssefat updated https://github.com/llvm/llvm-project/pull/156943

>From 83b034225bfa5ac897bcf4cbd6ab05ea7214fffd Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Thu, 4 Sep 2025 12:31:49 -0400
Subject: [PATCH 01/18] [AMDGPU] Improve register allocation to reduce MFMA
 hazard NOPs

rebased
---
 .../Target/AMDGPU/GCNPreRAOptimizations.cpp   |   94 ++
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h |   14 +
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp     |   32 +
 llvm/lib/Target/AMDGPU/SIRegisterInfo.h       |    4 +-
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir |  523 +++----
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir |  542 +++----
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll |  184 +--
 .../AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll    |   62 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll |  456 +++---
 ...amdgcn.mfma.hint.hazard.barrier.gfx942.mir | 1292 +++++++++++++++++
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll  |  146 +-
 ...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll |  159 +-
 .../AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll    |   12 +-
 .../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll       |  231 +--
 .../AMDGPU/rewrite-vgpr-mfma-to-agpr.ll       |  123 +-
 .../unspill-vgpr-after-rewrite-vgpr-mfma.ll   |   33 +-
 16 files changed, 2673 insertions(+), 1234 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir

diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 4deb2a9485e4d..6d2b10bdb5804 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -34,6 +34,7 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -43,6 +44,12 @@ using namespace llvm;
 
 #define DEBUG_TYPE "amdgpu-pre-ra-optimizations"
 
+static cl::opt<bool> EnableRegisterAvoidListForMFMARegs(
+    "amdgpu-avoid-hazard-hint-for-mfma", cl::Hidden,
+    cl::desc("Enable Register Avoidance for "
+             "MFMA in GCNPreRAOptimizations stage."),
+    cl::init(true));
+
 namespace {
 
 class GCNPreRAOptimizationsImpl {
@@ -248,6 +255,93 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
 
   bool Changed = false;
 
+  // Single pass over each block: record avoidance hints for MFMA registers.
+  if (EnableRegisterAvoidListForMFMARegs && ST.hasMAIInsts()) {
+    // Max number of recent MFMAs tracked when looking back for RAW/WAW hazards
+    constexpr unsigned MaxLookbackWindow = 19;
+    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+    for (const MachineBasicBlock &MBB : MF) {
+
+      SmallVector<std::pair<SlotIndex, SmallVector<Register, 4>>, 16>
+          RecentMFMAs;
+      for (const MachineInstr &MI : MBB) {
+        if (MI.isDebugInstr())
+          continue;
+        const SlotIndex CurrentSlot = LIS->getInstructionIndex(MI).getRegSlot();
+        // Handle MFMA instructions
+        if (SIInstrInfo::isMFMA(MI)) {
+          SmallVector<Register, 4> MFMARegisters;
+          auto collectMFMARegister = [&](unsigned OpIdx) {
+            if (OpIdx >= MI.getNumOperands())
+              return;
+
+            const MachineOperand &MO = MI.getOperand(OpIdx);
+            if (MO.isReg() && MO.getReg().isVirtual())
+              MFMARegisters.push_back(MO.getReg());
+          };
+          // Only collect Matrix C (operand 3) and destination (operand 0)
+          // registers
+          collectMFMARegister(0);
+          collectMFMARegister(3);
+
+          if (!MFMARegisters.empty()) {
+            RecentMFMAs.emplace_back(CurrentSlot, std::move(MFMARegisters));
+            // Maintain window
+            if (RecentMFMAs.size() > MaxLookbackWindow)
+              RecentMFMAs.erase(RecentMFMAs.begin());
+          }
+          continue;
+        }
+        bool ShouldCheckReuse = MI.mayLoad() || MI.mayStore() || MI.isCopy() ||
+                                SIInstrInfo::isVALU(MI);
+        // Skip non-relevant instructions, or skip until at least one MFMA is
+        // encountered
+        if (!ShouldCheckReuse || RecentMFMAs.empty())
+          continue;
+
+        // Process operands that might reuse MFMA registers
+        for (const MachineOperand &MO : MI.operands()) {
+          if (!MO.isReg() || !MO.getReg().isVirtual())
+            continue;
+
+          const Register CandidateReg = MO.getReg();
+          const TargetRegisterClass *CandidateRC =
+              MRI->getRegClass(CandidateReg);
+
+          // Only process VGPR registers
+          if (!TRI->isVGPRClass(CandidateRC))
+            continue;
+
+          for (auto It = RecentMFMAs.rbegin(); It != RecentMFMAs.rend(); ++It) {
+            const SmallVector<Register, 4> &MFMARegs = It->second;
+            for (Register MFMAReg : MFMARegs) {
+              // Verify register class compatibility
+              const TargetRegisterClass *MFMARC = MRI->getRegClass(MFMAReg);
+              if (!TRI->hasVGPRs(MFMARC))
+                continue;
+
+              // Check if MFMA register is dead at current instruction
+              const LiveInterval &MFMAInterval = LIS->getInterval(MFMAReg);
+              if (!MFMAInterval.liveAt(CurrentSlot)) {
+
+                // Add bidirectional avoidance hint
+                MFI->addRegisterToAvoid(CandidateReg, MFMAReg);
+                MFI->addRegisterToAvoid(MFMAReg, CandidateReg);
+
+                // Mark both registers as having an avoidance list
+                MRI->setRegAllocationHint(
+                    MFMAReg, AMDGPURI::HasRegisterAvoidanceList, Register());
+                MRI->setRegAllocationHint(CandidateReg,
+                                          AMDGPURI::HasRegisterAvoidanceList,
+                                          Register());
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
   for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
     Register Reg = Register::index2VirtReg(I);
     if (!LIS->hasInterval(Reg))
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 45606153db58e..8df4c12b5a77d 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -1212,6 +1212,20 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   unsigned getMaxNumWorkGroupsZ() const { return MaxNumWorkGroups[2]; }
 
   AMDGPU::ClusterDimsAttr getClusterDims() const { return ClusterDims; }
+
+  // Map of registers to avoid for a given register
+  DenseMap<Register, SmallVector<Register, 8>> RegisterAvoidanceMap;
+
+  void addRegisterToAvoid(Register VirtReg, Register AvoidReg) {
+    RegisterAvoidanceMap[VirtReg].push_back(AvoidReg);
+  }
+
+  ArrayRef<Register> getRegistersToAvoid(Register VirtReg) const {
+    auto It = RegisterAvoidanceMap.find(VirtReg);
+    if (It != RegisterAvoidanceMap.end())
+      return It->second;
+    return ArrayRef<Register>();
+  }
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 205237fefe785..d5ac52997dc57 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3839,6 +3839,38 @@ bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
     }
     return false;
   }
+  case AMDGPURI::HasRegisterAvoidanceList: {
+    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+    ArrayRef<Register> AvoidRegs = MFI->getRegistersToAvoid(VirtReg);
+
+    if (AvoidRegs.empty())
+      return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
+                                                       MF, VRM);
+    // Collect physical registers to avoid
+    SmallSet<MCPhysReg, 32> AvoidPhysRegs;
+    for (Register AvoidReg : AvoidRegs) {
+      if (VRM && VRM->hasPhys(AvoidReg)) {
+        // Virtual register already mapped - try to avoid its physical register
+        MCPhysReg AvoidPhys = VRM->getPhys(AvoidReg);
+        for (MCRegAliasIterator AI(AvoidPhys, this, true); AI.isValid(); ++AI)
+          AvoidPhysRegs.insert(*AI);
+      }
+    }
+
+    if (AvoidPhysRegs.empty()) {
+      // None of the avoided registers is assigned yet - use default order
+      return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
+                                                       MF, VRM);
+    }
+
+    // Prioritize registers that don't conflict with avoided registers
+    for (MCPhysReg PhysReg : Order) {
+      if (!AvoidPhysRegs.count(PhysReg) && !MRI.isReserved(PhysReg))
+        Hints.push_back(PhysReg);
+    }
+
+    return false;
+  }
   default:
     return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
                                                      VRM);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 7b91ba7bc581f..ed0c580abc952 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -31,9 +31,11 @@ class RegisterBank;
 struct SGPRSpillBuilder;
 
 /// Register allocation hint types. Helps eliminate unneeded COPY with True16
+/// HasRegisterAvoidanceList marks a virtual register whose allocation hints
+/// should steer it away from the physical registers of conflicting registers.
 namespace AMDGPURI {
 
-enum { Size16 = 1, Size32 = 2 };
+enum { Size16 = 1, Size32 = 2, HasRegisterAvoidanceList = 3 };
 
 } // end namespace AMDGPURI
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
index b07dec326327e..d4380fd41310a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
@@ -15,9 +15,12 @@
   ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
   ; GCN-NEXT:    ; implicit-def: $vgpr106
   ; GCN-NEXT:    ; implicit-def: $vgpr132
+  ; GCN-NEXT:    ; implicit-def: $vgpr112
+  ; GCN-NEXT:    ; implicit-def: $vgpr113
+  ; GCN-NEXT:    ; implicit-def: $vgpr114
+  ; GCN-NEXT:    ; implicit-def: $vgpr115
   ; GCN-NEXT:    ; implicit-def: $vgpr133
   ; GCN-NEXT:    ; implicit-def: $vgpr139
-  ; GCN-NEXT:    ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127
   ; GCN-NEXT:    ; iglp_opt mask(0x00000002)
   ; GCN-NEXT:    ; implicit-def: $sgpr0
   ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -167,46 +170,45 @@
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b128 v95, v[68:71] offset:1024
-  ; GCN-NEXT:    ; implicit-def: $vgpr64
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
-  ; GCN-NEXT:    v_add_u32_e32 v72, 0xc0, v93
-  ; GCN-NEXT:    ; implicit-def: $vgpr73
-  ; GCN-NEXT:    v_add_u32_e32 v76, v132, v64
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:192 sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
+  ; GCN-NEXT:    v_add_u32_e32 v72, 0xc0, v93
+  ; GCN-NEXT:    v_add_u32_e32 v73, v132, v112
   ; GCN-NEXT:    buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ; kill: killed $vgpr72
-  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v73
-  ; GCN-NEXT:    buffer_load_dwordx2 v[98:99], v76, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v113
+  ; GCN-NEXT:    buffer_load_dwordx2 v[98:99], v73, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    buffer_load_dwordx2 v[102:103], v72, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
-  ; GCN-NEXT:    ; implicit-def: $vgpr74
-  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v74
-  ; GCN-NEXT:    ; implicit-def: $vgpr75
+  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v114
   ; GCN-NEXT:    buffer_load_dwordx2 v[100:101], v72, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v75
+  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v115
   ; GCN-NEXT:    buffer_load_dwordx2 v[104:105], v72, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
+  ; GCN-NEXT:    ; kill: killed $vgpr73
   ; GCN-NEXT:    ds_read_b128 v[72:75], v94
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ; kill: killed $vgpr76
   ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
   ; GCN-NEXT:    ; implicit-def: $sgpr8
+  ; GCN-NEXT:    ; implicit-def: $vgpr112
+  ; GCN-NEXT:    ; implicit-def: $vgpr113
+  ; GCN-NEXT:    ; implicit-def: $vgpr114
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
   ; GCN-NEXT:    ds_read_b128 v[72:75], v94 offset:512
@@ -411,8 +413,6 @@
   ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v66
   ; GCN-NEXT:    ; implicit-def: $vgpr65
   ; GCN-NEXT:    ; implicit-def: $vgpr66
-  ; GCN-NEXT:    ; implicit-def: $vgpr68
-  ; GCN-NEXT:    ; implicit-def: $vgpr67
   ; GCN-NEXT:    v_add_u32_e32 v65, s7, v65
   ; GCN-NEXT:    v_and_b32_e32 v65, 0x1fffffff, v65
   ; GCN-NEXT:    v_mul_lo_u32 v65, v65, s6
@@ -440,40 +440,36 @@
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b64 v138, v[96:97]
-  ; GCN-NEXT:    v_add_u32_e32 v68, v132, v68
+  ; GCN-NEXT:    ; implicit-def: $vgpr96
   ; GCN-NEXT:    v_cndmask_b32_e64 v64, v65, v64, s[6:7]
   ; GCN-NEXT:    v_max_f32_e32 v64, v64, v64
   ; GCN-NEXT:    ; implicit-def: $vgpr65
   ; GCN-NEXT:    v_max_f32_e32 v66, v65, v65
   ; GCN-NEXT:    v_max_f32_e32 v134, v66, v64
-  ; GCN-NEXT:    ; implicit-def: $vgpr64
+  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v96
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_load_dwordx2 v[156:157], v68, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[160:161], v64, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v64
-  ; GCN-NEXT:    buffer_load_dwordx2 v[158:159], v64, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v112
+  ; GCN-NEXT:    buffer_load_dwordx2 v[162:163], v64, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ; implicit-def: $vgpr66
-  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v66
+  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v113
   ; GCN-NEXT:    buffer_load_dwordx2 v[128:129], v64, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v67
+  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v114
   ; GCN-NEXT:    buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_fma_f32 v57, s4, v57, -v134
   ; GCN-NEXT:    v_fma_f32 v48, s4, v48, -v134
-  ; GCN-NEXT:    v_fma_f32 v96, s4, v58, -v134
-  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v57
+  ; GCN-NEXT:    v_fma_f32 v57, s4, v57, -v134
   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v48
   ; GCN-NEXT:    v_fma_f32 v64, s4, v49, -v134
-  ; GCN-NEXT:    v_exp_f32_e32 v163, v57
-  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v96
+  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v57
   ; GCN-NEXT:    v_fma_f32 v66, s4, v50, -v134
-  ; GCN-NEXT:    v_exp_f32_e32 v164, v57
+  ; GCN-NEXT:    v_exp_f32_e32 v165, v57
   ; GCN-NEXT:    v_exp_f32_e32 v49, v48
   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v64
   ; GCN-NEXT:    v_fma_f32 v67, s4, v51, -v134
@@ -499,31 +495,30 @@
   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v70
   ; GCN-NEXT:    v_exp_f32_e32 v55, v48
   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v71
-  ; GCN-NEXT:    ds_read_b128 v[144:147], v139 offset:576
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_fma_f32 v66, s4, v56, -v134
   ; GCN-NEXT:    v_exp_f32_e32 v56, v48
   ; GCN-NEXT:    v_sub_f32_e32 v48, v65, v134
+  ; GCN-NEXT:    ds_read_b128 v[144:147], v139 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v64, v49
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v67, v50
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v68, v51
+  ; GCN-NEXT:    v_fma_f32 v96, s4, v58, -v134
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v58, v52
   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v48
   ; GCN-NEXT:    ds_read_b128 v[148:151], v139 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_exp_f32_e32 v48, v48
-  ; GCN-NEXT:    v_pack_b32_f16 v161, v68, v58
-  ; GCN-NEXT:    v_pack_b32_f16 v160, v64, v67
-  ; GCN-NEXT:    v_mul_f32_e32 v58, 0x3fb8aa3b, v66
+  ; GCN-NEXT:    v_fma_f32 v156, s4, v59, -v134
+  ; GCN-NEXT:    v_pack_b32_f16 v59, v68, v58
+  ; GCN-NEXT:    v_pack_b32_f16 v58, v64, v67
+  ; GCN-NEXT:    v_mul_f32_e32 v80, 0x3fb8aa3b, v66
   ; GCN-NEXT:    ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79
   ; GCN-NEXT:    ds_read_b128 v[152:155], v139 offset:1728
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_fma_f32 v162, s4, v61, -v134
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v61, v55
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v57, v56
   ; GCN-NEXT:    v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0]
@@ -532,9 +527,15 @@
   ; GCN-NEXT:    v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v96
+  ; GCN-NEXT:    ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111
+  ; GCN-NEXT:    v_fma_f32 v157, s4, v60, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[58:59], v[64:79]
+  ; GCN-NEXT:    v_exp_f32_e32 v141, v80
   ; GCN-NEXT:    ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
-  ; GCN-NEXT:    v_fma_f32 v59, s4, v59, -v134
+  ; GCN-NEXT:    v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0]
+<<<<<<< HEAD
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79]
   ; GCN-NEXT:    v_mul_f32_e64 v82, v82, v48
   ; GCN-NEXT:    v_mul_f32_e64 v83, v83, v48
@@ -542,10 +543,16 @@
   ; GCN-NEXT:    v_mul_f32_e64 v85, v85, v48
   ; GCN-NEXT:    v_mul_f32_e64 v86, v86, v48
   ; GCN-NEXT:    v_mul_f32_e64 v87, v87, v48
+=======
+  ; GCN-NEXT:    v_pk_mul_f32 v[82:83], v[82:83], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[84:85], v[84:85], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[86:87], v[86:87], v[48:49] op_sel_hi:[1,0]
+>>>>>>> ee1ade05012a ([AMDGPU] Improve register allocation to reduce MFMA hazard NOPs)
   ; GCN-NEXT:    v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0]
+<<<<<<< HEAD
   ; GCN-NEXT:    ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111
   ; GCN-NEXT:    v_exp_f32_e32 v58, v58
   ; GCN-NEXT:    v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0]
@@ -556,13 +563,17 @@
   ; GCN-NEXT:    v_mul_f32_e64 v101, v101, v48
   ; GCN-NEXT:    v_mul_f32_e64 v102, v102, v48
   ; GCN-NEXT:    v_mul_f32_e64 v103, v103, v48
+=======
+  ; GCN-NEXT:    v_pk_mul_f32 v[98:99], v[98:99], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[100:101], v[100:101], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[102:103], v[102:103], v[48:49] op_sel_hi:[1,0]
+>>>>>>> ee1ade05012a ([AMDGPU] Improve register allocation to reduce MFMA hazard NOPs)
   ; GCN-NEXT:    v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pack_b32_f16 v145, v61, v57
-  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v59
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v140, v53
+<<<<<<< HEAD
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v141, v54
   ; GCN-NEXT:    v_exp_f32_e32 v59, v57
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111]
@@ -571,249 +582,264 @@
   ; GCN-NEXT:    v_mul_f32_e64 v113, v113, v48
   ; GCN-NEXT:    v_mul_f32_e64 v114, v114, v48
   ; GCN-NEXT:    v_mul_f32_e64 v115, v115, v48
+=======
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[58:59], v[80:95]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v144, v54
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v145, v55
+  ; GCN-NEXT:    v_exp_f32_e32 v167, v57
+  ; GCN-NEXT:    ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127
+  ; GCN-NEXT:    v_mul_f32_e32 v168, 0x3fb8aa3b, v157
+  ; GCN-NEXT:    v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[114:115], v[114:115], v[48:49] op_sel_hi:[1,0]
+>>>>>>> ee1ade05012a ([AMDGPU] Improve register allocation to reduce MFMA hazard NOPs)
   ; GCN-NEXT:    v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[58:59], v[96:111]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v148, v56
   ; GCN-NEXT:    v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_fma_f32 v148, s4, v62, -v134
-  ; GCN-NEXT:    v_pack_b32_f16 v144, v140, v141
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[160:161], v[112:127]
+  ; GCN-NEXT:    v_pack_b32_f16 v149, v145, v148
+  ; GCN-NEXT:    v_pack_b32_f16 v148, v140, v144
+  ; GCN-NEXT:    v_mul_f32_e32 v140, 0x3fb8aa3b, v156
+  ; GCN-NEXT:    v_exp_f32_e32 v168, v168
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[58:59], v[112:127]
+  ; GCN-NEXT:    v_exp_f32_e32 v153, v140
+  ; GCN-NEXT:    ; implicit-def: $vgpr140
+  ; GCN-NEXT:    v_fma_f32 v164, s4, v61, -v134
+  ; GCN-NEXT:    v_fma_f32 v166, s4, v62, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v169, v141
   ; GCN-NEXT:    v_fma_f32 v152, s4, v63, -v134
-  ; GCN-NEXT:    v_mul_f32_e32 v149, 0x3fb8aa3b, v60
-  ; GCN-NEXT:    ; implicit-def: $vgpr57
-  ; GCN-NEXT:    ds_read_b128 v[60:63], v57
+  ; GCN-NEXT:    v_fma_f32 v32, s4, v32, -v134
+  ; GCN-NEXT:    v_fma_f32 v57, s4, v35, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[148:149], v[64:79]
+  ; GCN-NEXT:    ds_read_b128 v[142:145], v140
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_exp_f32_e32 v160, v149
-  ; GCN-NEXT:    v_fma_f32 v161, s4, v33, -v134
-  ; GCN-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v148
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v153, v58
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79]
-  ; GCN-NEXT:    v_fma_f32 v32, s4, v32, -v134
-  ; GCN-NEXT:    ds_read_b128 v[140:143], v57 offset:576
+  ; GCN-NEXT:    ds_read_b128 v[156:159], v140 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_fma_f32 v40, s4, v40, -v134
   ; GCN-NEXT:    v_fma_f32 v44, s4, v44, -v134
   ; GCN-NEXT:    v_fma_f32 v16, s4, v16, -v134
-  ; GCN-NEXT:    v_fma_f32 v166, s4, v20, -v134
   ; GCN-NEXT:    v_fma_f32 v24, s4, v24, -v134
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95]
-  ; GCN-NEXT:    v_mul_f32_e32 v146, 0x3fb8aa3b, v162
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v147, v163
-  ; GCN-NEXT:    v_exp_f32_e32 v162, v146
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v146, v164
   ; GCN-NEXT:    v_fma_f32 v28, s4, v28, -v134
-  ; GCN-NEXT:    v_pack_b32_f16 v148, v153, v147
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[148:149], v[80:95]
+  ; GCN-NEXT:    v_mul_f32_e32 v146, 0x3fb8aa3b, v164
+  ; GCN-NEXT:    v_fma_f32 v164, s4, v33, -v134
+  ; GCN-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v166
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v147, v165
+  ; GCN-NEXT:    v_exp_f32_e32 v170, v146
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v146, v167
   ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v134
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[144:145], v[96:111]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[148:149], v[96:111]
   ; GCN-NEXT:    v_exp_f32_e32 v151, v33
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v59
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v153
+  ; GCN-NEXT:    v_pack_b32_f16 v62, v169, v147
   ; GCN-NEXT:    v_fma_f32 v150, s4, v34, -v134
-  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v134
-  ; GCN-NEXT:    v_fma_f32 v12, s4, v12, -v134
-  ; GCN-NEXT:    v_pack_b32_f16 v149, v146, v33
+  ; GCN-NEXT:    v_perm_b32 v147, v131, v129, s8
+  ; GCN-NEXT:    v_pack_b32_f16 v63, v146, v33
   ; GCN-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v152
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[144:145], v[112:127]
-  ; GCN-NEXT:    v_fma_f32 v152, s4, v35, -v134
-  ; GCN-NEXT:    v_exp_f32_e32 v153, v33
-  ; GCN-NEXT:    v_fma_f32 v155, s4, v36, -v134
-  ; GCN-NEXT:    v_perm_b32 v36, v158, v156, s5
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v154, v160
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[60:61], v[148:149], v[64:79]
-  ; GCN-NEXT:    v_mul_f32_e32 v60, 0x3fb8aa3b, v32
-  ; GCN-NEXT:    ds_read_b128 v[32:35], v57 offset:1152
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[144:147], v57 offset:1728
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mul_f32_e32 v61, 0x3fb8aa3b, v161
-  ; GCN-NEXT:    v_exp_f32_e32 v165, v60
-  ; GCN-NEXT:    v_perm_b32 v60, v158, v156, s8
-  ; GCN-NEXT:    v_fma_f32 v158, s4, v37, -v134
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[148:149], v[80:95]
-  ; GCN-NEXT:    v_exp_f32_e32 v161, v61
-  ; GCN-NEXT:    v_perm_b32 v140, v159, v157, s8
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[148:149], v[112:127]
+  ; GCN-NEXT:    v_exp_f32_e32 v148, v33
+  ; GCN-NEXT:    v_fma_f32 v152, s4, v36, -v134
+  ; GCN-NEXT:    v_perm_b32 v36, v162, v160, s5
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v149, v168
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v155, v170
+  ; GCN-NEXT:    v_perm_b32 v146, v163, v161, s8
+  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[62:63], v[64:79]
+  ; GCN-NEXT:    v_mul_f32_e32 v142, 0x3fb8aa3b, v32
+  ; GCN-NEXT:    ds_read_b128 v[32:35], v140 offset:1152
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[58:61], v140 offset:1728
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mul_f32_e32 v143, 0x3fb8aa3b, v164
+  ; GCN-NEXT:    v_exp_f32_e32 v154, v142
+  ; GCN-NEXT:    v_perm_b32 v142, v162, v160, s8
+  ; GCN-NEXT:    v_fma_f32 v160, s4, v38, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[156:157], v[62:63], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v157, v143
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v38, v148
+  ; GCN-NEXT:    v_fma_f32 v156, s4, v37, -v134
   ; GCN-NEXT:    v_perm_b32 v37, v130, v128, s5
-  ; GCN-NEXT:    v_perm_b32 v61, v130, v128, s8
-  ; GCN-NEXT:    v_perm_b32 v141, v131, v129, s8
+  ; GCN-NEXT:    v_perm_b32 v143, v130, v128, s8
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    ds_write_b64 v135, v[36:37]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[148:149], v[96:111]
-  ; GCN-NEXT:    v_perm_b32 v32, v159, v157, s5
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[62:63], v[96:111]
   ; GCN-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v150
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v150, v151
-  ; GCN-NEXT:    v_fma_f32 v157, s4, v38, -v134
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v38, v153
-  ; GCN-NEXT:    v_exp_f32_e32 v159, v33
+  ; GCN-NEXT:    v_perm_b32 v32, v163, v161, s5
+  ; GCN-NEXT:    v_exp_f32_e32 v161, v33
   ; GCN-NEXT:    v_perm_b32 v33, v131, v129, s5
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[144:145], v[148:149], v[112:127]
-  ; GCN-NEXT:    v_pack_b32_f16 v129, v150, v38
-  ; GCN-NEXT:    v_mul_f32_e32 v38, 0x3fb8aa3b, v152
-  ; GCN-NEXT:    v_exp_f32_e32 v152, v38
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b64 v136, v[60:61]
+  ; GCN-NEXT:    ds_write_b64 v136, v[142:143]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b64 v137, v[32:33]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[58:59], v[62:63], v[112:127]
+  ; GCN-NEXT:    v_pack_b32_f16 v59, v150, v38
+  ; GCN-NEXT:    v_mul_f32_e32 v38, 0x3fb8aa3b, v57
+  ; GCN-NEXT:    v_pack_b32_f16 v58, v149, v155
+  ; GCN-NEXT:    v_exp_f32_e32 v149, v38
   ; GCN-NEXT:    ; implicit-def: $vgpr33
   ; GCN-NEXT:    ; implicit-def: $vgpr38
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b64 v138, v[140:141]
+  ; GCN-NEXT:    ds_write_b64 v138, v[146:147]
   ; GCN-NEXT:    v_add_u32_e32 v38, v132, v38
   ; GCN-NEXT:    v_add_u32_e32 v33, v132, v33
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_load_dwordx2 v[130:131], v38, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[62:63], v38, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx2 v[140:141], v33, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[142:143], v33, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ; implicit-def: $vgpr36
   ; GCN-NEXT:    v_add_u32_e32 v33, v132, v36
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[144:145], v[58:59], v[64:79]
   ; GCN-NEXT:    ; implicit-def: $vgpr37
   ; GCN-NEXT:    buffer_load_dwordx2 v[144:145], v33, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_add_u32_e32 v33, v132, v37
-  ; GCN-NEXT:    buffer_load_dwordx2 v[148:149], v33, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[146:147], v33, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v156, v162
-  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v155
+  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v152
+  ; GCN-NEXT:    v_exp_f32_e32 v150, v32
+  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v156
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v165
-  ; GCN-NEXT:    v_pack_b32_f16 v128, v154, v156
-  ; GCN-NEXT:    v_fma_f32 v150, s4, v39, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[158:159], v[58:59], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v156, v32
+  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v160
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v154
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v152, v157
+  ; GCN-NEXT:    v_fma_f32 v57, s4, v39, -v134
   ; GCN-NEXT:    ds_read_b128 v[36:39], v139
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[62:63], v[128:129], v[64:79]
-  ; GCN-NEXT:    v_exp_f32_e32 v154, v32
-  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v158
-  ; GCN-NEXT:    ds_read_b128 v[60:63], v139 offset:576
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_fma_f32 v156, s4, v42, -v134
-  ; GCN-NEXT:    v_perm_b32 v20, v140, v130, s5
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[142:143], v[128:129], v[80:95]
-  ; GCN-NEXT:    v_exp_f32_e32 v155, v32
-  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v157
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v142, v161
-  ; GCN-NEXT:    v_fma_f32 v143, s4, v41, -v134
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[128:129], v[96:111]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v34, v159
-  ; GCN-NEXT:    v_exp_f32_e32 v157, v32
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v32, v152
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[146:147], v[128:129], v[112:127]
-  ; GCN-NEXT:    v_pack_b32_f16 v129, v34, v32
-  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v150
-  ; GCN-NEXT:    v_pack_b32_f16 v128, v33, v142
-  ; GCN-NEXT:    v_exp_f32_e32 v146, v32
+  ; GCN-NEXT:    ds_read_b128 v[128:131], v139 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[58:59], v[96:111]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v34, v161
+  ; GCN-NEXT:    v_exp_f32_e32 v159, v32
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v32, v149
+  ; GCN-NEXT:    v_fma_f32 v155, s4, v41, -v134
+  ; GCN-NEXT:    v_fma_f32 v158, s4, v42, -v134
+  ; GCN-NEXT:    v_fma_f32 v162, s4, v20, -v134
+  ; GCN-NEXT:    v_fma_f32 v12, s4, v12, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[60:61], v[58:59], v[112:127]
+  ; GCN-NEXT:    v_pack_b32_f16 v59, v34, v32
+  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v57
+  ; GCN-NEXT:    v_pack_b32_f16 v58, v33, v152
+  ; GCN-NEXT:    v_exp_f32_e32 v60, v32
   ; GCN-NEXT:    ds_read_b128 v[32:35], v139 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_fma_f32 v142, s4, v43, -v134
-  ; GCN-NEXT:    v_fma_f32 v150, s4, v46, -v134
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[128:129], v[64:79]
+  ; GCN-NEXT:    v_fma_f32 v57, s4, v43, -v134
+  ; GCN-NEXT:    v_perm_b32 v20, v142, v62, s5
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[58:59], v[64:79]
   ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v40
   ; GCN-NEXT:    ds_read_b128 v[40:43], v139 offset:1728
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_exp_f32_e32 v147, v36
-  ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v143
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v37, v154
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[60:61], v[128:129], v[80:95]
-  ; GCN-NEXT:    v_exp_f32_e32 v143, v36
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v60, v155
-  ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v142
-  ; GCN-NEXT:    v_fma_f32 v61, s4, v45, -v134
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[128:129], v[96:111]
-  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v156
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v157
-  ; GCN-NEXT:    v_exp_f32_e32 v156, v32
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v32, v146
+  ; GCN-NEXT:    v_exp_f32_e32 v61, v36
+  ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v155
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v37, v150
+  ; GCN-NEXT:    v_fma_f32 v155, s4, v46, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[128:129], v[58:59], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v152, v36
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v128, v156
+  ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v57
+  ; GCN-NEXT:    v_fma_f32 v129, s4, v45, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[58:59], v[96:111]
+  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v158
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v159
+  ; GCN-NEXT:    v_exp_f32_e32 v158, v32
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v32, v60
   ; GCN-NEXT:    v_pack_b32_f16 v33, v33, v32
-  ; GCN-NEXT:    v_pack_b32_f16 v32, v37, v60
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[128:129], v[112:127]
-  ; GCN-NEXT:    v_exp_f32_e32 v129, v36
+  ; GCN-NEXT:    v_pack_b32_f16 v32, v37, v128
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[58:59], v[112:127]
+  ; GCN-NEXT:    v_exp_f32_e32 v57, v36
   ; GCN-NEXT:    v_mul_f32_e32 v40, 0x3fb8aa3b, v44
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v60, v147
-  ; GCN-NEXT:    v_fma_f32 v128, s4, v47, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v59, v61
+  ; GCN-NEXT:    v_fma_f32 v58, s4, v47, -v134
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79]
-  ; GCN-NEXT:    ds_read_b128 v[36:39], v57
+  ; GCN-NEXT:    ds_read_b128 v[36:39], v140
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_exp_f32_e32 v142, v40
-  ; GCN-NEXT:    v_mul_f32_e32 v40, 0x3fb8aa3b, v61
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v61, v143
-  ; GCN-NEXT:    ds_read_b128 v[44:47], v57 offset:576
+  ; GCN-NEXT:    v_exp_f32_e32 v128, v40
+  ; GCN-NEXT:    v_mul_f32_e32 v40, 0x3fb8aa3b, v129
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v129, v152
+  ; GCN-NEXT:    ds_read_b128 v[44:47], v140 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[62:63], v[32:33], v[80:95]
-  ; GCN-NEXT:    v_fma_f32 v62, s4, v17, -v134
-  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v150
-  ; GCN-NEXT:    v_exp_f32_e32 v63, v40
-  ; GCN-NEXT:    v_pack_b32_f16 v40, v60, v61
-  ; GCN-NEXT:    v_fma_f32 v150, s4, v18, -v134
-  ; GCN-NEXT:    v_fma_f32 v60, s4, v19, -v134
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v61, v142
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[130:131], v[32:33], v[80:95]
+  ; GCN-NEXT:    v_fma_f32 v130, s4, v17, -v134
+  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v155
+  ; GCN-NEXT:    v_exp_f32_e32 v131, v40
+  ; GCN-NEXT:    v_pack_b32_f16 v40, v59, v129
+  ; GCN-NEXT:    v_fma_f32 v155, s4, v18, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v59, v128
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[32:33], v[96:111]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v34, v156
-  ; GCN-NEXT:    v_exp_f32_e32 v158, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v129
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v34, v158
+  ; GCN-NEXT:    v_exp_f32_e32 v160, v17
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v57
   ; GCN-NEXT:    v_pack_b32_f16 v41, v34, v17
-  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v128
+  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v58
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[42:43], v[32:33], v[112:127]
-  ; GCN-NEXT:    v_exp_f32_e32 v128, v17
-  ; GCN-NEXT:    v_perm_b32 v42, v141, v131, s8
-  ; GCN-NEXT:    v_perm_b32 v43, v149, v145, s8
+  ; GCN-NEXT:    v_fma_f32 v58, s4, v19, -v134
+  ; GCN-NEXT:    v_exp_f32_e32 v129, v17
+  ; GCN-NEXT:    v_perm_b32 v42, v143, v63, s8
+  ; GCN-NEXT:    v_perm_b32 v43, v147, v145, s8
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[40:41], v[64:79]
   ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v16
-  ; GCN-NEXT:    ds_read_b128 v[16:19], v57 offset:1152
+  ; GCN-NEXT:    ds_read_b128 v[16:19], v140 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[32:35], v57 offset:1728
+  ; GCN-NEXT:    ds_read_b128 v[32:35], v140 offset:1728
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mul_f32_e32 v37, 0x3fb8aa3b, v62
-  ; GCN-NEXT:    v_exp_f32_e32 v167, v36
-  ; GCN-NEXT:    v_perm_b32 v36, v140, v130, s8
+  ; GCN-NEXT:    v_mul_f32_e32 v37, 0x3fb8aa3b, v130
+  ; GCN-NEXT:    v_exp_f32_e32 v163, v36
+  ; GCN-NEXT:    v_perm_b32 v36, v142, v62, s8
   ; GCN-NEXT:    v_fma_f32 v62, s4, v21, -v134
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[44:45], v[40:41], v[80:95]
   ; GCN-NEXT:    v_exp_f32_e32 v130, v37
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v45, v158
-  ; GCN-NEXT:    v_perm_b32 v21, v148, v144, s5
-  ; GCN-NEXT:    v_perm_b32 v37, v148, v144, s8
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v44, v63
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v45, v160
+  ; GCN-NEXT:    v_perm_b32 v21, v146, v144, s5
+  ; GCN-NEXT:    v_perm_b32 v37, v146, v144, s8
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v44, v131
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    ds_write_b64 v135, v[20:21]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[40:41], v[96:111]
-  ; GCN-NEXT:    v_perm_b32 v16, v141, v131, s5
-  ; GCN-NEXT:    v_fma_f32 v131, s4, v22, -v134
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v128
-  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v150
-  ; GCN-NEXT:    v_exp_f32_e32 v140, v17
-  ; GCN-NEXT:    v_perm_b32 v17, v149, v145, s5
+  ; GCN-NEXT:    v_perm_b32 v16, v143, v63, s5
+  ; GCN-NEXT:    v_fma_f32 v63, s4, v22, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v129
+  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v155
+  ; GCN-NEXT:    v_exp_f32_e32 v142, v17
+  ; GCN-NEXT:    v_perm_b32 v17, v147, v145, s5
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b64 v136, v[36:37]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127]
   ; GCN-NEXT:    v_pack_b32_f16 v33, v45, v22
-  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v60
+  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v58
   ; GCN-NEXT:    v_exp_f32_e32 v144, v22
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -836,22 +862,22 @@
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_add_u32_e32 v20, v132, v20
   ; GCN-NEXT:    v_add_u32_e32 v21, v132, v21
-  ; GCN-NEXT:    v_pack_b32_f16 v32, v61, v44
+  ; GCN-NEXT:    v_pack_b32_f16 v32, v59, v44
   ; GCN-NEXT:    buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx2 v[60:61], v21, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[58:59], v21, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v166
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v162
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79]
   ; GCN-NEXT:    v_exp_f32_e32 v132, v16
   ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v62
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v167
-  ; GCN-NEXT:    v_fma_f32 v141, s4, v23, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v163
+  ; GCN-NEXT:    v_fma_f32 v143, s4, v23, -v134
   ; GCN-NEXT:    ds_read_b128 v[20:23], v139
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
@@ -860,20 +886,20 @@
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[46:47], v[32:33], v[80:95]
   ; GCN-NEXT:    v_exp_f32_e32 v62, v16
-  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v131
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v63
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v46, v130
   ; GCN-NEXT:    v_fma_f32 v47, s4, v25, -v134
-  ; GCN-NEXT:    v_fma_f32 v131, s4, v26, -v134
-  ; GCN-NEXT:    v_fma_f32 v149, s4, v4, -v134
+  ; GCN-NEXT:    v_fma_f32 v63, s4, v26, -v134
+  ; GCN-NEXT:    v_fma_f32 v147, s4, v4, -v134
   ; GCN-NEXT:    ; implicit-def: $sgpr0
   ; GCN-NEXT:    v_perm_b32 v4, v42, v40, s5
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[32:33], v[96:111]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v140
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v142
   ; GCN-NEXT:    v_exp_f32_e32 v145, v16
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v144
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[34:35], v[32:33], v[112:127]
   ; GCN-NEXT:    v_pack_b32_f16 v33, v18, v16
-  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v141
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v143
   ; GCN-NEXT:    v_pack_b32_f16 v32, v17, v46
   ; GCN-NEXT:    v_exp_f32_e32 v35, v16
   ; GCN-NEXT:    ds_read_b128 v[16:19], v139 offset:1152
@@ -895,11 +921,11 @@
   ; GCN-NEXT:    v_fma_f32 v37, s4, v29, -v134
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v34, v46
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[32:33], v[96:111]
-  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v131
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v63
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v145
-  ; GCN-NEXT:    v_exp_f32_e32 v141, v16
+  ; GCN-NEXT:    v_exp_f32_e32 v143, v16
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v35
-  ; GCN-NEXT:    v_fma_f32 v131, s4, v30, -v134
+  ; GCN-NEXT:    v_fma_f32 v63, s4, v30, -v134
   ; GCN-NEXT:    v_pack_b32_f16 v17, v17, v16
   ; GCN-NEXT:    v_pack_b32_f16 v16, v21, v36
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[24:25], v[32:33], v[112:127]
@@ -907,25 +933,25 @@
   ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v28
   ; GCN-NEXT:    v_fma_f32 v32, s4, v31, -v134
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79]
-  ; GCN-NEXT:    ds_read_b128 v[20:23], v57
+  ; GCN-NEXT:    ds_read_b128 v[20:23], v140
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_exp_f32_e32 v36, v24
   ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v37
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v37, v47
-  ; GCN-NEXT:    ds_read_b128 v[28:31], v57 offset:576
+  ; GCN-NEXT:    ds_read_b128 v[28:31], v140 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[38:39], v[16:17], v[80:95]
   ; GCN-NEXT:    v_fma_f32 v38, s4, v1, -v134
-  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v131
+  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v63
   ; GCN-NEXT:    v_exp_f32_e32 v39, v24
   ; GCN-NEXT:    v_pack_b32_f16 v24, v34, v37
-  ; GCN-NEXT:    v_fma_f32 v131, s4, v2, -v134
+  ; GCN-NEXT:    v_fma_f32 v63, s4, v2, -v134
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v37, v36
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[16:17], v[96:111]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v141
-  ; GCN-NEXT:    v_exp_f32_e32 v148, v1
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v143
+  ; GCN-NEXT:    v_exp_f32_e32 v146, v1
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v33
   ; GCN-NEXT:    v_pack_b32_f16 v25, v18, v1
   ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v32
@@ -933,25 +959,25 @@
   ; GCN-NEXT:    v_fma_f32 v32, s4, v3, -v134
   ; GCN-NEXT:    v_exp_f32_e32 v34, v1
   ; GCN-NEXT:    v_perm_b32 v26, v43, v41, s8
-  ; GCN-NEXT:    v_perm_b32 v27, v61, v45, s8
+  ; GCN-NEXT:    v_perm_b32 v27, v59, v45, s8
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[24:25], v[64:79]
   ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v0
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v57 offset:1152
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v140 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[16:19], v57 offset:1728
+  ; GCN-NEXT:    ds_read_b128 v[16:19], v140 offset:1728
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mul_f32_e32 v21, 0x3fb8aa3b, v38
-  ; GCN-NEXT:    v_exp_f32_e32 v150, v20
+  ; GCN-NEXT:    v_exp_f32_e32 v155, v20
   ; GCN-NEXT:    v_perm_b32 v20, v42, v40, s8
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v40, v148
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v40, v146
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[28:29], v[24:25], v[80:95]
   ; GCN-NEXT:    v_exp_f32_e32 v38, v21
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v39
   ; GCN-NEXT:    v_fma_f32 v29, s4, v5, -v134
-  ; GCN-NEXT:    v_perm_b32 v5, v60, v44, s5
-  ; GCN-NEXT:    v_perm_b32 v21, v60, v44, s8
+  ; GCN-NEXT:    v_perm_b32 v5, v58, v44, s5
+  ; GCN-NEXT:    v_perm_b32 v21, v58, v44, s8
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
@@ -961,9 +987,9 @@
   ; GCN-NEXT:    v_perm_b32 v0, v43, v41, s5
   ; GCN-NEXT:    v_fma_f32 v41, s4, v6, -v134
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v34
-  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v131
+  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v63
   ; GCN-NEXT:    v_exp_f32_e32 v42, v1
-  ; GCN-NEXT:    v_perm_b32 v1, v61, v45, s5
+  ; GCN-NEXT:    v_perm_b32 v1, v59, v45, s5
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b64 v136, v[20:21]
@@ -987,10 +1013,10 @@
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79]
-  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v149
+  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v147
   ; GCN-NEXT:    v_exp_f32_e32 v26, v0
   ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v29
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v150
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v155
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v27, v38
   ; GCN-NEXT:    ds_read_b128 v[20:23], v139 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1042,10 +1068,10 @@
   ; GCN-NEXT:    v_exp_f32_e32 v21, v9
   ; GCN-NEXT:    v_fma_f32 v8, s4, v15, -v134
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79]
-  ; GCN-NEXT:    ds_read_b128 v[4:7], v57
+  ; GCN-NEXT:    ds_read_b128 v[4:7], v140
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[12:15], v57 offset:576
+  ; GCN-NEXT:    ds_read_b128 v[12:15], v140 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v24
@@ -1071,33 +1097,33 @@
   ; GCN-NEXT:    v_add_f32_e32 v3, v54, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v55, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v56, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v58, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v163, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v164, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v59, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v160, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v162, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v151, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v153, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v141, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v165, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v161, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v159, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v152, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v167, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v153, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v168, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v170, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v151, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v148, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v154, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v155, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v157, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v146, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v147, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v143, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v161, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v149, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v150, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v156, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v129, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v142, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v63, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v159, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v60, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v61, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v152, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v158, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v57, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v128, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v167, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v131, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v160, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v129, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v163, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v130, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v140, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v142, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v144, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v132, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v62, v3
@@ -1105,14 +1131,14 @@
   ; GCN-NEXT:    v_add_f32_e32 v3, v35, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v46, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v47, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v141, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v143, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v33, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v36, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v39, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v148, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v146, v3
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95]
   ; GCN-NEXT:    v_add_f32_e32 v3, v34, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v150, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v155, v3
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v10
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v2
   ; GCN-NEXT:    v_add_f32_e32 v3, v38, v3
@@ -1137,17 +1163,18 @@
   ; GCN-NEXT:    v_add_f32_e32 v4, v10, v0
   ; GCN-NEXT:    ds_bpermute_b32 v5, v133, v4
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v57 offset:1152
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v140 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_add_f32_e32 v2, v4, v5
   ; GCN-NEXT:    ds_bpermute_b32 v3, v133, v2
+  ; GCN-NEXT:    ; implicit-def: $vgpr4
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[8:9], v[96:111]
+  ; GCN-NEXT:    v_mov_b32_e32 v0, v4
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_cndmask_b32_e64 v0, v3, v2, s[6:7]
-  ; GCN-NEXT:    ; implicit-def: $vgpr4
-  ; GCN-NEXT:    v_fmac_f32_e32 v0, v4, v48
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v57 offset:1728
+  ; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, v2, s[6:7]
+  ; GCN-NEXT:    v_fmac_f32_e32 v1, v0, v48
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v140 offset:1728
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
index 0887fdf0844b0..be97a1e82fcf2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
@@ -10,25 +10,24 @@
   ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
   ; GCN-NEXT:    v_readfirstlane_b32 s20, v2
   ; GCN-NEXT:    ; implicit-def: $sgpr4
-  ; GCN-NEXT:    ; implicit-def: $vgpr3
+  ; GCN-NEXT:    ; implicit-def: $vgpr64
   ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
   ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:    ; implicit-def: $vgpr50
+  ; GCN-NEXT:    ; implicit-def: $vgpr76
   ; GCN-NEXT:    ; implicit-def: $sgpr16_sgpr17_sgpr18_sgpr19
   ; GCN-NEXT:    ; implicit-def: $vgpr49
   ; GCN-NEXT:    ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43
-  ; GCN-NEXT:    ; implicit-def: $vgpr51
-  ; GCN-NEXT:    ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65
-  ; GCN-NEXT:    ; implicit-def: $vgpr76
+  ; GCN-NEXT:    ; implicit-def: $vgpr50
   ; GCN-NEXT:    ; implicit-def: $vgpr77
   ; GCN-NEXT:    ; implicit-def: $vgpr78
   ; GCN-NEXT:    ; implicit-def: $vgpr79
   ; GCN-NEXT:    ; implicit-def: $vgpr80
-  ; GCN-NEXT:    ; implicit-def: $vgpr91
+  ; GCN-NEXT:    ; implicit-def: $vgpr81
+  ; GCN-NEXT:    ; implicit-def: $vgpr103
   ; GCN-NEXT:    ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19
   ; GCN-NEXT:    ; iglp_opt mask(0x00000002)
   ; GCN-NEXT:    s_nop 1
-  ; GCN-NEXT:    v_lshl_add_u32 v2, s20, 4, v3
+  ; GCN-NEXT:    v_lshl_add_u32 v2, s20, 4, v64
   ; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s4, v2, v[0:1]
   ; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -36,8 +35,9 @@
   ; GCN-NEXT:    s_lshl_b32 s4, s20, 7
   ; GCN-NEXT:    ; implicit-def: $vgpr5
   ; GCN-NEXT:    v_add_lshl_u32 v48, v5, s4, 1
-  ; GCN-NEXT:    v_add_u32_e32 v76, s20, v76
-  ; GCN-NEXT:    v_and_b32_e32 v76, 0x1fffffff, v76
+  ; GCN-NEXT:    ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65
+  ; GCN-NEXT:    v_add_u32_e32 v77, s20, v77
+  ; GCN-NEXT:    v_and_b32_e32 v77, 0x1fffffff, v77
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    ds_write_b128 v48, v[0:3]
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -48,8 +48,8 @@
   ; GCN-NEXT:    ; implicit-def: $vgpr1
   ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
   ; GCN-NEXT:    ; implicit-def: $sgpr6
-  ; GCN-NEXT:    v_add_u32_e32 v0, v0, v50
-  ; GCN-NEXT:    v_add_u32_e32 v1, v1, v50
+  ; GCN-NEXT:    v_add_u32_e32 v0, v0, v76
+  ; GCN-NEXT:    v_add_u32_e32 v1, v1, v76
   ; GCN-NEXT:    buffer_load_dwordx2 v[72:73], v0, s[16:19], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
@@ -68,22 +68,22 @@
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], 0
   ; GCN-NEXT:    ; kill: killed $vgpr1
   ; GCN-NEXT:    ; kill: killed $vgpr0
-  ; GCN-NEXT:    v_mul_lo_u32 v76, v76, s6
-  ; GCN-NEXT:    v_add_lshl_u32 v76, v77, v76, 1
-  ; GCN-NEXT:    v_lshl_add_u32 v77, v78, 1, v76
-  ; GCN-NEXT:    ; implicit-def: $sgpr5
+  ; GCN-NEXT:    v_mul_lo_u32 v77, v77, s6
+  ; GCN-NEXT:    v_add_lshl_u32 v77, v78, v77, 1
   ; GCN-NEXT:    v_lshl_add_u32 v78, v79, 1, v77
+  ; GCN-NEXT:    ; implicit-def: $sgpr5
+  ; GCN-NEXT:    v_lshl_add_u32 v79, v80, 1, v78
   ; GCN-NEXT:    ; implicit-def: $sgpr2
   ; GCN-NEXT:    ; implicit-def: $sgpr3
-  ; GCN-NEXT:    v_lshl_add_u32 v79, v80, 1, v78
+  ; GCN-NEXT:    v_lshl_add_u32 v80, v81, 1, v79
   ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], 0
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31]
-  ; GCN-NEXT:    ds_read_b128 v[36:39], v51
+  ; GCN-NEXT:    ds_read_b128 v[36:39], v50
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15]
-  ; GCN-NEXT:    ds_read_b128 v[44:47], v51 offset:512
+  ; GCN-NEXT:    ds_read_b128 v[44:47], v50 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43
@@ -107,20 +107,20 @@
   ; GCN-NEXT:    ds_read_b128 v[40:43], v49 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[68:71], v51
+  ; GCN-NEXT:    ds_read_b128 v[68:71], v50
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[36:37], v[16:31]
   ; GCN-NEXT:    ; implicit-def: $vgpr32
   ; GCN-NEXT:    ; implicit-def: $vgpr33
-  ; GCN-NEXT:    v_add_u32_e32 v82, v32, v50
-  ; GCN-NEXT:    v_add_u32_e32 v83, v33, v50
-  ; GCN-NEXT:    ; kill: killed $vgpr82
+  ; GCN-NEXT:    v_add_u32_e32 v83, v32, v76
+  ; GCN-NEXT:    v_add_u32_e32 v76, v33, v76
   ; GCN-NEXT:    ; kill: killed $vgpr83
+  ; GCN-NEXT:    ; kill: killed $vgpr76
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[38:39], v[16:31]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[40:41], v[36:37], v[0:15]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[68:69], v[62:63], v[16:31]
-  ; GCN-NEXT:    ds_read_b128 v[66:69], v51 offset:512
+  ; GCN-NEXT:    ds_read_b128 v[66:69], v50 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
@@ -131,20 +131,20 @@
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[62:63], v[0:15]
   ; GCN-NEXT:    ; implicit-def: $vgpr66
   ; GCN-NEXT:    ; implicit-def: $vgpr67
-  ; GCN-NEXT:    v_max_f32_e32 v81, v67, v67
+  ; GCN-NEXT:    v_max_f32_e32 v82, v67, v67
   ; GCN-NEXT:    ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[70:71], v[64:65], v[16:31]
   ; GCN-NEXT:    v_perm_b32 v70, v74, v72, s2
   ; GCN-NEXT:    v_perm_b32 v71, v74, v72, s3
   ; GCN-NEXT:    v_perm_b32 v72, v75, v73, s2
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
-  ; GCN-NEXT:    ds_write_b32 v76, v70
+  ; GCN-NEXT:    ds_write_b32 v77, v70
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v77, v71
+  ; GCN-NEXT:    ds_write_b32 v78, v71
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v78, v72
+  ; GCN-NEXT:    ds_write_b32 v79, v72
   ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v20
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15]
   ; GCN-NEXT:    v_mul_f32_e32 v64, s4, v16
@@ -152,11 +152,11 @@
   ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v18
   ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v19
   ; GCN-NEXT:    v_max3_f32 v64, v64, s5, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v21
+  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v21
   ; GCN-NEXT:    v_max3_f32 v64, v64, v68, v69
   ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v22
   ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v23
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v80
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v81
   ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v24
   ; GCN-NEXT:    v_mul_f32_e32 v87, s4, v25
   ; GCN-NEXT:    v_max3_f32 v64, v64, v84, v85
@@ -166,12 +166,12 @@
   ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v28
   ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v29
   ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v68
-  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v30
+  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v30
   ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v31
   ; GCN-NEXT:    v_max3_f32 v64, v64, v69, v74
   ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v0
   ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v1
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v80, v84
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v81, v84
   ; GCN-NEXT:    v_mul_f32_e32 v87, s4, v2
   ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v3
   ; GCN-NEXT:    v_max3_f32 v64, v64, v85, v86
@@ -179,315 +179,315 @@
   ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v5
   ; GCN-NEXT:    v_max3_f32 v64, v64, v87, v65
   ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v6
-  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v7
+  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v7
   ; GCN-NEXT:    v_max3_f32 v64, v64, v68, v69
   ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v8
   ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v9
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v80
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v81
   ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v10
   ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v11
   ; GCN-NEXT:    v_max3_f32 v64, v64, v84, v85
   ; GCN-NEXT:    v_mul_f32_e32 v87, s4, v12
   ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v13
   ; GCN-NEXT:    v_max3_f32 v64, v64, v86, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v14
-  ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v15
   ; GCN-NEXT:    v_max3_f32 v64, v64, v87, v68
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v69, v74
-  ; GCN-NEXT:    ds_bpermute_b32 v65, v66, v64
   ; GCN-NEXT:    v_perm_b32 v68, v75, v73, s3
+  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v14
+  ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v15
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v79, v68
-  ; GCN-NEXT:    ; implicit-def: $vgpr84
-  ; GCN-NEXT:    v_max_f32_e32 v65, v65, v65
-  ; GCN-NEXT:    v_max_f32_e32 v70, v64, v65
+  ; GCN-NEXT:    ds_write_b32 v80, v68
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v69, v74
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_load_dwordx2 v[64:65], v82, s[16:19], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[70:71], v76, s[16:19], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_bpermute_b32 v71, v66, v70
+  ; GCN-NEXT:    ds_bpermute_b32 v65, v66, v64
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    ; implicit-def: $vgpr87
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_cndmask_b32_e64 v70, v71, v70, s[0:1]
-  ; GCN-NEXT:    v_max_f32_e32 v70, v70, v70
-  ; GCN-NEXT:    v_max_f32_e32 v72, v81, v70
-  ; GCN-NEXT:    v_fma_f32 v16, s4, v16, -v72
-  ; GCN-NEXT:    v_fma_f32 v18, s4, v18, -v72
-  ; GCN-NEXT:    v_fma_f32 v19, s4, v19, -v72
+  ; GCN-NEXT:    v_max_f32_e32 v65, v65, v65
+  ; GCN-NEXT:    v_max_f32_e32 v64, v64, v65
+  ; GCN-NEXT:    ds_bpermute_b32 v65, v66, v64
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    v_cndmask_b32_e64 v64, v65, v64, s[0:1]
+  ; GCN-NEXT:    v_max_f32_e32 v64, v64, v64
+  ; GCN-NEXT:    v_max_f32_e32 v65, v82, v64
+  ; GCN-NEXT:    v_fma_f32 v16, s4, v16, -v65
+  ; GCN-NEXT:    v_fma_f32 v17, s4, v17, -v65
+  ; GCN-NEXT:    v_fma_f32 v18, s4, v18, -v65
+  ; GCN-NEXT:    v_fma_f32 v19, s4, v19, -v65
   ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v17
   ; GCN-NEXT:    v_mul_f32_e32 v18, 0x3fb8aa3b, v18
   ; GCN-NEXT:    v_mul_f32_e32 v19, 0x3fb8aa3b, v19
-  ; GCN-NEXT:    v_fma_f32 v17, s4, v17, -v72
-  ; GCN-NEXT:    v_fma_f32 v20, s4, v20, -v72
-  ; GCN-NEXT:    v_fma_f32 v21, s4, v21, -v72
-  ; GCN-NEXT:    v_fma_f32 v22, s4, v22, -v72
-  ; GCN-NEXT:    v_fma_f32 v23, s4, v23, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v73, v16
-  ; GCN-NEXT:    v_exp_f32_e32 v74, v18
-  ; GCN-NEXT:    v_exp_f32_e32 v75, v19
+  ; GCN-NEXT:    v_fma_f32 v20, s4, v20, -v65
+  ; GCN-NEXT:    v_fma_f32 v21, s4, v21, -v65
+  ; GCN-NEXT:    v_fma_f32 v22, s4, v22, -v65
+  ; GCN-NEXT:    v_fma_f32 v23, s4, v23, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v72, v16
+  ; GCN-NEXT:    v_exp_f32_e32 v73, v17
+  ; GCN-NEXT:    v_exp_f32_e32 v81, v18
+  ; GCN-NEXT:    v_exp_f32_e32 v82, v19
   ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v20
   ; GCN-NEXT:    v_mul_f32_e32 v21, 0x3fb8aa3b, v21
   ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v22
-  ; GCN-NEXT:    v_exp_f32_e32 v80, v20
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v73
-  ; GCN-NEXT:    v_fma_f32 v18, s4, v24, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v81, v21
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v74
-  ; GCN-NEXT:    v_fma_f32 v20, s4, v25, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v82, v22
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v75
-  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v17
-  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
-  ; GCN-NEXT:    v_fma_f32 v26, s4, v26, -v72
-  ; GCN-NEXT:    v_pack_b32_f16 v71, v21, v22
-  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v18
-  ; GCN-NEXT:    v_sub_f32_e32 v24, v67, v72
-  ; GCN-NEXT:    v_exp_f32_e32 v83, v23
-  ; GCN-NEXT:    v_fma_f32 v67, s4, v27, -v72
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v72
+  ; GCN-NEXT:    v_fma_f32 v17, s4, v24, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v83, v20
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v73
+  ; GCN-NEXT:    v_fma_f32 v19, s4, v25, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v84, v21
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v81
+  ; GCN-NEXT:    v_fma_f32 v26, s4, v26, -v65
   ; GCN-NEXT:    v_exp_f32_e32 v85, v22
-  ; GCN-NEXT:    v_exp_f32_e32 v17, v17
-  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v24
-  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v20
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v17
-  ; GCN-NEXT:    v_fma_f32 v87, s4, v29, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v88, v23
-  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v72
-  ; GCN-NEXT:    v_pack_b32_f16 v70, v16, v19
-  ; GCN-NEXT:    ds_read_b128 v[18:21], v84
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v82
+  ; GCN-NEXT:    v_pack_b32_f16 v24, v16, v18
+  ; GCN-NEXT:    v_sub_f32_e32 v22, v67, v65
+  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
+  ; GCN-NEXT:    v_pack_b32_f16 v25, v20, v21
+  ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v17
+  ; GCN-NEXT:    v_mul_f32_e32 v21, 0x3fb8aa3b, v19
+  ; GCN-NEXT:    ds_read_b128 v[16:19], v87
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_exp_f32_e32 v16, v24
-  ; GCN-NEXT:    ds_read_b128 v[22:25], v84 offset:576
+  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v22
+  ; GCN-NEXT:    v_fma_f32 v67, s4, v27, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v86, v23
+  ; GCN-NEXT:    v_exp_f32_e32 v64, v22
+  ; GCN-NEXT:    v_mul_f32_e32 v67, 0x3fb8aa3b, v67
+  ; GCN-NEXT:    v_pk_mul_f32 v[48:49], v[48:49], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[50:51], v[50:51], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[52:53], v[52:53], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[54:55], v[54:55], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[56:57], v[56:57], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[58:59], v[58:59], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[60:61], v[60:61], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[62:63], v[62:63], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[32:33], v[32:33], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[34:35], v[34:35], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[16:17], v[24:25], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v16, 0, v72
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v76, v83
+  ; GCN-NEXT:    v_fma_f32 v88, s4, v28, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v89, v20
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v90, v84
+  ; GCN-NEXT:    v_fma_f32 v91, s4, v29, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v92, v21
+  ; GCN-NEXT:    ds_read_b128 v[20:23], v87 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_pk_mul_f32 v[48:49], v[48:49], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[50:51], v[50:51], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[52:53], v[52:53], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[54:55], v[54:55], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[56:57], v[56:57], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[58:59], v[58:59], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[60:61], v[60:61], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[62:63], v[62:63], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[32:33], v[32:33], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[34:35], v[34:35], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[36:37], v[36:37], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[38:39], v[38:39], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[40:41], v[40:41], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[42:43], v[42:43], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[44:45], v[44:45], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[46:47], v[46:47], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v18, 0, v73
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v89, v83
-  ; GCN-NEXT:    v_fma_f32 v73, s4, v28, -v72
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v80
-  ; GCN-NEXT:    v_fma_f32 v1, s4, v1, -v72
-  ; GCN-NEXT:    v_perm_b32 v90, v69, v65, s2
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v17, v18
-  ; GCN-NEXT:    v_mul_f32_e32 v18, 0x3fb8aa3b, v26
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v86, v81
-  ; GCN-NEXT:    v_fma_f32 v23, s4, v30, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v30, v18
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v82
-  ; GCN-NEXT:    v_fma_f32 v18, s4, v31, -v72
-  ; GCN-NEXT:    v_perm_b32 v31, v68, v64, s2
-  ; GCN-NEXT:    v_perm_b32 v64, v68, v64, s3
-  ; GCN-NEXT:    v_perm_b32 v65, v69, v65, s3
-  ; GCN-NEXT:    ds_read_b128 v[26:29], v91
+  ; GCN-NEXT:    v_pk_mul_f32 v[36:37], v[36:37], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[38:39], v[38:39], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[40:41], v[40:41], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[42:43], v[42:43], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[44:45], v[44:45], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[46:47], v[46:47], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_perm_b32 v99, v70, v68, s2
+  ; GCN-NEXT:    v_perm_b32 v100, v70, v68, s3
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[24:25], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v93, v73, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v26
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v94, v85
+  ; GCN-NEXT:    v_fma_f32 v95, s4, v30, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v96, v16
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v97, v86
+  ; GCN-NEXT:    v_fma_f32 v98, s4, v31, -v65
+  ; GCN-NEXT:    v_perm_b32 v101, v71, v69, s2
+  ; GCN-NEXT:    v_perm_b32 v102, v71, v69, s3
+  ; GCN-NEXT:    ds_read_b128 v[68:71], v103
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[68:71], v91 offset:576
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v103 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
-  ; GCN-NEXT:    ds_write_b32 v76, v31
-  ; GCN-NEXT:    v_mul_f32_e32 v31, 0x3fb8aa3b, v67
-  ; GCN-NEXT:    v_exp_f32_e32 v31, v31
-  ; GCN-NEXT:    v_mul_f32_e32 v67, 0x3fb8aa3b, v18
-  ; GCN-NEXT:    v_pack_b32_f16 v18, v19, v86
-  ; GCN-NEXT:    v_pack_b32_f16 v19, v22, v89
+  ; GCN-NEXT:    ds_write_b32 v77, v99
+  ; GCN-NEXT:    v_exp_f32_e32 v67, v67
+  ; GCN-NEXT:    v_pack_b32_f16 v76, v76, v90
+  ; GCN-NEXT:    v_pack_b32_f16 v77, v94, v97
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v77, v64
+  ; GCN-NEXT:    ds_write_b32 v78, v100
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v78, v90
+  ; GCN-NEXT:    ds_write_b32 v79, v101
+  ; GCN-NEXT:    v_mul_f32_e32 v78, 0x3fb8aa3b, v88
+  ; GCN-NEXT:    v_mul_f32_e32 v79, 0x3fb8aa3b, v91
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[76:77], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v81, v81, v93
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v90, v89
+  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v91, v78
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v78, v92
+  ; GCN-NEXT:    v_fma_f32 v1, s4, v1, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v93, v79
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[76:77], v[32:47]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v79, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v64, 0x3fb8aa3b, v73
-  ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v87
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v74, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v85
-  ; GCN-NEXT:    v_fma_f32 v2, s4, v2, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v22, v64
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v88
-  ; GCN-NEXT:    v_exp_f32_e32 v64, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v75, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v30
-  ; GCN-NEXT:    v_fma_f32 v24, s4, v3, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v23, v23
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v31
+  ; GCN-NEXT:    ds_write_b32 v80, v102
+  ; GCN-NEXT:    v_mul_f32_e32 v80, 0x3fb8aa3b, v95
+  ; GCN-NEXT:    v_add_f32_e32 v76, v82, v81
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v77, v96
+  ; GCN-NEXT:    v_fma_f32 v2, s4, v2, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v80, v80
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v79, v67
+  ; GCN-NEXT:    v_mul_f32_e32 v88, 0x3fb8aa3b, v98
+  ; GCN-NEXT:    v_fma_f32 v81, s4, v3, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v82, v88
   ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v0
-  ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v1
-  ; GCN-NEXT:    v_pack_b32_f16 v0, v20, v21
-  ; GCN-NEXT:    v_pack_b32_f16 v1, v18, v19
-  ; GCN-NEXT:    v_fma_f32 v6, s4, v6, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v25, v67
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[26:27], v[0:1], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v80, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v22
-  ; GCN-NEXT:    v_fma_f32 v26, s4, v4, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v27, v3
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v64
-  ; GCN-NEXT:    v_fma_f32 v67, s4, v5, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v65, v65
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[0:1], v[32:47]
+  ; GCN-NEXT:    v_mul_f32_e32 v88, 0x3fb8aa3b, v1
+  ; GCN-NEXT:    v_pack_b32_f16 v0, v90, v78
+  ; GCN-NEXT:    v_pack_b32_f16 v1, v77, v79
   ; GCN-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-  ; GCN-NEXT:    v_add_f32_e32 v17, v81, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v23
-  ; GCN-NEXT:    v_fma_f32 v7, s4, v7, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v68, v2
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v25
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v24
+  ; GCN-NEXT:    ; implicit-def: $sgpr2
+  ; GCN-NEXT:    s_nop 0
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[68:69], v[0:1], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v68, v83, v76
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v91
+  ; GCN-NEXT:    v_fma_f32 v83, s4, v4, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v90, v3
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v93
+  ; GCN-NEXT:    v_fma_f32 v94, s4, v5, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v88, v88
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[0:1], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v68, v84, v68
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v80
+  ; GCN-NEXT:    v_fma_f32 v6, s4, v6, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v72, v2
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v73, v82
+  ; GCN-NEXT:    v_pack_b32_f16 v4, v69, v4
+  ; GCN-NEXT:    v_mul_f32_e32 v69, 0x3fb8aa3b, v81
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v84
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v87
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_pack_b32_f16 v4, v18, v4
-  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v19
-  ; GCN-NEXT:    v_exp_f32_e32 v24, v24
-  ; GCN-NEXT:    ds_read_b128 v[18:21], v84 offset:576
+  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v73
+  ; GCN-NEXT:    v_fma_f32 v7, s4, v7, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v73, v69
+  ; GCN-NEXT:    ds_read_b128 v[76:79], v87 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mul_f32_e32 v26, 0x3fb8aa3b, v26
-  ; GCN-NEXT:    v_mul_f32_e32 v67, 0x3fb8aa3b, v67
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[4:5], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v82, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v27
-  ; GCN-NEXT:    v_exp_f32_e32 v26, v26
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v29, v65
-  ; GCN-NEXT:    v_fma_f32 v10, s4, v10, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v67, v67
+  ; GCN-NEXT:    v_mul_f32_e32 v69, 0x3fb8aa3b, v83
+  ; GCN-NEXT:    v_mul_f32_e32 v81, 0x3fb8aa3b, v94
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[70:71], v[4:5], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v68, v85, v68
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v70, v90
+  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v71, v69
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v88
+  ; GCN-NEXT:    v_fma_f32 v9, s4, v9, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v81, v81
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[4:5], v[32:47]
   ; GCN-NEXT:    v_mul_f32_e32 v6, 0x3fb8aa3b, v6
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[4:5], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v83, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v68
-  ; GCN-NEXT:    v_exp_f32_e32 v6, v6
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v24
+  ; GCN-NEXT:    v_add_f32_e32 v68, v86, v68
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v72
+  ; GCN-NEXT:    v_fma_f32 v10, s4, v10, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v74, v6
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v73
   ; GCN-NEXT:    v_mul_f32_e32 v7, 0x3fb8aa3b, v7
-  ; GCN-NEXT:    v_exp_f32_e32 v7, v7
-  ; GCN-NEXT:    v_pack_b32_f16 v4, v28, v29
-  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v69
-  ; GCN-NEXT:    ; implicit-def: $sgpr2
-  ; GCN-NEXT:    s_nop 1
+  ; GCN-NEXT:    v_fma_f32 v75, s4, v11, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v83, v7
+  ; GCN-NEXT:    v_pack_b32_f16 v4, v70, v69
+  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v6
+  ; GCN-NEXT:    v_mul_f32_e32 v7, 0x3fb8aa3b, v8
+  ; GCN-NEXT:    v_mul_f32_e32 v8, 0x3fb8aa3b, v9
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[4:5], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v0, v85, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v26
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v67
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[18:19], v[4:5], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v4, v88, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v89, v68
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v68, v71
+  ; GCN-NEXT:    v_fma_f32 v70, s4, v12, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v84, v7
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v85, v81
+  ; GCN-NEXT:    v_fma_f32 v86, s4, v13, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v87, v8
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[76:77], v[4:5], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v76, v92, v0
   ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v10
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v6
-  ; GCN-NEXT:    v_exp_f32_e32 v10, v0
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v7
-  ; GCN-NEXT:    v_pack_b32_f16 v1, v1, v0
-  ; GCN-NEXT:    v_pack_b32_f16 v0, v17, v28
-  ; GCN-NEXT:    s_nop 1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v2, v30, v4
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[0:1], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v0, v31, v2
-  ; GCN-NEXT:    v_add_f32_e32 v0, v22, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v64, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v23, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v25, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v27, v0
-  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v72
-  ; GCN-NEXT:    v_add_f32_e32 v0, v65, v0
-  ; GCN-NEXT:    v_fma_f32 v9, s4, v9, -v72
-  ; GCN-NEXT:    v_mul_f32_e32 v8, 0x3fb8aa3b, v8
-  ; GCN-NEXT:    v_add_f32_e32 v0, v68, v0
-  ; GCN-NEXT:    v_fma_f32 v11, s4, v11, -v72
-  ; GCN-NEXT:    v_mul_f32_e32 v9, 0x3fb8aa3b, v9
-  ; GCN-NEXT:    v_fma_f32 v12, s4, v12, -v72
-  ; GCN-NEXT:    v_fma_f32 v13, s4, v13, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v8, v8
-  ; GCN-NEXT:    v_add_f32_e32 v0, v24, v0
-  ; GCN-NEXT:    v_fma_f32 v5, s4, v14, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v9, v9
-  ; GCN-NEXT:    v_add_f32_e32 v0, v26, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v67, v0
-  ; GCN-NEXT:    v_fma_f32 v14, s4, v15, -v72
-  ; GCN-NEXT:    v_mul_f32_e32 v11, 0x3fb8aa3b, v11
-  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v12
-  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v5
-  ; GCN-NEXT:    v_add_f32_e32 v0, v6, v0
-  ; GCN-NEXT:    v_exp_f32_e32 v11, v11
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v8
-  ; GCN-NEXT:    v_exp_f32_e32 v12, v3
-  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v13
-  ; GCN-NEXT:    v_exp_f32_e32 v17, v1
-  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v14
-  ; GCN-NEXT:    v_add_f32_e32 v0, v7, v0
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v9
-  ; GCN-NEXT:    v_exp_f32_e32 v15, v3
-  ; GCN-NEXT:    v_exp_f32_e32 v18, v1
-  ; GCN-NEXT:    v_add_f32_e32 v6, v8, v0
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v91
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v74
+  ; GCN-NEXT:    v_fma_f32 v77, s4, v14, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v89, v0
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v92, v83
+  ; GCN-NEXT:    v_pack_b32_f16 v68, v68, v85
+  ; GCN-NEXT:    v_mul_f32_e32 v75, 0x3fb8aa3b, v75
+  ; GCN-NEXT:    v_mul_f32_e32 v70, 0x3fb8aa3b, v70
+  ; GCN-NEXT:    v_pack_b32_f16 v69, v69, v92
+  ; GCN-NEXT:    v_fma_f32 v65, s4, v15, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v75, v75
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[68:69], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v76, v96, v76
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v85, v84
+  ; GCN-NEXT:    v_exp_f32_e32 v92, v70
+  ; GCN-NEXT:    v_mul_f32_e32 v70, 0x3fb8aa3b, v86
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v86, v87
+  ; GCN-NEXT:    v_exp_f32_e32 v94, v70
+  ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v65
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[78:79], v[68:69], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v67, v67, v76
+  ; GCN-NEXT:    v_add_f32_e32 v67, v91, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v93, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v80, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v82, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v90, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v88, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v72, v67
+  ; GCN-NEXT:    v_mul_f32_e32 v68, 0x3fb8aa3b, v77
+  ; GCN-NEXT:    v_add_f32_e32 v67, v73, v67
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v76, v89
+  ; GCN-NEXT:    v_exp_f32_e32 v78, v68
+  ; GCN-NEXT:    v_add_f32_e32 v67, v71, v67
+  ; GCN-NEXT:    ds_read_b128 v[68:71], v103
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v10
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v11
-  ; GCN-NEXT:    v_add_f32_e32 v6, v9, v6
-  ; GCN-NEXT:    v_pack_b32_f16 v8, v4, v13
-  ; GCN-NEXT:    v_add_f32_e32 v6, v10, v6
-  ; GCN-NEXT:    v_pack_b32_f16 v9, v5, v14
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v18
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v15
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v12
-  ; GCN-NEXT:    v_add_f32_e32 v6, v11, v6
-  ; GCN-NEXT:    v_add_f32_e32 v6, v12, v6
-  ; GCN-NEXT:    v_add_f32_e32 v1, v15, v6
-  ; GCN-NEXT:    v_add_f32_e32 v11, v17, v1
-  ; GCN-NEXT:    v_pack_b32_f16 v1, v0, v7
-  ; GCN-NEXT:    v_pack_b32_f16 v0, v4, v10
-  ; GCN-NEXT:    ds_read_b128 v[4:7], v91 offset:576
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v77, v75
+  ; GCN-NEXT:    v_exp_f32_e32 v65, v65
+  ; GCN-NEXT:    v_add_f32_e32 v67, v81, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v74, v67
+  ; GCN-NEXT:    v_pack_b32_f16 v77, v76, v77
+  ; GCN-NEXT:    v_pack_b32_f16 v76, v85, v86
+  ; GCN-NEXT:    v_add_f32_e32 v67, v83, v67
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v72, v65
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v73, v94
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[68:69], v[76:77], v[48:63]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v68, v78
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v74, v92
+  ; GCN-NEXT:    v_add_f32_e32 v67, v84, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v87, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v89, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v75, v67
+  ; GCN-NEXT:    v_pack_b32_f16 v69, v68, v72
+  ; GCN-NEXT:    v_pack_b32_f16 v68, v74, v73
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v103 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v67, v92, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v94, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v78, v67
+  ; GCN-NEXT:    v_add_f32_e32 v65, v65, v67
+  ; GCN-NEXT:    ds_bpermute_b32 v67, v66, v65
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    v_add_f32_e32 v65, v65, v67
+  ; GCN-NEXT:    ds_bpermute_b32 v66, v66, v65
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_mov_b32_e32 v4, 0
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v2, v18, v11
-  ; GCN-NEXT:    ds_bpermute_b32 v3, v66, v2
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_add_f32_e32 v2, v2, v3
-  ; GCN-NEXT:    ds_bpermute_b32 v3, v66, v2
+  ; GCN-NEXT:    v_mov_b32_e32 v67, 0
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[0:1]
-  ; GCN-NEXT:    v_fmac_f32_e32 v2, v4, v16
+  ; GCN-NEXT:    v_cndmask_b32_e64 v65, v66, v65, s[0:1]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[70:71], v[68:69], v[48:63]
+  ; GCN-NEXT:    v_fmac_f32_e32 v65, v67, v64
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[68:69], v[32:47]
   ; GCN-NEXT:    s_endpgm
   attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index 5ab8706f28f5f..c48f3ee00130a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -427,37 +427,37 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0
 ; GFX90A-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k:
 ; GFX90A-VGPR:       ; %bb.0: ; %bb
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, 0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, 1
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v5
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 2
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 1
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v9
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v8, 2
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    v_mfma_f32_4x4x4bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT:    v_mfma_f32_4x4x4bf16_1k v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX90A-VGPR-NEXT:    s_nop 4
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v5
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 2
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_bf16 v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 4
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -647,10 +647,10 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
 ; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k:
 ; GFX90A-VGPR:       ; %bb.0: ; %bb
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, 0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, 1
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v5
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 2
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 1
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v9
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v8, 2
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
@@ -665,19 +665,19 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
 ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v5
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 2
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_bf16 v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 6
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -1298,26 +1298,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v1, 64
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[16:17], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v1
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v1
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v1
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[18:19], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[12:13], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3
 ; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits:
@@ -1326,26 +1326,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 64
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[16:17], s[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v1
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], s[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[18:19], s[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0]
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (i64 274877906944 to double)), i32 0, i32 0, i32 0)
@@ -1627,26 +1627,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, 0x3ff00000
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v12, s2
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v13, s3
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v18, s2
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v19, s3
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v1, v0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v1, v0
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[16:17], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[12:13], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9]
 ; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_imm:
@@ -1655,26 +1655,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, 0x3ff00000
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v12, s2
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v13, s3
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v18, s2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v19, s3
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], s[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[16:17], s[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0)
@@ -1741,26 +1741,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v1, 0x405ec000
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v12, s2
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v13, s3
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v18, s2
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v19, s3
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v1
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v1
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v1
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[16:17], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[12:13], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9]
 ; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
@@ -1769,26 +1769,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 0x405ec000
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v12, s2
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v13, s3
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v18, s2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v19, s3
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v1
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], s[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[16:17], s[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 123.0, double 123.0, double 123.0, double 123.0>, i32 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
index 033a35f69a0bd..e11050ccce746 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
@@ -269,28 +269,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg
 ; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31]
 ; GCN-NEXT:    v_mov_b32_e32 v42, s22
 ; GCN-NEXT:    v_mov_b32_e32 v43, s23
+; GCN-NEXT:    v_mov_b32_e32 v32, s16
+; GCN-NEXT:    v_mov_b32_e32 v33, s17
+; GCN-NEXT:    v_mov_b32_e32 v34, s18
+; GCN-NEXT:    v_mov_b32_e32 v35, s19
 ; GCN-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_nop 2
-; GCN-NEXT:    v_mov_b32_e32 v16, s16
-; GCN-NEXT:    v_mov_b32_e32 v17, s17
-; GCN-NEXT:    v_mov_b32_e32 v18, s18
-; GCN-NEXT:    v_mov_b32_e32 v19, s19
-; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mov_b32_e32 v16, s12
-; GCN-NEXT:    v_mov_b32_e32 v17, s13
-; GCN-NEXT:    v_mov_b32_e32 v18, s14
-; GCN-NEXT:    v_mov_b32_e32 v19, s15
-; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT:    v_mov_b32_e32 v32, s12
+; GCN-NEXT:    v_mov_b32_e32 v33, s13
+; GCN-NEXT:    v_mov_b32_e32 v34, s14
+; GCN-NEXT:    v_mov_b32_e32 v35, s15
+; GCN-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mov_b32_e32 v16, s8
-; GCN-NEXT:    v_mov_b32_e32 v17, s9
-; GCN-NEXT:    v_mov_b32_e32 v18, s10
-; GCN-NEXT:    v_mov_b32_e32 v19, s11
-; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; GCN-NEXT:    v_mov_b32_e32 v32, s8
+; GCN-NEXT:    v_mov_b32_e32 v33, s9
+; GCN-NEXT:    v_mov_b32_e32 v34, s10
+; GCN-NEXT:    v_mov_b32_e32 v35, s11
+; GCN-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -332,28 +331,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa
 ; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
 ; GCN-NEXT:    v_mov_b32_e32 v42, s22
 ; GCN-NEXT:    v_mov_b32_e32 v43, s23
+; GCN-NEXT:    v_mov_b32_e32 v32, s16
+; GCN-NEXT:    v_mov_b32_e32 v33, s17
+; GCN-NEXT:    v_mov_b32_e32 v34, s18
+; GCN-NEXT:    v_mov_b32_e32 v35, s19
 ; GCN-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_nop 2
-; GCN-NEXT:    v_mov_b32_e32 v16, s16
-; GCN-NEXT:    v_mov_b32_e32 v17, s17
-; GCN-NEXT:    v_mov_b32_e32 v18, s18
-; GCN-NEXT:    v_mov_b32_e32 v19, s19
-; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mov_b32_e32 v16, s12
-; GCN-NEXT:    v_mov_b32_e32 v17, s13
-; GCN-NEXT:    v_mov_b32_e32 v18, s14
-; GCN-NEXT:    v_mov_b32_e32 v19, s15
-; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT:    v_mov_b32_e32 v32, s12
+; GCN-NEXT:    v_mov_b32_e32 v33, s13
+; GCN-NEXT:    v_mov_b32_e32 v34, s14
+; GCN-NEXT:    v_mov_b32_e32 v35, s15
+; GCN-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mov_b32_e32 v16, s8
-; GCN-NEXT:    v_mov_b32_e32 v17, s9
-; GCN-NEXT:    v_mov_b32_e32 v18, s10
-; GCN-NEXT:    v_mov_b32_e32 v19, s11
-; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; GCN-NEXT:    v_mov_b32_e32 v32, s8
+; GCN-NEXT:    v_mov_b32_e32 v33, s9
+; GCN-NEXT:    v_mov_b32_e32 v34, s10
+; GCN-NEXT:    v_mov_b32_e32 v35, s11
+; GCN-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 753206206180a..ebab4891d7da6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -1508,28 +1508,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
 ; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
 ; SDAG-NEXT:    v_mov_b32_e32 v42, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v43, s23
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
 ; SDAG-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    s_nop 2
-; SDAG-NEXT:    v_mov_b32_e32 v16, s16
-; SDAG-NEXT:    v_mov_b32_e32 v17, s17
-; SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s12
+; SDAG-NEXT:    v_mov_b32_e32 v33, s13
+; SDAG-NEXT:    v_mov_b32_e32 v34, s14
+; SDAG-NEXT:    v_mov_b32_e32 v35, s15
+; SDAG-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s8
+; SDAG-NEXT:    v_mov_b32_e32 v33, s9
+; SDAG-NEXT:    v_mov_b32_e32 v34, s10
+; SDAG-NEXT:    v_mov_b32_e32 v35, s11
+; SDAG-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -1611,28 +1610,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
 ; HEURRC-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
 ; HEURRC-NEXT:    v_mov_b32_e32 v42, s22
 ; HEURRC-NEXT:    v_mov_b32_e32 v43, s23
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s16
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s17
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s18
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s19
 ; HEURRC-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    s_nop 2
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s16
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s17
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s18
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s19
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s12
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s13
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s14
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s15
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s12
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s13
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s14
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s15
+; HEURRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s8
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s9
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s10
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s11
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s8
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s9
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s10
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s11
+; HEURRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
@@ -1668,28 +1666,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
 ; VGPRRC-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
 ; VGPRRC-NEXT:    v_mov_b32_e32 v42, s22
 ; VGPRRC-NEXT:    v_mov_b32_e32 v43, s23
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s16
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s17
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s18
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s19
 ; VGPRRC-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    s_nop 2
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s16
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s17
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s18
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s12
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s13
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s14
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s12
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s13
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s14
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s15
+; VGPRRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s8
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s9
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s10
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s8
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s9
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s10
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s11
+; VGPRRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
@@ -1850,28 +1847,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
 ; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
 ; SDAG-NEXT:    v_mov_b32_e32 v42, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v43, s23
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
 ; SDAG-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    s_nop 2
-; SDAG-NEXT:    v_mov_b32_e32 v16, s16
-; SDAG-NEXT:    v_mov_b32_e32 v17, s17
-; SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s12
+; SDAG-NEXT:    v_mov_b32_e32 v33, s13
+; SDAG-NEXT:    v_mov_b32_e32 v34, s14
+; SDAG-NEXT:    v_mov_b32_e32 v35, s15
+; SDAG-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s8
+; SDAG-NEXT:    v_mov_b32_e32 v33, s9
+; SDAG-NEXT:    v_mov_b32_e32 v34, s10
+; SDAG-NEXT:    v_mov_b32_e32 v35, s11
+; SDAG-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -1953,28 +1949,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
 ; HEURRC-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
 ; HEURRC-NEXT:    v_mov_b32_e32 v42, s22
 ; HEURRC-NEXT:    v_mov_b32_e32 v43, s23
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s16
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s17
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s18
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s19
 ; HEURRC-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    s_nop 2
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s16
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s17
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s18
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s19
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s12
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s13
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s14
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s15
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s12
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s13
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s14
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s15
+; HEURRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s8
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s9
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s10
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s11
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s8
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s9
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s10
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s11
+; HEURRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
@@ -2010,28 +2005,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
 ; VGPRRC-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
 ; VGPRRC-NEXT:    v_mov_b32_e32 v42, s22
 ; VGPRRC-NEXT:    v_mov_b32_e32 v43, s23
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s16
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s17
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s18
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s19
 ; VGPRRC-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    s_nop 2
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s16
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s17
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s18
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s12
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s13
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s14
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s12
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s13
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s14
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s15
+; VGPRRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s8
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s9
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s10
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s8
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s9
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s10
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s11
+; VGPRRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
@@ -3191,13 +3185,9 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], 16
-; VGPRRC-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[4:7], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], 0
-; VGPRRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s16
@@ -3218,14 +3208,14 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s9
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s10
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s12
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s13
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s14
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_endpgm
 ; AGPR-LABEL: test_mfma_i32_32x32x32_i8:
@@ -3603,13 +3593,9 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], 16
-; VGPRRC-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[4:7], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], 0
-; VGPRRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s16
@@ -3630,14 +3616,14 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s9
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s10
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s12
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s13
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s14
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_endpgm
 ; AGPR-LABEL: test_mfma_i32_32x32x32_i8__flags:
@@ -4150,33 +4136,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
-; SDAG-NEXT:    s_nop 6
-; SDAG-NEXT:    v_mov_b32_e32 v16, s20
-; SDAG-NEXT:    v_mov_b32_e32 v17, s21
-; SDAG-NEXT:    v_mov_b32_e32 v18, s22
-; SDAG-NEXT:    v_mov_b32_e32 v19, s23
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s20
+; SDAG-NEXT:    v_mov_b32_e32 v33, s21
+; SDAG-NEXT:    v_mov_b32_e32 v34, s22
+; SDAG-NEXT:    v_mov_b32_e32 v35, s23
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s16
-; SDAG-NEXT:    v_mov_b32_e32 v17, s17
-; SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s12
+; SDAG-NEXT:    v_mov_b32_e32 v33, s13
+; SDAG-NEXT:    v_mov_b32_e32 v34, s14
+; SDAG-NEXT:    v_mov_b32_e32 v35, s15
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s8
+; SDAG-NEXT:    v_mov_b32_e32 v33, s9
+; SDAG-NEXT:    v_mov_b32_e32 v34, s10
+; SDAG-NEXT:    v_mov_b32_e32 v35, s11
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -4260,33 +4245,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
 ; HEURRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; HEURRC-NEXT:    s_nop 1
 ; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
-; HEURRC-NEXT:    s_nop 6
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s20
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s21
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s22
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s23
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s20
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s21
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s22
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s23
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s16
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s17
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s18
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s19
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s16
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s17
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s18
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s19
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s12
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s13
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s14
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s15
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s12
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s13
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s14
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s15
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s8
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s9
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s10
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s11
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s8
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s9
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s10
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s11
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
@@ -4324,33 +4308,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
-; VGPRRC-NEXT:    s_nop 6
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s20
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s21
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s22
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s23
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s20
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s21
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s22
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s23
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s16
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s17
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s18
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s16
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s17
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s18
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s19
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s12
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s13
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s14
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s12
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s13
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s14
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s15
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s8
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s9
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s10
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s8
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s9
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s10
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s11
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
@@ -4527,33 +4510,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; SDAG-NEXT:    s_nop 6
-; SDAG-NEXT:    v_mov_b32_e32 v16, s20
-; SDAG-NEXT:    v_mov_b32_e32 v17, s21
-; SDAG-NEXT:    v_mov_b32_e32 v18, s22
-; SDAG-NEXT:    v_mov_b32_e32 v19, s23
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s20
+; SDAG-NEXT:    v_mov_b32_e32 v33, s21
+; SDAG-NEXT:    v_mov_b32_e32 v34, s22
+; SDAG-NEXT:    v_mov_b32_e32 v35, s23
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s16
-; SDAG-NEXT:    v_mov_b32_e32 v17, s17
-; SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s12
+; SDAG-NEXT:    v_mov_b32_e32 v33, s13
+; SDAG-NEXT:    v_mov_b32_e32 v34, s14
+; SDAG-NEXT:    v_mov_b32_e32 v35, s15
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s8
+; SDAG-NEXT:    v_mov_b32_e32 v33, s9
+; SDAG-NEXT:    v_mov_b32_e32 v34, s10
+; SDAG-NEXT:    v_mov_b32_e32 v35, s11
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -4637,33 +4619,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
 ; HEURRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; HEURRC-NEXT:    s_nop 1
 ; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; HEURRC-NEXT:    s_nop 6
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s20
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s21
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s22
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s23
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s20
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s21
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s22
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s23
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s16
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s17
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s18
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s19
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s16
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s17
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s18
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s19
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s12
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s13
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s14
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s15
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s12
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s13
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s14
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s15
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s8
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s9
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s10
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s11
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s8
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s9
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s10
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s11
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
@@ -4701,33 +4682,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; VGPRRC-NEXT:    s_nop 6
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s20
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s21
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s22
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s23
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s20
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s21
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s22
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s23
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s16
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s17
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s18
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s16
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s17
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s18
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s19
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s12
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s13
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s14
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s12
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s13
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s14
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s15
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s8
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s9
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s10
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s8
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s9
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s10
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s11
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir
new file mode 100644
index 0000000000000..271b36fad2bb4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir
@@ -0,0 +1,1292 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -stop-after=virtregrewriter,2 -amdgpu-avoid-hazard-hint-for-mfma=false %s -o - | FileCheck -check-prefix=GFX942_WITHOUT %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -stop-after=virtregrewriter,2 -amdgpu-avoid-hazard-hint-for-mfma=true %s -o - | FileCheck -check-prefix=GFX942_WITH %s
+
+--- |
+  target triple = "amdgcn-amd-amdhsa"
+
+  define amdgpu_kernel void @test_software_pipelining() #0 {
+    bb.0:
+      ret void
+  }
+
+  attributes #0 = {nounwind "amdgpu-waves-per-eu"="2"  "amdgpu-agpr-alloc"="0" "frame-pointer"="none"}
+
+...
+---
+name:            test_software_pipelining
+body:             |
+  bb.0:
+    ; GFX942_WITHOUT-LABEL: name: test_software_pipelining
+    ; GFX942_WITHOUT: renamable $vgpr115 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr109 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr110 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr76 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr108 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr52 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr100 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr111 = V_ADD_U32_e32 4096, $vgpr100, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr101 = V_ADD_U32_e32 $vgpr76, killed $vgpr52, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr112 = V_ADD_U32_e32 4096, $vgpr101, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr112, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr111, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr52_vgpr53, $vgpr80_vgpr81, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = DS_READ_B128_gfx9 renamable $vgpr108, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr54_vgpr55, $vgpr82_vgpr83, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr52_vgpr53, $vgpr92_vgpr93, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr54_vgpr55, $vgpr94_vgpr95, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr52_vgpr53, $vgpr80_vgpr81, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = DS_READ_B128_gfx9 renamable $vgpr108, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr54_vgpr55, $vgpr82_vgpr83, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr52_vgpr53, $vgpr92_vgpr93, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr54_vgpr55, $vgpr94_vgpr95, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr68_vgpr69, $vgpr80_vgpr81, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr76, killed $vgpr0, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr100, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr70_vgpr71, $vgpr82_vgpr83, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr68_vgpr69, $vgpr92_vgpr93, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr70_vgpr71, $vgpr94_vgpr95, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = DS_READ_B128_gfx9 renamable $vgpr108, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr72_vgpr73, $vgpr80_vgpr81, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr101, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr74_vgpr75, $vgpr82_vgpr83, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr72_vgpr73, $vgpr92_vgpr93, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr74_vgpr75, $vgpr94_vgpr95, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr108, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr80_vgpr81, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr72_vgpr73_vgpr74_vgpr75, 16384, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr66_vgpr67, $vgpr82_vgpr83, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr92_vgpr93, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr66_vgpr67, $vgpr94_vgpr95, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = DS_READ_B128_gfx9 renamable $vgpr108, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr80_vgpr81, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr56_vgpr57_vgpr58_vgpr59, 20480, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr82_vgpr83, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr92_vgpr93, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr94_vgpr95, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128_gfx9 renamable $vgpr108, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr48_vgpr49, $vgpr80_vgpr81, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr100, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr50_vgpr51, $vgpr82_vgpr83, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr48_vgpr49, $vgpr92_vgpr93, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr50_vgpr51, $vgpr94_vgpr95, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128_gfx9 renamable $vgpr110, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr80_vgpr81, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr101, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr26_vgpr27, killed $vgpr82_vgpr83, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr92_vgpr93, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr26_vgpr27, killed $vgpr94_vgpr95, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr120 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = DS_READ_B128_gfx9 renamable $vgpr120, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr104_vgpr105_vgpr106_vgpr107 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr104_vgpr105, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr116_vgpr117_vgpr118_vgpr119 = DS_READ_B128_gfx9 renamable $vgpr120, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr22_vgpr23, $vgpr106_vgpr107, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr100_vgpr101, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr22_vgpr23, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr104_vgpr105, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128_gfx9 renamable $vgpr120, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr18_vgpr19, $vgpr106_vgpr107, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr100_vgpr101, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr18_vgpr19, $vgpr102_vgpr103, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr116_vgpr117, $vgpr104_vgpr105, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr16 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr114 = V_ADD_U32_e32 $vgpr115, killed $vgpr16, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr114, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr118_vgpr119, $vgpr106_vgpr107, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr116_vgpr117, $vgpr100_vgpr101, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr118_vgpr119, $vgpr102_vgpr103, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = DS_READ_B128_gfx9 renamable $vgpr120, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr104_vgpr105, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr113 = V_ADD_U32_e32 $vgpr115, killed $vgpr20, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr113, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr26_vgpr27, $vgpr106_vgpr107, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr100_vgpr101, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr26_vgpr27, $vgpr102_vgpr103, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr116_vgpr117_vgpr118_vgpr119 = DS_READ_B128_gfx9 renamable $vgpr120, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr76_vgpr77, $vgpr104_vgpr105, killed $vgpr96_vgpr97_vgpr98_vgpr99, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr84_vgpr85_vgpr86_vgpr87, 24576, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr78_vgpr79, $vgpr106_vgpr107, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr76_vgpr77, $vgpr100_vgpr101, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr78_vgpr79, $vgpr102_vgpr103, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr120, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr116_vgpr117, $vgpr104_vgpr105, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr84_vgpr85_vgpr86_vgpr87, 28672, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr118_vgpr119, $vgpr106_vgpr107, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr116_vgpr117, $vgpr100_vgpr101, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr118_vgpr119, $vgpr102_vgpr103, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr118_vgpr119_vgpr120_vgpr121 = DS_READ_B128_gfx9 killed renamable $vgpr120, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr104_vgpr105, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr56 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr116 = V_ADD_U32_e32 $vgpr115, killed $vgpr56, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr116, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr106_vgpr107, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr100_vgpr101, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr72 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr115 = V_ADD_U32_e32 killed $vgpr115, killed $vgpr72, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr102_vgpr103, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr115, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: S_WAITCNT 49279
+    ; GFX942_WITHOUT-NEXT: S_BARRIER
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr108, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr118_vgpr119, $vgpr104_vgpr105, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr120_vgpr121, killed $vgpr106_vgpr107, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr118_vgpr119, $vgpr100_vgpr101, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr120_vgpr121, killed $vgpr102_vgpr103, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr108, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_BARRIER 0
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr104_vgpr105_vgpr106_vgpr107 = DS_READ_B128_gfx9 renamable $vgpr108, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr108, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr104_vgpr105, $vgpr0_vgpr1, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr96_vgpr97_vgpr98_vgpr99 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr111, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr2_vgpr3, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr104_vgpr105, $vgpr4_vgpr5, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr106_vgpr107, $vgpr6_vgpr7, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr108, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr104_vgpr105_vgpr106_vgpr107 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr112, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr108, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = DS_READ_B128_gfx9 renamable $vgpr108, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr20_vgpr21_vgpr22_vgpr23, 4096, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = DS_READ_B128_gfx9 renamable $vgpr108, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr68_vgpr69, $vgpr0_vgpr1, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr96_vgpr97_vgpr98_vgpr99 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr114, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr70_vgpr71, $vgpr2_vgpr3, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr68_vgpr69, $vgpr4_vgpr5, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr70_vgpr71, $vgpr6_vgpr7, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = DS_READ_B128_gfx9 killed renamable $vgpr110, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr0_vgpr1, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr96_vgpr97_vgpr98_vgpr99 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr113, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr86_vgpr87, killed $vgpr2_vgpr3, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr4_vgpr5, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr86_vgpr87, killed $vgpr6_vgpr7, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr92 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = DS_READ_B128_gfx9 renamable $vgpr92, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr72_vgpr73, $vgpr8_vgpr9, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = DS_READ_B128_gfx9 renamable $vgpr92, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr74_vgpr75, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr72_vgpr73, $vgpr12_vgpr13, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr74_vgpr75, $vgpr14_vgpr15, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr80_vgpr81, $vgpr8_vgpr9, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = DS_READ_B128_gfx9 renamable $vgpr92, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr82_vgpr83, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr80_vgpr81, $vgpr12_vgpr13, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr82_vgpr83, $vgpr14_vgpr15, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr8_vgpr9, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr44_vgpr45_vgpr46_vgpr47 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr116, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr86_vgpr87, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr12_vgpr13, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr86_vgpr87, $vgpr14_vgpr15, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128_gfx9 renamable $vgpr92, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr40_vgpr41, $vgpr8_vgpr9, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr115, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr42_vgpr43, $vgpr10_vgpr11, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr40_vgpr41, $vgpr12_vgpr13, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr42_vgpr43, $vgpr14_vgpr15, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128_gfx9 renamable $vgpr92, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr8_vgpr9, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr56_vgpr57_vgpr58_vgpr59, 8192, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr30_vgpr31, $vgpr10_vgpr11, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr12_vgpr13, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr30_vgpr31, $vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = DS_READ_B128_gfx9 renamable $vgpr92, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr8_vgpr9, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 killed renamable $vgpr109, killed renamable $vgpr60_vgpr61_vgpr62_vgpr63, 12288, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr26_vgpr27, $vgpr10_vgpr11, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr12_vgpr13, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr26_vgpr27, $vgpr14_vgpr15, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128_gfx9 killed renamable $vgpr92, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr8_vgpr9, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr18_vgpr19, $vgpr10_vgpr11, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr12_vgpr13, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr18_vgpr19, $vgpr14_vgpr15, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: S_WAITCNT 49279
+    ; GFX942_WITHOUT-NEXT: S_BARRIER
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = DS_READ_B128_gfx9 renamable $vgpr108, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr8_vgpr9, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr22_vgpr23, killed $vgpr10_vgpr11, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr12_vgpr13, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr22_vgpr23, killed $vgpr14_vgpr15, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = DS_READ_B128_gfx9 killed renamable $vgpr108, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_BARRIER 0
+    ; GFX942_WITHOUT-NEXT: S_ENDPGM 0
+    ;
+    ; GFX942_WITH-LABEL: name: test_software_pipelining
+    ; GFX942_WITH: renamable $vgpr96 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr121 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr122 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr52 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr120 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr97 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr123 = V_ADD_U32_e32 4096, $vgpr97, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr102 = V_ADD_U32_e32 $vgpr52, killed $vgpr0, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr124 = V_ADD_U32_e32 4096, $vgpr102, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr124, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr123, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr0_vgpr1, $vgpr80_vgpr81, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = DS_READ_B128_gfx9 renamable $vgpr120, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr2_vgpr3, $vgpr82_vgpr83, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr0_vgpr1, $vgpr92_vgpr93, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr2_vgpr3, $vgpr94_vgpr95, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr0_vgpr1, $vgpr80_vgpr81, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = DS_READ_B128_gfx9 renamable $vgpr120, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr2_vgpr3, $vgpr82_vgpr83, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr0_vgpr1, $vgpr92_vgpr93, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr2_vgpr3, $vgpr94_vgpr95, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr4_vgpr5, $vgpr80_vgpr81, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: dead renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr52, killed $vgpr0, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr97, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr6_vgpr7, $vgpr82_vgpr83, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr4_vgpr5, $vgpr92_vgpr93, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr6_vgpr7, $vgpr94_vgpr95, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr98_vgpr99_vgpr100_vgpr101 = DS_READ_B128_gfx9 renamable $vgpr120, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr80_vgpr81, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr102, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr104_vgpr105_vgpr106_vgpr107 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr90_vgpr91, $vgpr82_vgpr83, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr92_vgpr93, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr90_vgpr91, $vgpr94_vgpr95, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = DS_READ_B128_gfx9 renamable $vgpr120, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr80_vgpr81, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr108_vgpr109_vgpr110_vgpr111 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr108_vgpr109_vgpr110_vgpr111, 16384, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr116_vgpr117_vgpr118_vgpr119 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr82_vgpr83, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr92_vgpr93, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr100_vgpr101, $vgpr94_vgpr95, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr98_vgpr99_vgpr100_vgpr101 = DS_READ_B128_gfx9 renamable $vgpr120, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr80_vgpr81, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr108_vgpr109_vgpr110_vgpr111 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr108_vgpr109_vgpr110_vgpr111, 20480, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr108_vgpr109_vgpr110_vgpr111 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr90_vgpr91, $vgpr82_vgpr83, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr92_vgpr93, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr90_vgpr91, $vgpr94_vgpr95, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr112_vgpr113_vgpr114_vgpr115 = DS_READ_B128_gfx9 renamable $vgpr120, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr80_vgpr81, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr97, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr82_vgpr83, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr92_vgpr93, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr100_vgpr101, $vgpr94_vgpr95, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = DS_READ_B128_gfx9 renamable $vgpr122, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr112_vgpr113, $vgpr80_vgpr81, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr102, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr114_vgpr115, killed $vgpr82_vgpr83, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr112_vgpr113, $vgpr92_vgpr93, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr114_vgpr115, killed $vgpr94_vgpr95, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr97 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128_gfx9 renamable $vgpr97, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr112_vgpr113_vgpr114_vgpr115 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr112_vgpr113, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128_gfx9 renamable $vgpr97, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr18_vgpr19, $vgpr114_vgpr115, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr100_vgpr101, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr18_vgpr19, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr112_vgpr113, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = DS_READ_B128_gfx9 renamable $vgpr97, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr22_vgpr23, $vgpr114_vgpr115, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr100_vgpr101, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr22_vgpr23, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr112_vgpr113, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr16 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr126 = V_ADD_U32_e32 $vgpr96, killed $vgpr16, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr126, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr30_vgpr31, $vgpr114_vgpr115, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr100_vgpr101, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr30_vgpr31, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr97, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr112_vgpr113, killed $vgpr104_vgpr105_vgpr106_vgpr107, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr20 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr125 = V_ADD_U32_e32 $vgpr96, killed $vgpr20, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr125, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr58_vgpr59, $vgpr114_vgpr115, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr100_vgpr101, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr58_vgpr59, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = DS_READ_B128_gfx9 renamable $vgpr97, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr112_vgpr113, killed $vgpr116_vgpr117_vgpr118_vgpr119, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr84_vgpr85_vgpr86_vgpr87, 24576, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr114_vgpr115, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr100_vgpr101, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr102_vgpr103, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr97, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr112_vgpr113, killed $vgpr108_vgpr109_vgpr110_vgpr111, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr104_vgpr105_vgpr106_vgpr107 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr104_vgpr105_vgpr106_vgpr107, 28672, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr58_vgpr59, $vgpr114_vgpr115, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr100_vgpr101, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr58_vgpr59, $vgpr102_vgpr103, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr106_vgpr107_vgpr108_vgpr109 = DS_READ_B128_gfx9 killed renamable $vgpr97, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr112_vgpr113, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr56 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr104 = V_ADD_U32_e32 $vgpr96, killed $vgpr56, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr104, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr114_vgpr115, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr100_vgpr101, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr60 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr127 = V_ADD_U32_e32 killed $vgpr96, killed $vgpr60, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr102_vgpr103, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr127, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: S_WAITCNT 49279
+    ; GFX942_WITH-NEXT: S_BARRIER
+    ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr120, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr112_vgpr113, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr108_vgpr109, killed $vgpr114_vgpr115, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr100_vgpr101, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr108_vgpr109, killed $vgpr102_vgpr103, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr120, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_BARRIER 0
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr112_vgpr113_vgpr114_vgpr115 = DS_READ_B128_gfx9 renamable $vgpr120, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr120, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr112_vgpr113, $vgpr0_vgpr1, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr96_vgpr97_vgpr98_vgpr99 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr123, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr114_vgpr115, $vgpr2_vgpr3, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr112_vgpr113, $vgpr4_vgpr5, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr114_vgpr115, $vgpr6_vgpr7, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr120, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr106_vgpr107_vgpr108_vgpr109 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr124, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr120, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr120, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr20_vgpr21_vgpr22_vgpr23, 4096, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr120, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr106_vgpr107_vgpr108_vgpr109 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr126, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 killed renamable $vgpr122, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr106_vgpr107_vgpr108_vgpr109 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr125, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, killed $vgpr2_vgpr3, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, killed $vgpr6_vgpr7, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr105 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr105, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr8_vgpr9, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr106_vgpr107_vgpr108_vgpr109 = DS_READ_B128_gfx9 renamable $vgpr105, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr12_vgpr13, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr14_vgpr15, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr8_vgpr9, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr105, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr12_vgpr13, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr14_vgpr15, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr8_vgpr9, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr84_vgpr85_vgpr86_vgpr87 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr104, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr108_vgpr109, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr12_vgpr13, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr108_vgpr109, $vgpr14_vgpr15, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = DS_READ_B128_gfx9 renamable $vgpr105, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr8_vgpr9, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr88_vgpr89_vgpr90_vgpr91 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr127, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr10_vgpr11, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr12_vgpr13, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr14_vgpr15, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = DS_READ_B128_gfx9 renamable $vgpr105, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr8_vgpr9, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr56_vgpr57_vgpr58_vgpr59, 8192, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr86_vgpr87, $vgpr10_vgpr11, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr12_vgpr13, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr86_vgpr87, $vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = DS_READ_B128_gfx9 renamable $vgpr105, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr8_vgpr9, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 killed renamable $vgpr121, killed renamable $vgpr60_vgpr61_vgpr62_vgpr63, 12288, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr90_vgpr91, $vgpr10_vgpr11, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr12_vgpr13, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr90_vgpr91, $vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = DS_READ_B128_gfx9 killed renamable $vgpr105, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr8_vgpr9, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr58_vgpr59, $vgpr10_vgpr11, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr12_vgpr13, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr58_vgpr59, $vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: S_WAITCNT 49279
+    ; GFX942_WITH-NEXT: S_BARRIER
+    ; GFX942_WITH-NEXT: dead renamable $vgpr44_vgpr45_vgpr46_vgpr47 = DS_READ_B128_gfx9 renamable $vgpr120, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr40_vgpr41, $vgpr8_vgpr9, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr42_vgpr43, killed $vgpr10_vgpr11, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr40_vgpr41, $vgpr12_vgpr13, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr42_vgpr43, killed $vgpr14_vgpr15, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr8_vgpr9_vgpr10_vgpr11 = DS_READ_B128_gfx9 killed renamable $vgpr120, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_BARRIER 0
+    ; GFX942_WITH-NEXT: S_ENDPGM 0
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = IMPLICIT_DEF
+    %4:vgpr_32 = IMPLICIT_DEF
+    %5:sgpr_128 = IMPLICIT_DEF
+    %6:sgpr_128 = IMPLICIT_DEF
+    %7:vgpr_32 = IMPLICIT_DEF
+    %8:vreg_128_align2 = IMPLICIT_DEF
+    %9:vreg_128_align2 = IMPLICIT_DEF
+    %10:vreg_128_align2 = IMPLICIT_DEF
+    %11:vreg_128_align2 = IMPLICIT_DEF
+    %12:vreg_128_align2 = IMPLICIT_DEF
+    %13:vreg_128_align2 = IMPLICIT_DEF
+    %14:vreg_128_align2 = IMPLICIT_DEF
+    %15:vreg_128_align2 = IMPLICIT_DEF
+    %16:vreg_128_align2 = IMPLICIT_DEF
+    %17:vreg_128_align2 = IMPLICIT_DEF
+    %18:vreg_128_align2 = IMPLICIT_DEF
+    %19:vreg_128_align2 = IMPLICIT_DEF
+    %20:vreg_128_align2 = IMPLICIT_DEF
+    %21:vreg_128_align2 = IMPLICIT_DEF
+    %22:vreg_128_align2 = IMPLICIT_DEF
+    %23:vreg_128_align2 = IMPLICIT_DEF
+    %25:vgpr_32 = IMPLICIT_DEF
+    %24:vgpr_32 = V_ADD_U32_e32 4096, %25:vgpr_32, implicit $exec
+    %27:vgpr_32 = V_ADD_U32_e32 %3:vgpr_32, %7:vgpr_32, implicit $exec
+    %26:vgpr_32 = V_ADD_U32_e32 4096, %27:vgpr_32, implicit $exec
+    %28:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %26:vgpr_32, %6:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %29:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %24:vgpr_32, %6:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %31:vreg_128_align2 = IMPLICIT_DEF
+    %30:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %23:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %32:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    %33:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %30:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %34:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %22:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %35:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %34:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %37:vreg_128_align2 = IMPLICIT_DEF
+    %36:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %21:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %38:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    %39:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %36:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %40:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %20:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %41:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %40:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %42:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %19:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %43:vgpr_32 = IMPLICIT_DEF
+    %925:vgpr_32 = V_ADD_U32_e32 %3:vgpr_32, %43:vgpr_32, implicit $exec
+    %44:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %25:vgpr_32, %6:sgpr_128, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %45:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %42:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %46:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %18:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %47:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %46:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %48:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
+    %49:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %17:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %50:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %27:vgpr_32, %6:sgpr_128, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %51:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %49:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %52:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %16:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %53:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %52:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %54:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
+    %55:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %48.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %15:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %56:vreg_128_align2 = IMPLICIT_DEF
+    DS_WRITE_B128_gfx9 %1:vgpr_32, %56:vreg_128_align2, 16384, 0, implicit $exec :: (store (s128), addrspace 3)
+    %57:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %48.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %55:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %58:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %48.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %14:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %59:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %48.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %58:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %60:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
+    %61:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %54.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %13:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    DS_WRITE_B128_gfx9 %1:vgpr_32, %62:vreg_128_align2, 20480, 0, implicit $exec :: (store (s128), addrspace 3)
+    %63:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %54.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %61:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %64:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %54.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %12:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %65:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %54.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %64:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %66:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
+    %67:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %60.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %11:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %68:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %25:vgpr_32, %6:sgpr_128, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %69:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %60.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %67:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %70:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %60.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %10:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %71:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %60.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %70:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %72:vreg_128_align2 = DS_READ_B128_gfx9 %2:vgpr_32, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+    %73:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %66.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %9:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %74:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %27:vgpr_32, %6:sgpr_128, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %75:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %66.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %73:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %76:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %66.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %8:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %77:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %66.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %76:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %79:vgpr_32 = IMPLICIT_DEF
+    %78:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
+    %81:vreg_128_align2 = IMPLICIT_DEF
+    %80:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %72.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %33:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %82:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    %83:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %72.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %80:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %84:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %72.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %35:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %72.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %84:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %78.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %39:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    %89:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %78.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %87:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %90:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %78.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %41:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %91:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %78.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %90:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %92:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %82.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %45:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %94:vgpr_32 = IMPLICIT_DEF
+    %93:vgpr_32 = V_ADD_U32_e32 %0:vgpr_32, %94:vgpr_32, implicit $exec
+    %95:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %93:vgpr_32, %5:sgpr_128, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %96:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %82.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %92:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %97:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %82.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %47:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %98:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %82.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %97:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %99:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
+    %100:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %88.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %51:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %102:vgpr_32 = IMPLICIT_DEF
+    %101:vgpr_32 = V_ADD_U32_e32 %0:vgpr_32, %102:vgpr_32, implicit $exec
+    %103:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %101:vgpr_32, %5:sgpr_128, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %104:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %88.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %100:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %105:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %88.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %53:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %106:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %88.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %105:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %107:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
+    %108:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %99.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %57:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %109:vreg_128_align2 = IMPLICIT_DEF
+    DS_WRITE_B128_gfx9 %1:vgpr_32, %109:vreg_128_align2, 24576, 0, implicit $exec :: (store (s128), addrspace 3)
+    %110:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %99.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %108:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %111:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %99.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %59:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %112:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %99.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %111:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %113:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
+    %114:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %107.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %63:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %115:vreg_128_align2 = IMPLICIT_DEF
+    DS_WRITE_B128_gfx9 %1:vgpr_32, %115:vreg_128_align2, 28672, 0, implicit $exec :: (store (s128), addrspace 3)
+    %116:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %107.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %114:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %117:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %107.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %65:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %118:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %107.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %117:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %119:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
+    %120:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %113.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %69:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %122:vgpr_32 = IMPLICIT_DEF
+    %121:vgpr_32 = V_ADD_U32_e32 %0:vgpr_32, %122:vgpr_32, implicit $exec
+    %123:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %121:vgpr_32, %5:sgpr_128, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %124:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %113.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %120:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %125:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %113.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %71:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %127:vgpr_32 = IMPLICIT_DEF
+    %126:vgpr_32 = V_ADD_U32_e32 %0:vgpr_32, %127:vgpr_32, implicit $exec
+    %128:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %113.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %125:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %129:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %126:vgpr_32, %5:sgpr_128, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    S_WAITCNT 49279
+    S_BARRIER
+    %130:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
+    %131:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %119.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %75:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %132:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %119.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %131:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %133:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %119.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %77:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %134:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %119.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %133:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %135:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 512, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 512, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 512, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 512, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_BARRIER 0
+    %136:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %135.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %83:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %137:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
+    %138:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %135.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %136:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %139:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %135.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %86:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %140:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %135.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %139:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %141:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %130.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %89:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %142:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
+    %143:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %130.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %141:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %144:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %130.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %91:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %145:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %130.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %144:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %146:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %137.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %96:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %147:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %137.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %146:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %148:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %137.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %98:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %149:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %137.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %148:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %150:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
+    %151:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %142.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %104:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %152:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %142.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %151:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %153:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %142.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %106:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %154:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %142.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %153:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %155:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
+    %156:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %150.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %110:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    DS_WRITE_B128_gfx9 %1:vgpr_32, %95:vreg_128_align2, 0, 0, implicit $exec :: (store (s128), addrspace 3)
+    %157:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %150.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %156:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %158:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %150.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %112:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %159:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %150.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %158:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %160:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
+    %161:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %155.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %116:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    DS_WRITE_B128_gfx9 %1:vgpr_32, %103:vreg_128_align2, 4096, 0, implicit $exec :: (store (s128), addrspace 3)
+    %162:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %155.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %161:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %163:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %155.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %118:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %164:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %155.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %163:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %165:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
+    %166:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %160.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %124:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %981:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %24:vgpr_32, %6:sgpr_128, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %167:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %160.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %166:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %168:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %160.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %128:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %169:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %160.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %168:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %170:vreg_128_align2 = DS_READ_B128_gfx9 %2:vgpr_32, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
+    %171:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %165.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %132:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %985:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %26:vgpr_32, %6:sgpr_128, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %172:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %165.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %171:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %173:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %165.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %134:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %174:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %165.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %173:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %176:vgpr_32 = IMPLICIT_DEF
+    %175:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
+    %177:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %170.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %138:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %178:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
+    %179:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %170.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %177:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %180:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %170.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %140:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %962:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %170.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %180:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %182:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %175.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %143:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %183:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
+    %961:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %175.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %182:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %185:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %175.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %145:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %960:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %175.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %185:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %187:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %178.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %147:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %956:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %93:vgpr_32, %5:sgpr_128, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %959:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %178.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %187:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %189:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %178.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %149:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %958:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %178.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %189:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %191:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
+    %192:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %183.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %152:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %962:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %101:vgpr_32, %5:sgpr_128, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %957:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %183.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %192:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %194:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %183.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %154:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %956:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %183.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %194:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %196:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
+    %197:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %191.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %157:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    DS_WRITE_B128_gfx9 %1:vgpr_32, %123:vreg_128_align2, 8192, 0, implicit $exec :: (store (s128), addrspace 3)
+    %955:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %191.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %197:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %199:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %191.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %159:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %954:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %191.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %199:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %201:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
+    %202:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %196.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %162:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    DS_WRITE_B128_gfx9 %1:vgpr_32, %129:vreg_128_align2, 12288, 0, implicit $exec :: (store (s128), addrspace 3)
+    %953:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %196.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %202:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %204:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %196.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %164:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %952:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %196.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %204:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %206:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
+    %207:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %201.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %167:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %910:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %121:vgpr_32, %5:sgpr_128, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %951:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %201.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %207:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %209:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %201.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %169:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %950:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %201.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %209:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %911:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %126:vgpr_32, %5:sgpr_128, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    S_WAITCNT 49279
+    S_BARRIER
+    %937:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
+    %211:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %206.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %172:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %949:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %206.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %211:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %213:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %206.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %174:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %948:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %206.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %213:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %931:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 512, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 512, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 512, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 512, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_BARRIER 0
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index 7e30af96bb8b9..d9f1b542e4cb4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -799,17 +799,17 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 {
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 3
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -1155,8 +1155,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2005,21 +2005,21 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr add
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4f16:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, s4
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, s5
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, s5
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, s6
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, s7
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, s6
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, s7
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_f16 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 4
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -2395,21 +2395,21 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr
 ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16f16:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, s4
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, s5
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, s5
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, s6
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, s7
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, s6
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, s7
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_f16 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 6
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -3304,17 +3304,17 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 {
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_i32_4x4x4_16b_i8 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_i32_4x4x4_16b_i8 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 4
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x i32>, ptr addrspace(1) %arg
@@ -3494,19 +3494,19 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_k_src2_1(ptr addrspace(1)
 ;
 ; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
 ; GFX942-VGPR:       ; %bb.0:
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0x41
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_i32_4x4x4_16b_i8 v[0:3], v5, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_i32_4x4x4_16b_i8 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 3
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
   %in.1 = load <4 x i32>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> splat (i32 65), i32 1, i32 2, i32 3)
@@ -4309,7 +4309,7 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) %
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4318,9 +4318,9 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) %
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3]
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3]
 ; GFX942-VGPR-NEXT:    s_nop 3
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v6, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -5017,12 +5017,12 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(ptr addrspace(1) %arg) #0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[0:3]
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v0, v1, v[0:3]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 2
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 2.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
@@ -5542,6 +5542,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v30, v1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v31, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v1
@@ -5570,39 +5572,37 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v27, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v28, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v29, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v30, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v31, v1
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[32:33], v[30:31]
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v34, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[30:31], v[28:29]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[28:29], v[26:27]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[26:27], v[24:25]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[24:25], v[22:23]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[22:23], v[20:21]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[20:21], v[18:19]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[18:19], v[16:17]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[16:17], v[14:15]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], v[12:13]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], v[10:11]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], v[8:9]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[62:63], v[30:31]
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v64, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[60:61], v[28:29]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[58:59], v[26:27]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[56:57], v[24:25]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[54:55], v[22:23]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[52:53], v[20:21]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[50:51], v[18:19]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[48:49], v[16:17]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[46:47], v[14:15]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[44:45], v[12:13]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[42:43], v[10:11]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[40:41], v[8:9]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[38:39], v[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[36:37], v[4:5]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[34:35], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[32:33], v[0:1]
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[2:33], v0, v34, v[2:33]
+; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[32:63], v0, v64, v[32:63]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[30:33], s[0:1] offset:112
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[26:29], s[0:1] offset:96
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[22:25], s[0:1] offset:80
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[18:21], s[0:1] offset:64
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[14:17], s[0:1] offset:48
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[10:13], s[0:1] offset:32
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[60:63], s[0:1] offset:112
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[56:59], s[0:1] offset:96
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[52:55], s[0:1] offset:80
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[48:51], s[0:1] offset:64
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[44:47], s[0:1] offset:48
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[40:43], s[0:1] offset:32
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[36:39], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[32:35], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
@@ -5695,20 +5695,20 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(ptr addrspace(1) %ar
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 1.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-VGPR-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; GFX942-VGPR-NEXT:    v_lshlrev_b32_e32 v8, 4, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0x42f60000
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3]
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 2
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -5804,19 +5804,19 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(ptr addrspa
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 1.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0x42f60000
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3]
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 2
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index f0205a3a788ed..f4f1ca024b7d6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -5101,35 +5101,35 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
 ; SDAG-NEXT:    v_mov_b64_e32 v[20:21], 48
 ; SDAG-NEXT:    global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[22:23], 32
-; SDAG-NEXT:    v_mov_b64_e32 v[24:25], 16
-; SDAG-NEXT:    v_mov_b32_e32 v16, s16
-; SDAG-NEXT:    v_mov_b32_e32 v17, s17
-; SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
+; SDAG-NEXT:    v_mov_b64_e32 v[38:39], 32
+; SDAG-NEXT:    v_mov_b64_e32 v[40:41], 16
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
+; SDAG-NEXT:    global_store_dwordx4 v[38:39], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[26:27], 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
+; SDAG-NEXT:    v_mov_b64_e32 v[42:43], 0
+; SDAG-NEXT:    v_mov_b32_e32 v32, s12
+; SDAG-NEXT:    v_mov_b32_e32 v33, s13
+; SDAG-NEXT:    v_mov_b32_e32 v34, s14
+; SDAG-NEXT:    v_mov_b32_e32 v35, s15
+; SDAG-NEXT:    global_store_dwordx4 v[40:41], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s8
+; SDAG-NEXT:    v_mov_b32_e32 v33, s9
+; SDAG-NEXT:    v_mov_b32_e32 v34, s10
+; SDAG-NEXT:    v_mov_b32_e32 v35, s11
+; SDAG-NEXT:    global_store_dwordx4 v[42:43], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[38:39], v[8:11], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[36:37], v[12:15], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[40:41], v[4:7], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -5137,6 +5137,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x0
 ; GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GISEL-NEXT:    v_mov_b64_e32 v[48:49], 0
+; GISEL-NEXT:    v_mov_b64_e32 v[50:51], 16
+; GISEL-NEXT:    v_mov_b64_e32 v[52:53], 32
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[36:37]
 ; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[38:39]
@@ -5154,28 +5157,33 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
 ; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
 ; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
 ; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
-; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mov_b64_e32 v[54:55], 48
+; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
-; GISEL-NEXT:    v_mov_b64_e32 v[32:33], 0
-; GISEL-NEXT:    v_mov_b64_e32 v[34:35], 16
-; GISEL-NEXT:    v_mov_b64_e32 v[36:37], 32
-; GISEL-NEXT:    v_mov_b64_e32 v[38:39], 48
-; GISEL-NEXT:    global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
+; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[10:11]
+; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[8:9]
+; GISEL-NEXT:    v_mov_b64_e32 v[38:39], s[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[42:43], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[46:47], s[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[40:41], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[44:45], s[20:21]
+; GISEL-NEXT:    global_store_dwordx4 v[48:49], v[32:35], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[50:51], v[36:39], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[52:53], v[40:43], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[54:55], v[44:47], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[50:51], v[4:7], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[52:53], v[8:11], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[54:55], v[12:15], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_endpgm
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0)
@@ -5191,23 +5199,23 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
 ; SDAG-NEXT:    v_mov_b32_e32 v32, 42
 ; SDAG-NEXT:    v_mov_b32_e32 v33, 25
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    v_mov_b32_e32 v20, s16
-; SDAG-NEXT:    v_mov_b32_e32 v21, s17
-; SDAG-NEXT:    v_mov_b32_e32 v22, s18
-; SDAG-NEXT:    v_mov_b32_e32 v23, s19
-; SDAG-NEXT:    v_mov_b32_e32 v24, s20
-; SDAG-NEXT:    v_mov_b32_e32 v25, s21
-; SDAG-NEXT:    v_mov_b32_e32 v26, s22
-; SDAG-NEXT:    v_mov_b32_e32 v27, s23
+; SDAG-NEXT:    v_mov_b32_e32 v0, s12
+; SDAG-NEXT:    v_mov_b32_e32 v1, s13
+; SDAG-NEXT:    v_mov_b32_e32 v2, s14
+; SDAG-NEXT:    v_mov_b32_e32 v3, s15
+; SDAG-NEXT:    v_mov_b32_e32 v4, s16
+; SDAG-NEXT:    v_mov_b32_e32 v5, s17
+; SDAG-NEXT:    v_mov_b32_e32 v6, s18
+; SDAG-NEXT:    v_mov_b32_e32 v7, s19
+; SDAG-NEXT:    v_mov_b32_e32 v8, s20
+; SDAG-NEXT:    v_mov_b32_e32 v9, s21
+; SDAG-NEXT:    v_mov_b32_e32 v10, s22
+; SDAG-NEXT:    v_mov_b32_e32 v11, s23
 ; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
-; SDAG-NEXT:    v_mov_b32_e32 v28, s24
-; SDAG-NEXT:    v_mov_b32_e32 v29, s25
-; SDAG-NEXT:    v_mov_b32_e32 v30, s26
-; SDAG-NEXT:    v_mov_b32_e32 v31, s27
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
+; SDAG-NEXT:    v_mov_b32_e32 v13, s25
+; SDAG-NEXT:    v_mov_b32_e32 v14, s26
+; SDAG-NEXT:    v_mov_b32_e32 v15, s27
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
 ; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
@@ -5242,19 +5250,33 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
 ; SDAG-NEXT:    global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
+; SDAG-NEXT:    global_store_dwordx4 v[38:39], v[32:35], off sc0 sc1
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    v_mov_b32_e32 v32, s12
+; SDAG-NEXT:    v_mov_b32_e32 v33, s13
+; SDAG-NEXT:    v_mov_b32_e32 v34, s14
+; SDAG-NEXT:    v_mov_b32_e32 v35, s15
+; SDAG-NEXT:    global_store_dwordx4 v[40:41], v[32:35], off sc0 sc1
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    v_mov_b32_e32 v32, s8
+; SDAG-NEXT:    v_mov_b32_e32 v33, s9
+; SDAG-NEXT:    v_mov_b32_e32 v34, s10
+; SDAG-NEXT:    v_mov_b32_e32 v35, s11
+; SDAG-NEXT:    global_store_dwordx4 v[42:43], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[38:39], v[8:11], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[36:37], v[12:15], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[40:41], v[4:7], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -5265,6 +5287,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
 ; GISEL-NEXT:    v_mov_b32_e32 v32, 25
 ; GISEL-NEXT:    v_mov_b32_e32 v33, 42
 ; GISEL-NEXT:    v_mov_b64_e32 v[34:35], 16
+; GISEL-NEXT:    v_mov_b64_e32 v[48:49], 0
+; GISEL-NEXT:    v_mov_b64_e32 v[50:51], 16
+; GISEL-NEXT:    v_mov_b64_e32 v[52:53], 32
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[36:37]
 ; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[38:39]
@@ -5296,20 +5321,20 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
 ; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
 ; GISEL-NEXT:    global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[50:51], v[36:39], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[52:53], v[40:43], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[54:55], v[44:47], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 2
 ; GISEL-NEXT:    global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[50:51], v[4:7], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[52:53], v[8:11], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[54:55], v[12:15], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_endpgm
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
index 5475fa2ae5c6e..ef3bb0cb5f4f1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
@@ -71,9 +71,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32_vgprcd(ptr addrspace(1) %ar
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-SDAG-NEXT:    s_nop 1
-; GFX942-SDAG-NEXT:    v_mfma_f32_16x16x8_xf32 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT:    v_mfma_f32_16x16x8_xf32 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-SDAG-NEXT:    s_nop 6
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v8, v[0:3], s[6:7]
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x8xf32_vgprcd:
@@ -87,14 +87,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32_vgprcd(ptr addrspace(1) %ar
 ; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-GISEL-NEXT:    s_mov_b32 s5, 4.0
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-GISEL-NEXT:    s_nop 1
-; GFX942-GISEL-NEXT:    v_mfma_f32_16x16x8_xf32 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-GISEL-NEXT:    s_nop 5
-; GFX942-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX942-GISEL-NEXT:    v_mfma_f32_16x16x8_xf32 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT:    s_nop 6
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX942-GISEL-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 6eb9449069a52..e7c8465b9fbe3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -440,11 +440,13 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0,
 ; SDAG-NEXT:    v_mov_b32_e32 v37, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v38, s2
 ; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v13, s25
-; SDAG-NEXT:    v_mov_b32_e32 v14, s26
-; SDAG-NEXT:    v_mov_b32_e32 v15, s27
-; SDAG-NEXT:    v_mov_b32_e32 v16, s28
-; SDAG-NEXT:    v_mov_b32_e32 v17, s29
+; SDAG-NEXT:    v_mov_b32_e32 v26, v10
+; SDAG-NEXT:    v_mov_b32_e32 v10, s24
+; SDAG-NEXT:    v_mov_b32_e32 v11, s25
+; SDAG-NEXT:    v_mov_b32_e32 v12, s26
+; SDAG-NEXT:    v_mov_b32_e32 v13, s27
+; SDAG-NEXT:    v_mov_b32_e32 v14, s28
+; SDAG-NEXT:    v_mov_b32_e32 v15, s29
 ; SDAG-NEXT:    v_mov_b32_e32 v28, s16
 ; SDAG-NEXT:    v_mov_b32_e32 v29, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v30, s18
@@ -453,17 +455,16 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0,
 ; SDAG-NEXT:    v_mov_b32_e32 v33, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v34, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v16, v0
+; SDAG-NEXT:    v_mov_b32_e32 v17, v1
+; SDAG-NEXT:    v_mov_b32_e32 v18, v2
+; SDAG-NEXT:    v_mov_b32_e32 v19, v3
+; SDAG-NEXT:    v_mov_b32_e32 v20, v4
+; SDAG-NEXT:    v_mov_b32_e32 v21, v5
+; SDAG-NEXT:    v_mov_b32_e32 v22, v6
+; SDAG-NEXT:    v_mov_b32_e32 v23, v7
+; SDAG-NEXT:    v_mov_b32_e32 v24, v8
+; SDAG-NEXT:    v_mov_b32_e32 v25, v9
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[36:39], v[28:35], v10
 ; SDAG-NEXT:    s_nop 11
@@ -783,11 +784,13 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg
 ; GCN-NEXT:    v_mov_b32_e32 v37, s1
 ; GCN-NEXT:    v_mov_b32_e32 v38, s2
 ; GCN-NEXT:    v_mov_b32_e32 v39, s3
-; GCN-NEXT:    v_mov_b32_e32 v13, s25
-; GCN-NEXT:    v_mov_b32_e32 v14, s26
-; GCN-NEXT:    v_mov_b32_e32 v15, s27
-; GCN-NEXT:    v_mov_b32_e32 v16, s28
-; GCN-NEXT:    v_mov_b32_e32 v17, s29
+; GCN-NEXT:    v_mov_b32_e32 v26, v10
+; GCN-NEXT:    v_mov_b32_e32 v10, s24
+; GCN-NEXT:    v_mov_b32_e32 v11, s25
+; GCN-NEXT:    v_mov_b32_e32 v12, s26
+; GCN-NEXT:    v_mov_b32_e32 v13, s27
+; GCN-NEXT:    v_mov_b32_e32 v14, s28
+; GCN-NEXT:    v_mov_b32_e32 v15, s29
 ; GCN-NEXT:    v_mov_b32_e32 v28, s16
 ; GCN-NEXT:    v_mov_b32_e32 v29, s17
 ; GCN-NEXT:    v_mov_b32_e32 v30, s18
@@ -796,17 +799,16 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg
 ; GCN-NEXT:    v_mov_b32_e32 v33, s21
 ; GCN-NEXT:    v_mov_b32_e32 v34, s22
 ; GCN-NEXT:    v_mov_b32_e32 v35, s23
-; GCN-NEXT:    v_mov_b32_e32 v12, s24
-; GCN-NEXT:    v_mov_b32_e32 v18, v0
-; GCN-NEXT:    v_mov_b32_e32 v19, v1
-; GCN-NEXT:    v_mov_b32_e32 v20, v2
-; GCN-NEXT:    v_mov_b32_e32 v21, v3
-; GCN-NEXT:    v_mov_b32_e32 v22, v4
-; GCN-NEXT:    v_mov_b32_e32 v23, v5
-; GCN-NEXT:    v_mov_b32_e32 v24, v6
-; GCN-NEXT:    v_mov_b32_e32 v25, v7
-; GCN-NEXT:    v_mov_b32_e32 v26, v8
-; GCN-NEXT:    v_mov_b32_e32 v27, v9
+; GCN-NEXT:    v_mov_b32_e32 v16, v0
+; GCN-NEXT:    v_mov_b32_e32 v17, v1
+; GCN-NEXT:    v_mov_b32_e32 v18, v2
+; GCN-NEXT:    v_mov_b32_e32 v19, v3
+; GCN-NEXT:    v_mov_b32_e32 v20, v4
+; GCN-NEXT:    v_mov_b32_e32 v21, v5
+; GCN-NEXT:    v_mov_b32_e32 v22, v6
+; GCN-NEXT:    v_mov_b32_e32 v23, v7
+; GCN-NEXT:    v_mov_b32_e32 v24, v8
+; GCN-NEXT:    v_mov_b32_e32 v25, v9
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[36:39], v[28:35], v10
 ; GCN-NEXT:    s_nop 11
@@ -1279,11 +1281,13 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x
 ; SDAG-NEXT:    v_mov_b32_e32 v37, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v38, s2
 ; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v13, s25
-; SDAG-NEXT:    v_mov_b32_e32 v14, s26
-; SDAG-NEXT:    v_mov_b32_e32 v15, s27
-; SDAG-NEXT:    v_mov_b32_e32 v16, s28
-; SDAG-NEXT:    v_mov_b32_e32 v17, s29
+; SDAG-NEXT:    v_mov_b32_e32 v26, v10
+; SDAG-NEXT:    v_mov_b32_e32 v10, s24
+; SDAG-NEXT:    v_mov_b32_e32 v11, s25
+; SDAG-NEXT:    v_mov_b32_e32 v12, s26
+; SDAG-NEXT:    v_mov_b32_e32 v13, s27
+; SDAG-NEXT:    v_mov_b32_e32 v14, s28
+; SDAG-NEXT:    v_mov_b32_e32 v15, s29
 ; SDAG-NEXT:    v_mov_b32_e32 v28, s16
 ; SDAG-NEXT:    v_mov_b32_e32 v29, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v30, s18
@@ -1292,17 +1296,16 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x
 ; SDAG-NEXT:    v_mov_b32_e32 v33, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v34, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v16, v0
+; SDAG-NEXT:    v_mov_b32_e32 v17, v1
+; SDAG-NEXT:    v_mov_b32_e32 v18, v2
+; SDAG-NEXT:    v_mov_b32_e32 v19, v3
+; SDAG-NEXT:    v_mov_b32_e32 v20, v4
+; SDAG-NEXT:    v_mov_b32_e32 v21, v5
+; SDAG-NEXT:    v_mov_b32_e32 v22, v6
+; SDAG-NEXT:    v_mov_b32_e32 v23, v7
+; SDAG-NEXT:    v_mov_b32_e32 v24, v8
+; SDAG-NEXT:    v_mov_b32_e32 v25, v9
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[36:39], v[28:35], v10
 ; SDAG-NEXT:    s_nop 11
@@ -2322,11 +2325,13 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v37, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v38, s2
 ; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v13, s25
-; SDAG-NEXT:    v_mov_b32_e32 v14, s26
-; SDAG-NEXT:    v_mov_b32_e32 v15, s27
-; SDAG-NEXT:    v_mov_b32_e32 v16, s28
-; SDAG-NEXT:    v_mov_b32_e32 v17, s29
+; SDAG-NEXT:    v_mov_b32_e32 v26, v10
+; SDAG-NEXT:    v_mov_b32_e32 v10, s24
+; SDAG-NEXT:    v_mov_b32_e32 v11, s25
+; SDAG-NEXT:    v_mov_b32_e32 v12, s26
+; SDAG-NEXT:    v_mov_b32_e32 v13, s27
+; SDAG-NEXT:    v_mov_b32_e32 v14, s28
+; SDAG-NEXT:    v_mov_b32_e32 v15, s29
 ; SDAG-NEXT:    v_mov_b32_e32 v28, s16
 ; SDAG-NEXT:    v_mov_b32_e32 v29, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v30, s18
@@ -2335,17 +2340,16 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v33, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v34, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v16, v0
+; SDAG-NEXT:    v_mov_b32_e32 v17, v1
+; SDAG-NEXT:    v_mov_b32_e32 v18, v2
+; SDAG-NEXT:    v_mov_b32_e32 v19, v3
+; SDAG-NEXT:    v_mov_b32_e32 v20, v4
+; SDAG-NEXT:    v_mov_b32_e32 v21, v5
+; SDAG-NEXT:    v_mov_b32_e32 v22, v6
+; SDAG-NEXT:    v_mov_b32_e32 v23, v7
+; SDAG-NEXT:    v_mov_b32_e32 v24, v8
+; SDAG-NEXT:    v_mov_b32_e32 v25, v9
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[36:39], v[28:35], v10
 ; SDAG-NEXT:    s_nop 11
@@ -2689,11 +2693,13 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v37, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v38, s2
 ; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v13, s25
-; SDAG-NEXT:    v_mov_b32_e32 v14, s26
-; SDAG-NEXT:    v_mov_b32_e32 v15, s27
-; SDAG-NEXT:    v_mov_b32_e32 v16, s28
-; SDAG-NEXT:    v_mov_b32_e32 v17, s29
+; SDAG-NEXT:    v_mov_b32_e32 v26, v10
+; SDAG-NEXT:    v_mov_b32_e32 v10, s24
+; SDAG-NEXT:    v_mov_b32_e32 v11, s25
+; SDAG-NEXT:    v_mov_b32_e32 v12, s26
+; SDAG-NEXT:    v_mov_b32_e32 v13, s27
+; SDAG-NEXT:    v_mov_b32_e32 v14, s28
+; SDAG-NEXT:    v_mov_b32_e32 v15, s29
 ; SDAG-NEXT:    v_mov_b32_e32 v28, s16
 ; SDAG-NEXT:    v_mov_b32_e32 v29, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v30, s18
@@ -2702,17 +2708,16 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v33, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v34, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v16, v0
+; SDAG-NEXT:    v_mov_b32_e32 v17, v1
+; SDAG-NEXT:    v_mov_b32_e32 v18, v2
+; SDAG-NEXT:    v_mov_b32_e32 v19, v3
+; SDAG-NEXT:    v_mov_b32_e32 v20, v4
+; SDAG-NEXT:    v_mov_b32_e32 v21, v5
+; SDAG-NEXT:    v_mov_b32_e32 v22, v6
+; SDAG-NEXT:    v_mov_b32_e32 v23, v7
+; SDAG-NEXT:    v_mov_b32_e32 v24, v8
+; SDAG-NEXT:    v_mov_b32_e32 v25, v9
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[36:39], v[28:35], v10
 ; SDAG-NEXT:    s_nop 11
@@ -3056,11 +3061,13 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v37, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v38, s2
 ; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v13, s25
-; SDAG-NEXT:    v_mov_b32_e32 v14, s26
-; SDAG-NEXT:    v_mov_b32_e32 v15, s27
-; SDAG-NEXT:    v_mov_b32_e32 v16, s28
-; SDAG-NEXT:    v_mov_b32_e32 v17, s29
+; SDAG-NEXT:    v_mov_b32_e32 v26, v10
+; SDAG-NEXT:    v_mov_b32_e32 v10, s24
+; SDAG-NEXT:    v_mov_b32_e32 v11, s25
+; SDAG-NEXT:    v_mov_b32_e32 v12, s26
+; SDAG-NEXT:    v_mov_b32_e32 v13, s27
+; SDAG-NEXT:    v_mov_b32_e32 v14, s28
+; SDAG-NEXT:    v_mov_b32_e32 v15, s29
 ; SDAG-NEXT:    v_mov_b32_e32 v28, s16
 ; SDAG-NEXT:    v_mov_b32_e32 v29, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v30, s18
@@ -3069,17 +3076,16 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v33, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v34, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v16, v0
+; SDAG-NEXT:    v_mov_b32_e32 v17, v1
+; SDAG-NEXT:    v_mov_b32_e32 v18, v2
+; SDAG-NEXT:    v_mov_b32_e32 v19, v3
+; SDAG-NEXT:    v_mov_b32_e32 v20, v4
+; SDAG-NEXT:    v_mov_b32_e32 v21, v5
+; SDAG-NEXT:    v_mov_b32_e32 v22, v6
+; SDAG-NEXT:    v_mov_b32_e32 v23, v7
+; SDAG-NEXT:    v_mov_b32_e32 v24, v8
+; SDAG-NEXT:    v_mov_b32_e32 v25, v9
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[36:39], v[28:35], v10
 ; SDAG-NEXT:    s_nop 11
@@ -3423,11 +3429,13 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v37, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v38, s2
 ; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v13, s25
-; SDAG-NEXT:    v_mov_b32_e32 v14, s26
-; SDAG-NEXT:    v_mov_b32_e32 v15, s27
-; SDAG-NEXT:    v_mov_b32_e32 v16, s28
-; SDAG-NEXT:    v_mov_b32_e32 v17, s29
+; SDAG-NEXT:    v_mov_b32_e32 v26, v10
+; SDAG-NEXT:    v_mov_b32_e32 v10, s24
+; SDAG-NEXT:    v_mov_b32_e32 v11, s25
+; SDAG-NEXT:    v_mov_b32_e32 v12, s26
+; SDAG-NEXT:    v_mov_b32_e32 v13, s27
+; SDAG-NEXT:    v_mov_b32_e32 v14, s28
+; SDAG-NEXT:    v_mov_b32_e32 v15, s29
 ; SDAG-NEXT:    v_mov_b32_e32 v28, s16
 ; SDAG-NEXT:    v_mov_b32_e32 v29, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v30, s18
@@ -3436,17 +3444,16 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v33, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v34, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v16, v0
+; SDAG-NEXT:    v_mov_b32_e32 v17, v1
+; SDAG-NEXT:    v_mov_b32_e32 v18, v2
+; SDAG-NEXT:    v_mov_b32_e32 v19, v3
+; SDAG-NEXT:    v_mov_b32_e32 v20, v4
+; SDAG-NEXT:    v_mov_b32_e32 v21, v5
+; SDAG-NEXT:    v_mov_b32_e32 v22, v6
+; SDAG-NEXT:    v_mov_b32_e32 v23, v7
+; SDAG-NEXT:    v_mov_b32_e32 v24, v8
+; SDAG-NEXT:    v_mov_b32_e32 v25, v9
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[36:39], v[28:35], v10
 ; SDAG-NEXT:    s_nop 11
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
index b9e9893ede4e2..e422e90ea0271 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
@@ -373,7 +373,7 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 {
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mov_b32_e32 v22, 0x7fc00000
 ; CHECK-NEXT:    v_mov_b64_e32 v[6:7], s[2:3]
 ; CHECK-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
 ; CHECK-NEXT:    s_mov_b32 s0, 0x3c003c00
@@ -382,69 +382,65 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 {
 ; CHECK-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
 ; CHECK-NEXT:    s_mov_b32 s0, 0x7e007e00
 ; CHECK-NEXT:    s_mov_b32 s1, s0
-; CHECK-NEXT:    v_mov_b64_e32 v[10:11], s[0:1]
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[12:13], v[4:7]
-; CHECK-NEXT:    s_nop 1
+; CHECK-NEXT:    v_mov_b32_e32 v23, v22
+; CHECK-NEXT:    v_mov_b32_e32 v24, v22
+; CHECK-NEXT:    v_mov_b32_e32 v25, v22
 ; CHECK-NEXT:    v_accvgpr_write_b32 a0, v0
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[18:21], v[8:9], v[10:11], v[4:7]
+; CHECK-NEXT:    v_mov_b64_e32 v[10:11], s[0:1]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[22:25], v[8:9], v[8:9], v[22:25]
 ; CHECK-NEXT:    v_accvgpr_write_b32 a1, v1
 ; CHECK-NEXT:    v_accvgpr_write_b32 a2, v2
 ; CHECK-NEXT:    v_accvgpr_write_b32 a3, v3
-; CHECK-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; CHECK-NEXT:    v_mov_b32_e32 v5, v4
-; CHECK-NEXT:    v_mov_b32_e32 v6, v4
-; CHECK-NEXT:    v_mov_b32_e32 v7, v4
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[8:9], v[14:17]
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[22:25], v[8:9], v[8:9], v[4:7]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[18:21], v[8:9], v[10:11], v[4:7]
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def v[4:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[12:13], v[4:7]
+; CHECK-NEXT:    v_mov_b64_e32 v[30:31], 0
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[26:29], v[8:9], v[8:9], v[4:7]
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[8:9], v[0:3]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[12:13], v[4:7]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[8:9], v[14:17]
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[22:25], v[8:9], v[8:9], v[22:25]
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[4:7], v[8:9], v[8:9], v[26:29]
 ; CHECK-NEXT:    s_nop 5
 ; CHECK-NEXT:    v_cvt_f16_f32_e32 v23, v14
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[8:9], v[18:21]
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[12:13], v[8:9], v[0:3]
-; CHECK-NEXT:    s_nop 1
+; CHECK-NEXT:    global_store_short v[30:31], v23, off
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[8:9], v[0:3]
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    v_accvgpr_read_b32 v19, a3
 ; CHECK-NEXT:    v_accvgpr_read_b32 v18, a2
-; CHECK-NEXT:    v_mov_b64_e32 v[20:21], 0
-; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_wbl2 sc0 sc1
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    buffer_inv sc0 sc1
 ; CHECK-NEXT:    v_accvgpr_read_b32 v17, a1
 ; CHECK-NEXT:    v_accvgpr_read_b32 v16, a0
 ; CHECK-NEXT:    v_cvt_f16_f32_e32 v15, v22
 ; CHECK-NEXT:    v_cvt_f16_f32_e32 v14, v14
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[16:19], v[8:9], v[8:9], v[16:19]
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v12, v0
-; CHECK-NEXT:    global_store_short v[20:21], v23, off
+; CHECK-NEXT:    global_store_short v[30:31], v15, off
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[10:11], v[8:9], v[4:7]
-; CHECK-NEXT:    global_store_short v[20:21], v15, off
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[12:13], v[8:9], v[0:3]
+; CHECK-NEXT:    global_store_short v[30:31], v14, off
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    global_store_short v[20:21], v14, off
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[8:11], v[10:11], v[8:9], v[4:7]
 ; CHECK-NEXT:    v_cvt_f16_f32_e32 v14, v16
+; CHECK-NEXT:    global_store_short v[30:31], v14, off
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v12, v0
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    global_store_short v[20:21], v14, off
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CHECK-NEXT:    buffer_wbl2 sc0 sc1
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    global_store_short v[20:21], v12, off
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v13, v8
+; CHECK-NEXT:    global_store_short v[30:31], v12, off
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    global_store_short v[20:21], v0, off
+; CHECK-NEXT:    global_store_short v[30:31], v13, off
 ; CHECK-NEXT:    s_endpgm
 entry:
   %k0 = call <4 x float> asm sideeffect "; def $0", "=s"()
@@ -514,13 +510,13 @@ define void @test_rewrite_mfma_subreg_insert1(float %arg0, float %arg1, ptr addr
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[2:5]
+; CHECK-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[6:9], v0, v1, v[2:5]
 ; CHECK-NEXT:    s_nop 3
-; CHECK-NEXT:    v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; CHECK-NEXT:    v_pk_mov_b32 v[0:1], v[6:7], v[8:9] op_sel:[1,0]
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    v_accvgpr_write_b32 a0, v0
 ; CHECK-NEXT:    v_accvgpr_write_b32 a1, v1
-; CHECK-NEXT:    v_accvgpr_write_b32 a2, v3
+; CHECK-NEXT:    v_accvgpr_write_b32 a2, v9
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use a[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
@@ -642,46 +638,14 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add
 ; CHECK-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
 ; CHECK-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1]
 ; CHECK-NEXT:    s_nop 7
-; CHECK-NEXT:    v_accvgpr_read_b32 v0, a0
-; CHECK-NEXT:    v_accvgpr_read_b32 v24, a24
-; CHECK-NEXT:    v_accvgpr_read_b32 v25, a25
-; CHECK-NEXT:    v_accvgpr_read_b32 v26, a26
-; CHECK-NEXT:    v_accvgpr_read_b32 v27, a27
-; CHECK-NEXT:    v_accvgpr_read_b32 v1, a1
-; CHECK-NEXT:    v_accvgpr_read_b32 v2, a2
-; CHECK-NEXT:    v_accvgpr_read_b32 v3, a3
-; CHECK-NEXT:    v_accvgpr_read_b32 v4, a4
-; CHECK-NEXT:    v_accvgpr_read_b32 v5, a5
-; CHECK-NEXT:    v_accvgpr_read_b32 v6, a6
-; CHECK-NEXT:    v_accvgpr_read_b32 v7, a7
-; CHECK-NEXT:    v_accvgpr_read_b32 v8, a8
-; CHECK-NEXT:    v_accvgpr_read_b32 v9, a9
-; CHECK-NEXT:    v_accvgpr_read_b32 v10, a10
-; CHECK-NEXT:    v_accvgpr_read_b32 v11, a11
-; CHECK-NEXT:    v_accvgpr_read_b32 v12, a12
-; CHECK-NEXT:    v_accvgpr_read_b32 v13, a13
-; CHECK-NEXT:    v_accvgpr_read_b32 v14, a14
-; CHECK-NEXT:    v_accvgpr_read_b32 v15, a15
-; CHECK-NEXT:    v_accvgpr_read_b32 v16, a16
-; CHECK-NEXT:    v_accvgpr_read_b32 v17, a17
-; CHECK-NEXT:    v_accvgpr_read_b32 v18, a18
-; CHECK-NEXT:    v_accvgpr_read_b32 v19, a19
-; CHECK-NEXT:    v_accvgpr_read_b32 v20, a20
-; CHECK-NEXT:    v_accvgpr_read_b32 v21, a21
-; CHECK-NEXT:    v_accvgpr_read_b32 v22, a22
-; CHECK-NEXT:    v_accvgpr_read_b32 v23, a23
-; CHECK-NEXT:    v_accvgpr_read_b32 v28, a28
-; CHECK-NEXT:    v_accvgpr_read_b32 v29, a29
-; CHECK-NEXT:    v_accvgpr_read_b32 v30, a30
-; CHECK-NEXT:    v_accvgpr_read_b32 v31, a31
-; CHECK-NEXT:    global_store_dwordx4 v32, v[24:27], s[2:3] offset:96
-; CHECK-NEXT:    global_store_dwordx4 v32, v[28:31], s[2:3] offset:112
-; CHECK-NEXT:    global_store_dwordx4 v32, v[16:19], s[2:3] offset:64
-; CHECK-NEXT:    global_store_dwordx4 v32, v[20:23], s[2:3] offset:80
-; CHECK-NEXT:    global_store_dwordx4 v32, v[8:11], s[2:3] offset:32
-; CHECK-NEXT:    global_store_dwordx4 v32, v[12:15], s[2:3] offset:48
-; CHECK-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3]
-; CHECK-NEXT:    global_store_dwordx4 v32, v[4:7], s[2:3] offset:16
+; CHECK-NEXT:    global_store_dwordx4 v32, a[24:27], s[2:3] offset:96
+; CHECK-NEXT:    global_store_dwordx4 v32, a[28:31], s[2:3] offset:112
+; CHECK-NEXT:    global_store_dwordx4 v32, a[16:19], s[2:3] offset:64
+; CHECK-NEXT:    global_store_dwordx4 v32, a[20:23], s[2:3] offset:80
+; CHECK-NEXT:    global_store_dwordx4 v32, a[8:11], s[2:3] offset:32
+; CHECK-NEXT:    global_store_dwordx4 v32, a[12:15], s[2:3] offset:48
+; CHECK-NEXT:    global_store_dwordx4 v32, a[0:3], s[2:3]
+; CHECK-NEXT:    global_store_dwordx4 v32, a[4:7], s[2:3] offset:16
 ; CHECK-NEXT:    s_endpgm
   %src2 = call <32 x float> asm sideeffect "; def $0", "=a"()
   %mai0 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 2.0, float 4.0, <32 x float> %src2, i32 0, i32 0, i32 0)
@@ -763,15 +727,18 @@ define void @test_rewrite_mfma_copy_from_agpr_class_f64_4x4x4f64_chain(double %a
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mov_b32_e32 v12, v31
 ; CHECK-NEXT:    v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], v[2:3], a[0:1]
-; CHECK-NEXT:    v_and_b32_e32 v2, 0x3ff, v31
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0
-; CHECK-NEXT:    v_lshl_add_u64 v[2:3], v[8:9], 0, v[2:3]
+; CHECK-NEXT:    v_and_b32_e32 v12, 0x3ff, v12
+; CHECK-NEXT:    s_nop 2
 ; CHECK-NEXT:    v_mfma_f64_4x4x4_4b_f64 a[0:1], v[4:5], v[6:7], a[0:1]
 ; CHECK-NEXT:    s_nop 8
 ; CHECK-NEXT:    global_store_dwordx2 v[2:3], a[0:1], off
+; CHECK-NEXT:    v_lshlrev_b32_e32 v4, 3, v12
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0
+; CHECK-NEXT:    v_lshl_add_u64 v[4:5], v[8:9], 0, v[4:5]
+; CHECK-NEXT:    s_nop 5
+; CHECK-NEXT:    global_store_dwordx2 v[4:5], a[0:1], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %src2 = call double asm sideeffect "; def $0", "=a"()
diff --git a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
index a81d9a458e23a..e77856d073a0b 100644
--- a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
@@ -311,43 +311,44 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def v[12:15]
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_mov_b32_e32 v6, 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:31]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:31]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    global_store_dwordx4 v0, v[60:63], s[16:17] offset:112
+; CHECK-NEXT:    global_store_dwordx4 v6, v[60:63], s[16:17] offset:112
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[56:59], s[16:17] offset:96
+; CHECK-NEXT:    global_store_dwordx4 v6, v[56:59], s[16:17] offset:96
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[52:55], s[16:17] offset:80
+; CHECK-NEXT:    global_store_dwordx4 v6, v[52:55], s[16:17] offset:80
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[48:51], s[16:17] offset:64
+; CHECK-NEXT:    global_store_dwordx4 v6, v[48:51], s[16:17] offset:64
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[44:47], s[16:17] offset:48
+; CHECK-NEXT:    global_store_dwordx4 v6, v[44:47], s[16:17] offset:48
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[40:43], s[16:17] offset:32
+; CHECK-NEXT:    global_store_dwordx4 v6, v[40:43], s[16:17] offset:32
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[36:39], s[16:17] offset:16
+; CHECK-NEXT:    global_store_dwordx4 v6, v[36:39], s[16:17] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[32:35], s[16:17]
+; CHECK-NEXT:    global_store_dwordx4 v6, v[32:35], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[56:59], s[16:17] offset:96
+; CHECK-NEXT:    global_store_dwordx4 v6, a[56:59], s[16:17] offset:96
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[60:63], s[16:17] offset:112
+; CHECK-NEXT:    global_store_dwordx4 v6, a[60:63], s[16:17] offset:112
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[48:51], s[16:17] offset:64
+; CHECK-NEXT:    global_store_dwordx4 v6, a[48:51], s[16:17] offset:64
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[52:55], s[16:17] offset:80
+; CHECK-NEXT:    global_store_dwordx4 v6, a[52:55], s[16:17] offset:80
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[40:43], s[16:17] offset:32
+; CHECK-NEXT:    global_store_dwordx4 v6, a[40:43], s[16:17] offset:32
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[44:47], s[16:17] offset:48
+; CHECK-NEXT:    global_store_dwordx4 v6, a[44:47], s[16:17] offset:48
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[32:35], s[16:17]
+; CHECK-NEXT:    global_store_dwordx4 v6, a[32:35], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[36:39], s[16:17] offset:16
+; CHECK-NEXT:    global_store_dwordx4 v6, a[36:39], s[16:17] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx4 v0, v[8:11], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)

>From 504a2657f83264e2ba6a13122a302722c2e54eba Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Fri, 19 Sep 2025 19:47:42 -0400
Subject: [PATCH 02/18] Rebase

---
 llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 6d2b10bdb5804..ed349fccfa3e4 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -48,7 +48,7 @@ static cl::opt<bool> EnableRegisterAvoidListForMFMARegs(
     "amdgpu-avoid-hazard-hint-for-mfma", cl::Hidden,
     cl::desc("Enable Register Avoidance for "
              "MFMA in GCNPreRAOptimizations stage."),
-    cl::init(true));
+    cl::init(false));
 
 namespace {
 

>From df7cbbfbb6e93a3e9b52ddc666f7b1a1533ba850 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Fri, 19 Sep 2025 20:05:14 -0400
Subject: [PATCH 03/18] rebase test files

---
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir | 523 ++++++++---------
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir | 542 +++++++++---------
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll | 184 +++---
 .../AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll    |  62 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 456 ++++++++-------
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll  | 146 ++---
 ...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll | 159 +++--
 .../AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll    |  12 +-
 .../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll       | 231 ++++----
 .../AMDGPU/rewrite-vgpr-mfma-to-agpr.ll       | 123 ++--
 .../unspill-vgpr-after-rewrite-vgpr-mfma.ll   |  33 +-
 11 files changed, 1233 insertions(+), 1238 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
index d4380fd41310a..b07dec326327e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
@@ -15,12 +15,9 @@
   ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
   ; GCN-NEXT:    ; implicit-def: $vgpr106
   ; GCN-NEXT:    ; implicit-def: $vgpr132
-  ; GCN-NEXT:    ; implicit-def: $vgpr112
-  ; GCN-NEXT:    ; implicit-def: $vgpr113
-  ; GCN-NEXT:    ; implicit-def: $vgpr114
-  ; GCN-NEXT:    ; implicit-def: $vgpr115
   ; GCN-NEXT:    ; implicit-def: $vgpr133
   ; GCN-NEXT:    ; implicit-def: $vgpr139
+  ; GCN-NEXT:    ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127
   ; GCN-NEXT:    ; iglp_opt mask(0x00000002)
   ; GCN-NEXT:    ; implicit-def: $sgpr0
   ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -170,45 +167,46 @@
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b128 v95, v[68:71] offset:1024
+  ; GCN-NEXT:    ; implicit-def: $vgpr64
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
+  ; GCN-NEXT:    v_add_u32_e32 v72, 0xc0, v93
+  ; GCN-NEXT:    ; implicit-def: $vgpr73
+  ; GCN-NEXT:    v_add_u32_e32 v76, v132, v64
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:192 sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
-  ; GCN-NEXT:    v_add_u32_e32 v72, 0xc0, v93
-  ; GCN-NEXT:    v_add_u32_e32 v73, v132, v112
   ; GCN-NEXT:    buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ; kill: killed $vgpr72
-  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v113
-  ; GCN-NEXT:    buffer_load_dwordx2 v[98:99], v73, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v73
+  ; GCN-NEXT:    buffer_load_dwordx2 v[98:99], v76, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    buffer_load_dwordx2 v[102:103], v72, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v114
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
+  ; GCN-NEXT:    ; implicit-def: $vgpr74
+  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v74
+  ; GCN-NEXT:    ; implicit-def: $vgpr75
   ; GCN-NEXT:    buffer_load_dwordx2 v[100:101], v72, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v115
+  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v75
   ; GCN-NEXT:    buffer_load_dwordx2 v[104:105], v72, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
-  ; GCN-NEXT:    ; kill: killed $vgpr73
   ; GCN-NEXT:    ds_read_b128 v[72:75], v94
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ; kill: killed $vgpr76
   ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
   ; GCN-NEXT:    ; implicit-def: $sgpr8
-  ; GCN-NEXT:    ; implicit-def: $vgpr112
-  ; GCN-NEXT:    ; implicit-def: $vgpr113
-  ; GCN-NEXT:    ; implicit-def: $vgpr114
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
   ; GCN-NEXT:    ds_read_b128 v[72:75], v94 offset:512
@@ -413,6 +411,8 @@
   ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v66
   ; GCN-NEXT:    ; implicit-def: $vgpr65
   ; GCN-NEXT:    ; implicit-def: $vgpr66
+  ; GCN-NEXT:    ; implicit-def: $vgpr68
+  ; GCN-NEXT:    ; implicit-def: $vgpr67
   ; GCN-NEXT:    v_add_u32_e32 v65, s7, v65
   ; GCN-NEXT:    v_and_b32_e32 v65, 0x1fffffff, v65
   ; GCN-NEXT:    v_mul_lo_u32 v65, v65, s6
@@ -440,36 +440,40 @@
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b64 v138, v[96:97]
-  ; GCN-NEXT:    ; implicit-def: $vgpr96
+  ; GCN-NEXT:    v_add_u32_e32 v68, v132, v68
   ; GCN-NEXT:    v_cndmask_b32_e64 v64, v65, v64, s[6:7]
   ; GCN-NEXT:    v_max_f32_e32 v64, v64, v64
   ; GCN-NEXT:    ; implicit-def: $vgpr65
   ; GCN-NEXT:    v_max_f32_e32 v66, v65, v65
   ; GCN-NEXT:    v_max_f32_e32 v134, v66, v64
-  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v96
+  ; GCN-NEXT:    ; implicit-def: $vgpr64
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_load_dwordx2 v[160:161], v64, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[156:157], v68, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v112
-  ; GCN-NEXT:    buffer_load_dwordx2 v[162:163], v64, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v64
+  ; GCN-NEXT:    buffer_load_dwordx2 v[158:159], v64, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v113
+  ; GCN-NEXT:    ; implicit-def: $vgpr66
+  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v66
   ; GCN-NEXT:    buffer_load_dwordx2 v[128:129], v64, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v114
+  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v67
   ; GCN-NEXT:    buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_fma_f32 v48, s4, v48, -v134
   ; GCN-NEXT:    v_fma_f32 v57, s4, v57, -v134
+  ; GCN-NEXT:    v_fma_f32 v48, s4, v48, -v134
+  ; GCN-NEXT:    v_fma_f32 v96, s4, v58, -v134
+  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v57
   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v48
   ; GCN-NEXT:    v_fma_f32 v64, s4, v49, -v134
-  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v57
+  ; GCN-NEXT:    v_exp_f32_e32 v163, v57
+  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v96
   ; GCN-NEXT:    v_fma_f32 v66, s4, v50, -v134
-  ; GCN-NEXT:    v_exp_f32_e32 v165, v57
+  ; GCN-NEXT:    v_exp_f32_e32 v164, v57
   ; GCN-NEXT:    v_exp_f32_e32 v49, v48
   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v64
   ; GCN-NEXT:    v_fma_f32 v67, s4, v51, -v134
@@ -495,30 +499,31 @@
   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v70
   ; GCN-NEXT:    v_exp_f32_e32 v55, v48
   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v71
-  ; GCN-NEXT:    v_fma_f32 v66, s4, v56, -v134
-  ; GCN-NEXT:    v_exp_f32_e32 v56, v48
-  ; GCN-NEXT:    v_sub_f32_e32 v48, v65, v134
   ; GCN-NEXT:    ds_read_b128 v[144:147], v139 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_fma_f32 v66, s4, v56, -v134
+  ; GCN-NEXT:    v_exp_f32_e32 v56, v48
+  ; GCN-NEXT:    v_sub_f32_e32 v48, v65, v134
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v64, v49
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v67, v50
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v68, v51
-  ; GCN-NEXT:    v_fma_f32 v96, s4, v58, -v134
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v58, v52
   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v48
   ; GCN-NEXT:    ds_read_b128 v[148:151], v139 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_exp_f32_e32 v48, v48
-  ; GCN-NEXT:    v_fma_f32 v156, s4, v59, -v134
-  ; GCN-NEXT:    v_pack_b32_f16 v59, v68, v58
-  ; GCN-NEXT:    v_pack_b32_f16 v58, v64, v67
-  ; GCN-NEXT:    v_mul_f32_e32 v80, 0x3fb8aa3b, v66
+  ; GCN-NEXT:    v_pack_b32_f16 v161, v68, v58
+  ; GCN-NEXT:    v_pack_b32_f16 v160, v64, v67
+  ; GCN-NEXT:    v_mul_f32_e32 v58, 0x3fb8aa3b, v66
   ; GCN-NEXT:    ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79
   ; GCN-NEXT:    ds_read_b128 v[152:155], v139 offset:1728
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_fma_f32 v162, s4, v61, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v61, v55
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v57, v56
   ; GCN-NEXT:    v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0]
@@ -527,15 +532,9 @@
   ; GCN-NEXT:    v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v96
-  ; GCN-NEXT:    ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111
-  ; GCN-NEXT:    v_fma_f32 v157, s4, v60, -v134
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[58:59], v[64:79]
-  ; GCN-NEXT:    v_exp_f32_e32 v141, v80
   ; GCN-NEXT:    ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
-  ; GCN-NEXT:    v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_fma_f32 v59, s4, v59, -v134
   ; GCN-NEXT:    v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0]
-<<<<<<< HEAD
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79]
   ; GCN-NEXT:    v_mul_f32_e64 v82, v82, v48
   ; GCN-NEXT:    v_mul_f32_e64 v83, v83, v48
@@ -543,16 +542,10 @@
   ; GCN-NEXT:    v_mul_f32_e64 v85, v85, v48
   ; GCN-NEXT:    v_mul_f32_e64 v86, v86, v48
   ; GCN-NEXT:    v_mul_f32_e64 v87, v87, v48
-=======
-  ; GCN-NEXT:    v_pk_mul_f32 v[82:83], v[82:83], v[48:49] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[84:85], v[84:85], v[48:49] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[86:87], v[86:87], v[48:49] op_sel_hi:[1,0]
->>>>>>> ee1ade05012a ([AMDGPU] Improve register allocation to reduce MFMA hazard NOPs)
   ; GCN-NEXT:    v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0]
-<<<<<<< HEAD
   ; GCN-NEXT:    ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111
   ; GCN-NEXT:    v_exp_f32_e32 v58, v58
   ; GCN-NEXT:    v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0]
@@ -563,17 +556,13 @@
   ; GCN-NEXT:    v_mul_f32_e64 v101, v101, v48
   ; GCN-NEXT:    v_mul_f32_e64 v102, v102, v48
   ; GCN-NEXT:    v_mul_f32_e64 v103, v103, v48
-=======
-  ; GCN-NEXT:    v_pk_mul_f32 v[98:99], v[98:99], v[48:49] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[100:101], v[100:101], v[48:49] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[102:103], v[102:103], v[48:49] op_sel_hi:[1,0]
->>>>>>> ee1ade05012a ([AMDGPU] Improve register allocation to reduce MFMA hazard NOPs)
   ; GCN-NEXT:    v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pack_b32_f16 v145, v61, v57
+  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v59
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v140, v53
-<<<<<<< HEAD
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v141, v54
   ; GCN-NEXT:    v_exp_f32_e32 v59, v57
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111]
@@ -582,264 +571,249 @@
   ; GCN-NEXT:    v_mul_f32_e64 v113, v113, v48
   ; GCN-NEXT:    v_mul_f32_e64 v114, v114, v48
   ; GCN-NEXT:    v_mul_f32_e64 v115, v115, v48
-=======
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[58:59], v[80:95]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v144, v54
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v145, v55
-  ; GCN-NEXT:    v_exp_f32_e32 v167, v57
-  ; GCN-NEXT:    ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127
-  ; GCN-NEXT:    v_mul_f32_e32 v168, 0x3fb8aa3b, v157
-  ; GCN-NEXT:    v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[114:115], v[114:115], v[48:49] op_sel_hi:[1,0]
->>>>>>> ee1ade05012a ([AMDGPU] Improve register allocation to reduce MFMA hazard NOPs)
   ; GCN-NEXT:    v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[58:59], v[96:111]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v148, v56
   ; GCN-NEXT:    v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pack_b32_f16 v149, v145, v148
-  ; GCN-NEXT:    v_pack_b32_f16 v148, v140, v144
-  ; GCN-NEXT:    v_mul_f32_e32 v140, 0x3fb8aa3b, v156
-  ; GCN-NEXT:    v_exp_f32_e32 v168, v168
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[58:59], v[112:127]
-  ; GCN-NEXT:    v_exp_f32_e32 v153, v140
-  ; GCN-NEXT:    ; implicit-def: $vgpr140
-  ; GCN-NEXT:    v_fma_f32 v164, s4, v61, -v134
-  ; GCN-NEXT:    v_fma_f32 v166, s4, v62, -v134
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v169, v141
+  ; GCN-NEXT:    v_fma_f32 v148, s4, v62, -v134
+  ; GCN-NEXT:    v_pack_b32_f16 v144, v140, v141
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[160:161], v[112:127]
   ; GCN-NEXT:    v_fma_f32 v152, s4, v63, -v134
-  ; GCN-NEXT:    v_fma_f32 v32, s4, v32, -v134
-  ; GCN-NEXT:    v_fma_f32 v57, s4, v35, -v134
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[148:149], v[64:79]
-  ; GCN-NEXT:    ds_read_b128 v[142:145], v140
+  ; GCN-NEXT:    v_mul_f32_e32 v149, 0x3fb8aa3b, v60
+  ; GCN-NEXT:    ; implicit-def: $vgpr57
+  ; GCN-NEXT:    ds_read_b128 v[60:63], v57
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[156:159], v140 offset:576
+  ; GCN-NEXT:    v_exp_f32_e32 v160, v149
+  ; GCN-NEXT:    v_fma_f32 v161, s4, v33, -v134
+  ; GCN-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v148
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v153, v58
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79]
+  ; GCN-NEXT:    v_fma_f32 v32, s4, v32, -v134
+  ; GCN-NEXT:    ds_read_b128 v[140:143], v57 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_fma_f32 v40, s4, v40, -v134
   ; GCN-NEXT:    v_fma_f32 v44, s4, v44, -v134
   ; GCN-NEXT:    v_fma_f32 v16, s4, v16, -v134
+  ; GCN-NEXT:    v_fma_f32 v166, s4, v20, -v134
   ; GCN-NEXT:    v_fma_f32 v24, s4, v24, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95]
+  ; GCN-NEXT:    v_mul_f32_e32 v146, 0x3fb8aa3b, v162
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v147, v163
+  ; GCN-NEXT:    v_exp_f32_e32 v162, v146
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v146, v164
   ; GCN-NEXT:    v_fma_f32 v28, s4, v28, -v134
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[148:149], v[80:95]
-  ; GCN-NEXT:    v_mul_f32_e32 v146, 0x3fb8aa3b, v164
-  ; GCN-NEXT:    v_fma_f32 v164, s4, v33, -v134
-  ; GCN-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v166
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v147, v165
-  ; GCN-NEXT:    v_exp_f32_e32 v170, v146
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v146, v167
+  ; GCN-NEXT:    v_pack_b32_f16 v148, v153, v147
   ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v134
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[148:149], v[96:111]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[144:145], v[96:111]
   ; GCN-NEXT:    v_exp_f32_e32 v151, v33
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v153
-  ; GCN-NEXT:    v_pack_b32_f16 v62, v169, v147
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v59
   ; GCN-NEXT:    v_fma_f32 v150, s4, v34, -v134
-  ; GCN-NEXT:    v_perm_b32 v147, v131, v129, s8
-  ; GCN-NEXT:    v_pack_b32_f16 v63, v146, v33
-  ; GCN-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v152
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[148:149], v[112:127]
-  ; GCN-NEXT:    v_exp_f32_e32 v148, v33
-  ; GCN-NEXT:    v_fma_f32 v152, s4, v36, -v134
-  ; GCN-NEXT:    v_perm_b32 v36, v162, v160, s5
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v149, v168
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v155, v170
-  ; GCN-NEXT:    v_perm_b32 v146, v163, v161, s8
   ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v134
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[62:63], v[64:79]
-  ; GCN-NEXT:    v_mul_f32_e32 v142, 0x3fb8aa3b, v32
-  ; GCN-NEXT:    ds_read_b128 v[32:35], v140 offset:1152
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[58:61], v140 offset:1728
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mul_f32_e32 v143, 0x3fb8aa3b, v164
-  ; GCN-NEXT:    v_exp_f32_e32 v154, v142
-  ; GCN-NEXT:    v_perm_b32 v142, v162, v160, s8
-  ; GCN-NEXT:    v_fma_f32 v160, s4, v38, -v134
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[156:157], v[62:63], v[80:95]
-  ; GCN-NEXT:    v_exp_f32_e32 v157, v143
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v38, v148
-  ; GCN-NEXT:    v_fma_f32 v156, s4, v37, -v134
+  ; GCN-NEXT:    v_fma_f32 v12, s4, v12, -v134
+  ; GCN-NEXT:    v_pack_b32_f16 v149, v146, v33
+  ; GCN-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v152
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[144:145], v[112:127]
+  ; GCN-NEXT:    v_fma_f32 v152, s4, v35, -v134
+  ; GCN-NEXT:    v_exp_f32_e32 v153, v33
+  ; GCN-NEXT:    v_fma_f32 v155, s4, v36, -v134
+  ; GCN-NEXT:    v_perm_b32 v36, v158, v156, s5
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v154, v160
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[60:61], v[148:149], v[64:79]
+  ; GCN-NEXT:    v_mul_f32_e32 v60, 0x3fb8aa3b, v32
+  ; GCN-NEXT:    ds_read_b128 v[32:35], v57 offset:1152
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[144:147], v57 offset:1728
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mul_f32_e32 v61, 0x3fb8aa3b, v161
+  ; GCN-NEXT:    v_exp_f32_e32 v165, v60
+  ; GCN-NEXT:    v_perm_b32 v60, v158, v156, s8
+  ; GCN-NEXT:    v_fma_f32 v158, s4, v37, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[148:149], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v161, v61
+  ; GCN-NEXT:    v_perm_b32 v140, v159, v157, s8
   ; GCN-NEXT:    v_perm_b32 v37, v130, v128, s5
-  ; GCN-NEXT:    v_perm_b32 v143, v130, v128, s8
+  ; GCN-NEXT:    v_perm_b32 v61, v130, v128, s8
+  ; GCN-NEXT:    v_perm_b32 v141, v131, v129, s8
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    ds_write_b64 v135, v[36:37]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[62:63], v[96:111]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[148:149], v[96:111]
+  ; GCN-NEXT:    v_perm_b32 v32, v159, v157, s5
   ; GCN-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v150
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v150, v151
-  ; GCN-NEXT:    v_perm_b32 v32, v163, v161, s5
-  ; GCN-NEXT:    v_exp_f32_e32 v161, v33
+  ; GCN-NEXT:    v_fma_f32 v157, s4, v38, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v38, v153
+  ; GCN-NEXT:    v_exp_f32_e32 v159, v33
   ; GCN-NEXT:    v_perm_b32 v33, v131, v129, s5
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[144:145], v[148:149], v[112:127]
+  ; GCN-NEXT:    v_pack_b32_f16 v129, v150, v38
+  ; GCN-NEXT:    v_mul_f32_e32 v38, 0x3fb8aa3b, v152
+  ; GCN-NEXT:    v_exp_f32_e32 v152, v38
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b64 v136, v[142:143]
+  ; GCN-NEXT:    ds_write_b64 v136, v[60:61]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b64 v137, v[32:33]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[58:59], v[62:63], v[112:127]
-  ; GCN-NEXT:    v_pack_b32_f16 v59, v150, v38
-  ; GCN-NEXT:    v_mul_f32_e32 v38, 0x3fb8aa3b, v57
-  ; GCN-NEXT:    v_pack_b32_f16 v58, v149, v155
-  ; GCN-NEXT:    v_exp_f32_e32 v149, v38
   ; GCN-NEXT:    ; implicit-def: $vgpr33
   ; GCN-NEXT:    ; implicit-def: $vgpr38
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b64 v138, v[146:147]
+  ; GCN-NEXT:    ds_write_b64 v138, v[140:141]
   ; GCN-NEXT:    v_add_u32_e32 v38, v132, v38
   ; GCN-NEXT:    v_add_u32_e32 v33, v132, v33
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_load_dwordx2 v[62:63], v38, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[130:131], v38, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx2 v[142:143], v33, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[140:141], v33, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ; implicit-def: $vgpr36
   ; GCN-NEXT:    v_add_u32_e32 v33, v132, v36
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[144:145], v[58:59], v[64:79]
   ; GCN-NEXT:    ; implicit-def: $vgpr37
   ; GCN-NEXT:    buffer_load_dwordx2 v[144:145], v33, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_add_u32_e32 v33, v132, v37
-  ; GCN-NEXT:    buffer_load_dwordx2 v[146:147], v33, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[148:149], v33, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v152
-  ; GCN-NEXT:    v_exp_f32_e32 v150, v32
-  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v156
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v156, v162
+  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v155
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[158:159], v[58:59], v[80:95]
-  ; GCN-NEXT:    v_exp_f32_e32 v156, v32
-  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v160
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v154
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v152, v157
-  ; GCN-NEXT:    v_fma_f32 v57, s4, v39, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v165
+  ; GCN-NEXT:    v_pack_b32_f16 v128, v154, v156
+  ; GCN-NEXT:    v_fma_f32 v150, s4, v39, -v134
   ; GCN-NEXT:    ds_read_b128 v[36:39], v139
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[128:131], v139 offset:576
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[58:59], v[96:111]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v34, v161
-  ; GCN-NEXT:    v_exp_f32_e32 v159, v32
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v32, v149
-  ; GCN-NEXT:    v_fma_f32 v155, s4, v41, -v134
-  ; GCN-NEXT:    v_fma_f32 v158, s4, v42, -v134
-  ; GCN-NEXT:    v_fma_f32 v162, s4, v20, -v134
-  ; GCN-NEXT:    v_fma_f32 v12, s4, v12, -v134
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[60:61], v[58:59], v[112:127]
-  ; GCN-NEXT:    v_pack_b32_f16 v59, v34, v32
-  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v57
-  ; GCN-NEXT:    v_pack_b32_f16 v58, v33, v152
-  ; GCN-NEXT:    v_exp_f32_e32 v60, v32
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[62:63], v[128:129], v[64:79]
+  ; GCN-NEXT:    v_exp_f32_e32 v154, v32
+  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v158
+  ; GCN-NEXT:    ds_read_b128 v[60:63], v139 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_fma_f32 v156, s4, v42, -v134
+  ; GCN-NEXT:    v_perm_b32 v20, v140, v130, s5
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[142:143], v[128:129], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v155, v32
+  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v157
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v142, v161
+  ; GCN-NEXT:    v_fma_f32 v143, s4, v41, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[128:129], v[96:111]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v34, v159
+  ; GCN-NEXT:    v_exp_f32_e32 v157, v32
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v32, v152
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[146:147], v[128:129], v[112:127]
+  ; GCN-NEXT:    v_pack_b32_f16 v129, v34, v32
+  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v150
+  ; GCN-NEXT:    v_pack_b32_f16 v128, v33, v142
+  ; GCN-NEXT:    v_exp_f32_e32 v146, v32
   ; GCN-NEXT:    ds_read_b128 v[32:35], v139 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_fma_f32 v57, s4, v43, -v134
-  ; GCN-NEXT:    v_perm_b32 v20, v142, v62, s5
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[58:59], v[64:79]
+  ; GCN-NEXT:    v_fma_f32 v142, s4, v43, -v134
+  ; GCN-NEXT:    v_fma_f32 v150, s4, v46, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[128:129], v[64:79]
   ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v40
   ; GCN-NEXT:    ds_read_b128 v[40:43], v139 offset:1728
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_exp_f32_e32 v61, v36
-  ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v155
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v37, v150
-  ; GCN-NEXT:    v_fma_f32 v155, s4, v46, -v134
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[128:129], v[58:59], v[80:95]
-  ; GCN-NEXT:    v_exp_f32_e32 v152, v36
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v128, v156
-  ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v57
-  ; GCN-NEXT:    v_fma_f32 v129, s4, v45, -v134
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[58:59], v[96:111]
-  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v158
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v159
-  ; GCN-NEXT:    v_exp_f32_e32 v158, v32
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v32, v60
+  ; GCN-NEXT:    v_exp_f32_e32 v147, v36
+  ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v143
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v37, v154
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[60:61], v[128:129], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v143, v36
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v60, v155
+  ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v142
+  ; GCN-NEXT:    v_fma_f32 v61, s4, v45, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[128:129], v[96:111]
+  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v156
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v157
+  ; GCN-NEXT:    v_exp_f32_e32 v156, v32
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v32, v146
   ; GCN-NEXT:    v_pack_b32_f16 v33, v33, v32
-  ; GCN-NEXT:    v_pack_b32_f16 v32, v37, v128
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[58:59], v[112:127]
-  ; GCN-NEXT:    v_exp_f32_e32 v57, v36
+  ; GCN-NEXT:    v_pack_b32_f16 v32, v37, v60
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[128:129], v[112:127]
+  ; GCN-NEXT:    v_exp_f32_e32 v129, v36
   ; GCN-NEXT:    v_mul_f32_e32 v40, 0x3fb8aa3b, v44
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v59, v61
-  ; GCN-NEXT:    v_fma_f32 v58, s4, v47, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v60, v147
+  ; GCN-NEXT:    v_fma_f32 v128, s4, v47, -v134
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79]
-  ; GCN-NEXT:    ds_read_b128 v[36:39], v140
+  ; GCN-NEXT:    ds_read_b128 v[36:39], v57
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_exp_f32_e32 v128, v40
-  ; GCN-NEXT:    v_mul_f32_e32 v40, 0x3fb8aa3b, v129
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v129, v152
-  ; GCN-NEXT:    ds_read_b128 v[44:47], v140 offset:576
+  ; GCN-NEXT:    v_exp_f32_e32 v142, v40
+  ; GCN-NEXT:    v_mul_f32_e32 v40, 0x3fb8aa3b, v61
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v61, v143
+  ; GCN-NEXT:    ds_read_b128 v[44:47], v57 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[130:131], v[32:33], v[80:95]
-  ; GCN-NEXT:    v_fma_f32 v130, s4, v17, -v134
-  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v155
-  ; GCN-NEXT:    v_exp_f32_e32 v131, v40
-  ; GCN-NEXT:    v_pack_b32_f16 v40, v59, v129
-  ; GCN-NEXT:    v_fma_f32 v155, s4, v18, -v134
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v59, v128
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[62:63], v[32:33], v[80:95]
+  ; GCN-NEXT:    v_fma_f32 v62, s4, v17, -v134
+  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v150
+  ; GCN-NEXT:    v_exp_f32_e32 v63, v40
+  ; GCN-NEXT:    v_pack_b32_f16 v40, v60, v61
+  ; GCN-NEXT:    v_fma_f32 v150, s4, v18, -v134
+  ; GCN-NEXT:    v_fma_f32 v60, s4, v19, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v61, v142
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[32:33], v[96:111]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v34, v158
-  ; GCN-NEXT:    v_exp_f32_e32 v160, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v57
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v34, v156
+  ; GCN-NEXT:    v_exp_f32_e32 v158, v17
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v129
   ; GCN-NEXT:    v_pack_b32_f16 v41, v34, v17
-  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v58
+  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v128
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[42:43], v[32:33], v[112:127]
-  ; GCN-NEXT:    v_fma_f32 v58, s4, v19, -v134
-  ; GCN-NEXT:    v_exp_f32_e32 v129, v17
-  ; GCN-NEXT:    v_perm_b32 v42, v143, v63, s8
-  ; GCN-NEXT:    v_perm_b32 v43, v147, v145, s8
+  ; GCN-NEXT:    v_exp_f32_e32 v128, v17
+  ; GCN-NEXT:    v_perm_b32 v42, v141, v131, s8
+  ; GCN-NEXT:    v_perm_b32 v43, v149, v145, s8
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[40:41], v[64:79]
   ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v16
-  ; GCN-NEXT:    ds_read_b128 v[16:19], v140 offset:1152
+  ; GCN-NEXT:    ds_read_b128 v[16:19], v57 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[32:35], v140 offset:1728
+  ; GCN-NEXT:    ds_read_b128 v[32:35], v57 offset:1728
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mul_f32_e32 v37, 0x3fb8aa3b, v130
-  ; GCN-NEXT:    v_exp_f32_e32 v163, v36
-  ; GCN-NEXT:    v_perm_b32 v36, v142, v62, s8
+  ; GCN-NEXT:    v_mul_f32_e32 v37, 0x3fb8aa3b, v62
+  ; GCN-NEXT:    v_exp_f32_e32 v167, v36
+  ; GCN-NEXT:    v_perm_b32 v36, v140, v130, s8
   ; GCN-NEXT:    v_fma_f32 v62, s4, v21, -v134
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[44:45], v[40:41], v[80:95]
   ; GCN-NEXT:    v_exp_f32_e32 v130, v37
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v45, v160
-  ; GCN-NEXT:    v_perm_b32 v21, v146, v144, s5
-  ; GCN-NEXT:    v_perm_b32 v37, v146, v144, s8
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v44, v131
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v45, v158
+  ; GCN-NEXT:    v_perm_b32 v21, v148, v144, s5
+  ; GCN-NEXT:    v_perm_b32 v37, v148, v144, s8
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v44, v63
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    ds_write_b64 v135, v[20:21]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[40:41], v[96:111]
-  ; GCN-NEXT:    v_perm_b32 v16, v143, v63, s5
-  ; GCN-NEXT:    v_fma_f32 v63, s4, v22, -v134
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v129
-  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v155
-  ; GCN-NEXT:    v_exp_f32_e32 v142, v17
-  ; GCN-NEXT:    v_perm_b32 v17, v147, v145, s5
+  ; GCN-NEXT:    v_perm_b32 v16, v141, v131, s5
+  ; GCN-NEXT:    v_fma_f32 v131, s4, v22, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v128
+  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v150
+  ; GCN-NEXT:    v_exp_f32_e32 v140, v17
+  ; GCN-NEXT:    v_perm_b32 v17, v149, v145, s5
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b64 v136, v[36:37]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127]
   ; GCN-NEXT:    v_pack_b32_f16 v33, v45, v22
-  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v58
+  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v60
   ; GCN-NEXT:    v_exp_f32_e32 v144, v22
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -862,22 +836,22 @@
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_add_u32_e32 v20, v132, v20
   ; GCN-NEXT:    v_add_u32_e32 v21, v132, v21
-  ; GCN-NEXT:    v_pack_b32_f16 v32, v59, v44
+  ; GCN-NEXT:    v_pack_b32_f16 v32, v61, v44
   ; GCN-NEXT:    buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx2 v[58:59], v21, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[60:61], v21, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v162
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v166
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79]
   ; GCN-NEXT:    v_exp_f32_e32 v132, v16
   ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v62
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v163
-  ; GCN-NEXT:    v_fma_f32 v143, s4, v23, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v167
+  ; GCN-NEXT:    v_fma_f32 v141, s4, v23, -v134
   ; GCN-NEXT:    ds_read_b128 v[20:23], v139
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
@@ -886,20 +860,20 @@
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[46:47], v[32:33], v[80:95]
   ; GCN-NEXT:    v_exp_f32_e32 v62, v16
-  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v63
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v131
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v46, v130
   ; GCN-NEXT:    v_fma_f32 v47, s4, v25, -v134
-  ; GCN-NEXT:    v_fma_f32 v63, s4, v26, -v134
-  ; GCN-NEXT:    v_fma_f32 v147, s4, v4, -v134
+  ; GCN-NEXT:    v_fma_f32 v131, s4, v26, -v134
+  ; GCN-NEXT:    v_fma_f32 v149, s4, v4, -v134
   ; GCN-NEXT:    ; implicit-def: $sgpr0
   ; GCN-NEXT:    v_perm_b32 v4, v42, v40, s5
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[32:33], v[96:111]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v142
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v140
   ; GCN-NEXT:    v_exp_f32_e32 v145, v16
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v144
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[34:35], v[32:33], v[112:127]
   ; GCN-NEXT:    v_pack_b32_f16 v33, v18, v16
-  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v143
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v141
   ; GCN-NEXT:    v_pack_b32_f16 v32, v17, v46
   ; GCN-NEXT:    v_exp_f32_e32 v35, v16
   ; GCN-NEXT:    ds_read_b128 v[16:19], v139 offset:1152
@@ -921,11 +895,11 @@
   ; GCN-NEXT:    v_fma_f32 v37, s4, v29, -v134
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v34, v46
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[32:33], v[96:111]
-  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v63
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v131
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v145
-  ; GCN-NEXT:    v_exp_f32_e32 v143, v16
+  ; GCN-NEXT:    v_exp_f32_e32 v141, v16
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v35
-  ; GCN-NEXT:    v_fma_f32 v63, s4, v30, -v134
+  ; GCN-NEXT:    v_fma_f32 v131, s4, v30, -v134
   ; GCN-NEXT:    v_pack_b32_f16 v17, v17, v16
   ; GCN-NEXT:    v_pack_b32_f16 v16, v21, v36
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[24:25], v[32:33], v[112:127]
@@ -933,25 +907,25 @@
   ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v28
   ; GCN-NEXT:    v_fma_f32 v32, s4, v31, -v134
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79]
-  ; GCN-NEXT:    ds_read_b128 v[20:23], v140
+  ; GCN-NEXT:    ds_read_b128 v[20:23], v57
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_exp_f32_e32 v36, v24
   ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v37
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v37, v47
-  ; GCN-NEXT:    ds_read_b128 v[28:31], v140 offset:576
+  ; GCN-NEXT:    ds_read_b128 v[28:31], v57 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[38:39], v[16:17], v[80:95]
   ; GCN-NEXT:    v_fma_f32 v38, s4, v1, -v134
-  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v63
+  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v131
   ; GCN-NEXT:    v_exp_f32_e32 v39, v24
   ; GCN-NEXT:    v_pack_b32_f16 v24, v34, v37
-  ; GCN-NEXT:    v_fma_f32 v63, s4, v2, -v134
+  ; GCN-NEXT:    v_fma_f32 v131, s4, v2, -v134
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v37, v36
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[16:17], v[96:111]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v143
-  ; GCN-NEXT:    v_exp_f32_e32 v146, v1
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v141
+  ; GCN-NEXT:    v_exp_f32_e32 v148, v1
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v33
   ; GCN-NEXT:    v_pack_b32_f16 v25, v18, v1
   ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v32
@@ -959,25 +933,25 @@
   ; GCN-NEXT:    v_fma_f32 v32, s4, v3, -v134
   ; GCN-NEXT:    v_exp_f32_e32 v34, v1
   ; GCN-NEXT:    v_perm_b32 v26, v43, v41, s8
-  ; GCN-NEXT:    v_perm_b32 v27, v59, v45, s8
+  ; GCN-NEXT:    v_perm_b32 v27, v61, v45, s8
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[24:25], v[64:79]
   ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v0
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v140 offset:1152
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v57 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[16:19], v140 offset:1728
+  ; GCN-NEXT:    ds_read_b128 v[16:19], v57 offset:1728
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mul_f32_e32 v21, 0x3fb8aa3b, v38
-  ; GCN-NEXT:    v_exp_f32_e32 v155, v20
+  ; GCN-NEXT:    v_exp_f32_e32 v150, v20
   ; GCN-NEXT:    v_perm_b32 v20, v42, v40, s8
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v40, v146
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v40, v148
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[28:29], v[24:25], v[80:95]
   ; GCN-NEXT:    v_exp_f32_e32 v38, v21
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v39
   ; GCN-NEXT:    v_fma_f32 v29, s4, v5, -v134
-  ; GCN-NEXT:    v_perm_b32 v5, v58, v44, s5
-  ; GCN-NEXT:    v_perm_b32 v21, v58, v44, s8
+  ; GCN-NEXT:    v_perm_b32 v5, v60, v44, s5
+  ; GCN-NEXT:    v_perm_b32 v21, v60, v44, s8
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
@@ -987,9 +961,9 @@
   ; GCN-NEXT:    v_perm_b32 v0, v43, v41, s5
   ; GCN-NEXT:    v_fma_f32 v41, s4, v6, -v134
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v34
-  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v63
+  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v131
   ; GCN-NEXT:    v_exp_f32_e32 v42, v1
-  ; GCN-NEXT:    v_perm_b32 v1, v59, v45, s5
+  ; GCN-NEXT:    v_perm_b32 v1, v61, v45, s5
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b64 v136, v[20:21]
@@ -1013,10 +987,10 @@
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79]
-  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v147
+  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v149
   ; GCN-NEXT:    v_exp_f32_e32 v26, v0
   ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v29
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v155
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v150
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v27, v38
   ; GCN-NEXT:    ds_read_b128 v[20:23], v139 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1068,10 +1042,10 @@
   ; GCN-NEXT:    v_exp_f32_e32 v21, v9
   ; GCN-NEXT:    v_fma_f32 v8, s4, v15, -v134
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79]
-  ; GCN-NEXT:    ds_read_b128 v[4:7], v140
+  ; GCN-NEXT:    ds_read_b128 v[4:7], v57
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[12:15], v140 offset:576
+  ; GCN-NEXT:    ds_read_b128 v[12:15], v57 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v24
@@ -1097,33 +1071,33 @@
   ; GCN-NEXT:    v_add_f32_e32 v3, v54, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v55, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v56, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v141, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v165, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v167, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v153, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v168, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v170, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v58, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v163, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v164, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v59, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v160, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v162, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v151, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v148, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v154, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v157, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v153, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v165, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v161, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v149, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v150, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v156, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v159, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v60, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v61, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v152, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v154, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v155, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v157, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v146, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v147, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v143, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v156, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v129, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v142, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v63, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v158, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v57, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v128, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v131, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v160, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v129, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v163, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v167, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v130, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v142, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v140, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v144, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v132, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v62, v3
@@ -1131,14 +1105,14 @@
   ; GCN-NEXT:    v_add_f32_e32 v3, v35, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v46, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v47, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v143, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v141, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v33, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v36, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v39, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v146, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v148, v3
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95]
   ; GCN-NEXT:    v_add_f32_e32 v3, v34, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v155, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v150, v3
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v10
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v2
   ; GCN-NEXT:    v_add_f32_e32 v3, v38, v3
@@ -1163,18 +1137,17 @@
   ; GCN-NEXT:    v_add_f32_e32 v4, v10, v0
   ; GCN-NEXT:    ds_bpermute_b32 v5, v133, v4
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v140 offset:1152
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v57 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_add_f32_e32 v2, v4, v5
   ; GCN-NEXT:    ds_bpermute_b32 v3, v133, v2
-  ; GCN-NEXT:    ; implicit-def: $vgpr4
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[8:9], v[96:111]
-  ; GCN-NEXT:    v_mov_b32_e32 v0, v4
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, v2, s[6:7]
-  ; GCN-NEXT:    v_fmac_f32_e32 v1, v0, v48
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v140 offset:1728
+  ; GCN-NEXT:    v_cndmask_b32_e64 v0, v3, v2, s[6:7]
+  ; GCN-NEXT:    ; implicit-def: $vgpr4
+  ; GCN-NEXT:    v_fmac_f32_e32 v0, v4, v48
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v57 offset:1728
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
index be97a1e82fcf2..0887fdf0844b0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
@@ -10,24 +10,25 @@
   ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
   ; GCN-NEXT:    v_readfirstlane_b32 s20, v2
   ; GCN-NEXT:    ; implicit-def: $sgpr4
-  ; GCN-NEXT:    ; implicit-def: $vgpr64
+  ; GCN-NEXT:    ; implicit-def: $vgpr3
   ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
   ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:    ; implicit-def: $vgpr76
+  ; GCN-NEXT:    ; implicit-def: $vgpr50
   ; GCN-NEXT:    ; implicit-def: $sgpr16_sgpr17_sgpr18_sgpr19
   ; GCN-NEXT:    ; implicit-def: $vgpr49
   ; GCN-NEXT:    ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43
-  ; GCN-NEXT:    ; implicit-def: $vgpr50
+  ; GCN-NEXT:    ; implicit-def: $vgpr51
+  ; GCN-NEXT:    ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65
+  ; GCN-NEXT:    ; implicit-def: $vgpr76
   ; GCN-NEXT:    ; implicit-def: $vgpr77
   ; GCN-NEXT:    ; implicit-def: $vgpr78
   ; GCN-NEXT:    ; implicit-def: $vgpr79
   ; GCN-NEXT:    ; implicit-def: $vgpr80
-  ; GCN-NEXT:    ; implicit-def: $vgpr81
-  ; GCN-NEXT:    ; implicit-def: $vgpr103
+  ; GCN-NEXT:    ; implicit-def: $vgpr91
   ; GCN-NEXT:    ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19
   ; GCN-NEXT:    ; iglp_opt mask(0x00000002)
   ; GCN-NEXT:    s_nop 1
-  ; GCN-NEXT:    v_lshl_add_u32 v2, s20, 4, v64
+  ; GCN-NEXT:    v_lshl_add_u32 v2, s20, 4, v3
   ; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s4, v2, v[0:1]
   ; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -35,9 +36,8 @@
   ; GCN-NEXT:    s_lshl_b32 s4, s20, 7
   ; GCN-NEXT:    ; implicit-def: $vgpr5
   ; GCN-NEXT:    v_add_lshl_u32 v48, v5, s4, 1
-  ; GCN-NEXT:    ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65
-  ; GCN-NEXT:    v_add_u32_e32 v77, s20, v77
-  ; GCN-NEXT:    v_and_b32_e32 v77, 0x1fffffff, v77
+  ; GCN-NEXT:    v_add_u32_e32 v76, s20, v76
+  ; GCN-NEXT:    v_and_b32_e32 v76, 0x1fffffff, v76
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    ds_write_b128 v48, v[0:3]
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -48,8 +48,8 @@
   ; GCN-NEXT:    ; implicit-def: $vgpr1
   ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
   ; GCN-NEXT:    ; implicit-def: $sgpr6
-  ; GCN-NEXT:    v_add_u32_e32 v0, v0, v76
-  ; GCN-NEXT:    v_add_u32_e32 v1, v1, v76
+  ; GCN-NEXT:    v_add_u32_e32 v0, v0, v50
+  ; GCN-NEXT:    v_add_u32_e32 v1, v1, v50
   ; GCN-NEXT:    buffer_load_dwordx2 v[72:73], v0, s[16:19], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
@@ -68,22 +68,22 @@
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], 0
   ; GCN-NEXT:    ; kill: killed $vgpr1
   ; GCN-NEXT:    ; kill: killed $vgpr0
-  ; GCN-NEXT:    v_mul_lo_u32 v77, v77, s6
-  ; GCN-NEXT:    v_add_lshl_u32 v77, v78, v77, 1
-  ; GCN-NEXT:    v_lshl_add_u32 v78, v79, 1, v77
+  ; GCN-NEXT:    v_mul_lo_u32 v76, v76, s6
+  ; GCN-NEXT:    v_add_lshl_u32 v76, v77, v76, 1
+  ; GCN-NEXT:    v_lshl_add_u32 v77, v78, 1, v76
   ; GCN-NEXT:    ; implicit-def: $sgpr5
-  ; GCN-NEXT:    v_lshl_add_u32 v79, v80, 1, v78
+  ; GCN-NEXT:    v_lshl_add_u32 v78, v79, 1, v77
   ; GCN-NEXT:    ; implicit-def: $sgpr2
   ; GCN-NEXT:    ; implicit-def: $sgpr3
-  ; GCN-NEXT:    v_lshl_add_u32 v80, v81, 1, v79
+  ; GCN-NEXT:    v_lshl_add_u32 v79, v80, 1, v78
   ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], 0
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31]
-  ; GCN-NEXT:    ds_read_b128 v[36:39], v50
+  ; GCN-NEXT:    ds_read_b128 v[36:39], v51
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15]
-  ; GCN-NEXT:    ds_read_b128 v[44:47], v50 offset:512
+  ; GCN-NEXT:    ds_read_b128 v[44:47], v51 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43
@@ -107,20 +107,20 @@
   ; GCN-NEXT:    ds_read_b128 v[40:43], v49 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[68:71], v50
+  ; GCN-NEXT:    ds_read_b128 v[68:71], v51
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[36:37], v[16:31]
   ; GCN-NEXT:    ; implicit-def: $vgpr32
   ; GCN-NEXT:    ; implicit-def: $vgpr33
-  ; GCN-NEXT:    v_add_u32_e32 v83, v32, v76
-  ; GCN-NEXT:    v_add_u32_e32 v76, v33, v76
+  ; GCN-NEXT:    v_add_u32_e32 v82, v32, v50
+  ; GCN-NEXT:    v_add_u32_e32 v83, v33, v50
+  ; GCN-NEXT:    ; kill: killed $vgpr82
   ; GCN-NEXT:    ; kill: killed $vgpr83
-  ; GCN-NEXT:    ; kill: killed $vgpr76
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[38:39], v[16:31]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[40:41], v[36:37], v[0:15]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[68:69], v[62:63], v[16:31]
-  ; GCN-NEXT:    ds_read_b128 v[66:69], v50 offset:512
+  ; GCN-NEXT:    ds_read_b128 v[66:69], v51 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
@@ -131,20 +131,20 @@
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[62:63], v[0:15]
   ; GCN-NEXT:    ; implicit-def: $vgpr66
   ; GCN-NEXT:    ; implicit-def: $vgpr67
-  ; GCN-NEXT:    v_max_f32_e32 v82, v67, v67
+  ; GCN-NEXT:    v_max_f32_e32 v81, v67, v67
   ; GCN-NEXT:    ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[70:71], v[64:65], v[16:31]
   ; GCN-NEXT:    v_perm_b32 v70, v74, v72, s2
   ; GCN-NEXT:    v_perm_b32 v71, v74, v72, s3
   ; GCN-NEXT:    v_perm_b32 v72, v75, v73, s2
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
-  ; GCN-NEXT:    ds_write_b32 v77, v70
+  ; GCN-NEXT:    ds_write_b32 v76, v70
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v78, v71
+  ; GCN-NEXT:    ds_write_b32 v77, v71
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v79, v72
+  ; GCN-NEXT:    ds_write_b32 v78, v72
   ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v20
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15]
   ; GCN-NEXT:    v_mul_f32_e32 v64, s4, v16
@@ -152,11 +152,11 @@
   ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v18
   ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v19
   ; GCN-NEXT:    v_max3_f32 v64, v64, s5, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v21
+  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v21
   ; GCN-NEXT:    v_max3_f32 v64, v64, v68, v69
   ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v22
   ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v23
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v81
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v80
   ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v24
   ; GCN-NEXT:    v_mul_f32_e32 v87, s4, v25
   ; GCN-NEXT:    v_max3_f32 v64, v64, v84, v85
@@ -166,12 +166,12 @@
   ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v28
   ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v29
   ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v68
-  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v30
+  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v30
   ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v31
   ; GCN-NEXT:    v_max3_f32 v64, v64, v69, v74
   ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v0
   ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v1
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v81, v84
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v80, v84
   ; GCN-NEXT:    v_mul_f32_e32 v87, s4, v2
   ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v3
   ; GCN-NEXT:    v_max3_f32 v64, v64, v85, v86
@@ -179,315 +179,315 @@
   ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v5
   ; GCN-NEXT:    v_max3_f32 v64, v64, v87, v65
   ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v6
-  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v7
+  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v7
   ; GCN-NEXT:    v_max3_f32 v64, v64, v68, v69
   ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v8
   ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v9
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v81
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v80
   ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v10
   ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v11
   ; GCN-NEXT:    v_max3_f32 v64, v64, v84, v85
   ; GCN-NEXT:    v_mul_f32_e32 v87, s4, v12
   ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v13
   ; GCN-NEXT:    v_max3_f32 v64, v64, v86, v65
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v87, v68
-  ; GCN-NEXT:    v_perm_b32 v68, v75, v73, s3
   ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v14
   ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v15
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v87, v68
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v69, v74
+  ; GCN-NEXT:    ds_bpermute_b32 v65, v66, v64
+  ; GCN-NEXT:    v_perm_b32 v68, v75, v73, s3
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v80, v68
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v69, v74
+  ; GCN-NEXT:    ds_write_b32 v79, v68
+  ; GCN-NEXT:    ; implicit-def: $vgpr84
+  ; GCN-NEXT:    v_max_f32_e32 v65, v65, v65
+  ; GCN-NEXT:    v_max_f32_e32 v70, v64, v65
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[64:65], v82, s[16:19], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx2 v[70:71], v76, s[16:19], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_bpermute_b32 v65, v66, v64
+  ; GCN-NEXT:    ds_bpermute_b32 v71, v66, v70
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    ; implicit-def: $vgpr87
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_max_f32_e32 v65, v65, v65
-  ; GCN-NEXT:    v_max_f32_e32 v64, v64, v65
-  ; GCN-NEXT:    ds_bpermute_b32 v65, v66, v64
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_cndmask_b32_e64 v64, v65, v64, s[0:1]
-  ; GCN-NEXT:    v_max_f32_e32 v64, v64, v64
-  ; GCN-NEXT:    v_max_f32_e32 v65, v82, v64
-  ; GCN-NEXT:    v_fma_f32 v16, s4, v16, -v65
-  ; GCN-NEXT:    v_fma_f32 v17, s4, v17, -v65
-  ; GCN-NEXT:    v_fma_f32 v18, s4, v18, -v65
-  ; GCN-NEXT:    v_fma_f32 v19, s4, v19, -v65
+  ; GCN-NEXT:    v_cndmask_b32_e64 v70, v71, v70, s[0:1]
+  ; GCN-NEXT:    v_max_f32_e32 v70, v70, v70
+  ; GCN-NEXT:    v_max_f32_e32 v72, v81, v70
+  ; GCN-NEXT:    v_fma_f32 v16, s4, v16, -v72
+  ; GCN-NEXT:    v_fma_f32 v18, s4, v18, -v72
+  ; GCN-NEXT:    v_fma_f32 v19, s4, v19, -v72
   ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v16
-  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v17
   ; GCN-NEXT:    v_mul_f32_e32 v18, 0x3fb8aa3b, v18
   ; GCN-NEXT:    v_mul_f32_e32 v19, 0x3fb8aa3b, v19
-  ; GCN-NEXT:    v_fma_f32 v20, s4, v20, -v65
-  ; GCN-NEXT:    v_fma_f32 v21, s4, v21, -v65
-  ; GCN-NEXT:    v_fma_f32 v22, s4, v22, -v65
-  ; GCN-NEXT:    v_fma_f32 v23, s4, v23, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v72, v16
-  ; GCN-NEXT:    v_exp_f32_e32 v73, v17
-  ; GCN-NEXT:    v_exp_f32_e32 v81, v18
-  ; GCN-NEXT:    v_exp_f32_e32 v82, v19
+  ; GCN-NEXT:    v_fma_f32 v17, s4, v17, -v72
+  ; GCN-NEXT:    v_fma_f32 v20, s4, v20, -v72
+  ; GCN-NEXT:    v_fma_f32 v21, s4, v21, -v72
+  ; GCN-NEXT:    v_fma_f32 v22, s4, v22, -v72
+  ; GCN-NEXT:    v_fma_f32 v23, s4, v23, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v73, v16
+  ; GCN-NEXT:    v_exp_f32_e32 v74, v18
+  ; GCN-NEXT:    v_exp_f32_e32 v75, v19
   ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v20
   ; GCN-NEXT:    v_mul_f32_e32 v21, 0x3fb8aa3b, v21
   ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v22
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v72
-  ; GCN-NEXT:    v_fma_f32 v17, s4, v24, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v83, v20
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v73
-  ; GCN-NEXT:    v_fma_f32 v19, s4, v25, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v84, v21
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v81
-  ; GCN-NEXT:    v_fma_f32 v26, s4, v26, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v85, v22
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v82
-  ; GCN-NEXT:    v_pack_b32_f16 v24, v16, v18
-  ; GCN-NEXT:    v_sub_f32_e32 v22, v67, v65
+  ; GCN-NEXT:    v_exp_f32_e32 v80, v20
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v73
+  ; GCN-NEXT:    v_fma_f32 v18, s4, v24, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v81, v21
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v74
+  ; GCN-NEXT:    v_fma_f32 v20, s4, v25, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v82, v22
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v75
+  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v17
   ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
-  ; GCN-NEXT:    v_pack_b32_f16 v25, v20, v21
-  ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v17
-  ; GCN-NEXT:    v_mul_f32_e32 v21, 0x3fb8aa3b, v19
-  ; GCN-NEXT:    ds_read_b128 v[16:19], v87
+  ; GCN-NEXT:    v_fma_f32 v26, s4, v26, -v72
+  ; GCN-NEXT:    v_pack_b32_f16 v71, v21, v22
+  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v18
+  ; GCN-NEXT:    v_sub_f32_e32 v24, v67, v72
+  ; GCN-NEXT:    v_exp_f32_e32 v83, v23
+  ; GCN-NEXT:    v_fma_f32 v67, s4, v27, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v85, v22
+  ; GCN-NEXT:    v_exp_f32_e32 v17, v17
+  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v24
+  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v20
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v17
+  ; GCN-NEXT:    v_fma_f32 v87, s4, v29, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v88, v23
+  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v72
+  ; GCN-NEXT:    v_pack_b32_f16 v70, v16, v19
+  ; GCN-NEXT:    ds_read_b128 v[18:21], v84
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v22
-  ; GCN-NEXT:    v_fma_f32 v67, s4, v27, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v86, v23
-  ; GCN-NEXT:    v_exp_f32_e32 v64, v22
-  ; GCN-NEXT:    v_mul_f32_e32 v67, 0x3fb8aa3b, v67
-  ; GCN-NEXT:    v_pk_mul_f32 v[48:49], v[48:49], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[50:51], v[50:51], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[52:53], v[52:53], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[54:55], v[54:55], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[56:57], v[56:57], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[58:59], v[58:59], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[60:61], v[60:61], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[62:63], v[62:63], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[32:33], v[32:33], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[34:35], v[34:35], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[16:17], v[24:25], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v16, 0, v72
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v76, v83
-  ; GCN-NEXT:    v_fma_f32 v88, s4, v28, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v89, v20
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v90, v84
-  ; GCN-NEXT:    v_fma_f32 v91, s4, v29, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v92, v21
-  ; GCN-NEXT:    ds_read_b128 v[20:23], v87 offset:576
+  ; GCN-NEXT:    v_exp_f32_e32 v16, v24
+  ; GCN-NEXT:    ds_read_b128 v[22:25], v84 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_pk_mul_f32 v[36:37], v[36:37], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[38:39], v[38:39], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[40:41], v[40:41], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[42:43], v[42:43], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[44:45], v[44:45], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[46:47], v[46:47], v[64:65] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_perm_b32 v99, v70, v68, s2
-  ; GCN-NEXT:    v_perm_b32 v100, v70, v68, s3
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[24:25], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v93, v73, v16
-  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v26
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v94, v85
-  ; GCN-NEXT:    v_fma_f32 v95, s4, v30, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v96, v16
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v97, v86
-  ; GCN-NEXT:    v_fma_f32 v98, s4, v31, -v65
-  ; GCN-NEXT:    v_perm_b32 v101, v71, v69, s2
-  ; GCN-NEXT:    v_perm_b32 v102, v71, v69, s3
-  ; GCN-NEXT:    ds_read_b128 v[68:71], v103
+  ; GCN-NEXT:    v_pk_mul_f32 v[48:49], v[48:49], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[50:51], v[50:51], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[52:53], v[52:53], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[54:55], v[54:55], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[56:57], v[56:57], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[58:59], v[58:59], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[60:61], v[60:61], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[62:63], v[62:63], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[32:33], v[32:33], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[34:35], v[34:35], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[36:37], v[36:37], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[38:39], v[38:39], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[40:41], v[40:41], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[42:43], v[42:43], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[44:45], v[44:45], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[46:47], v[46:47], v[16:17] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v18, 0, v73
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v89, v83
+  ; GCN-NEXT:    v_fma_f32 v73, s4, v28, -v72
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v80
+  ; GCN-NEXT:    v_fma_f32 v1, s4, v1, -v72
+  ; GCN-NEXT:    v_perm_b32 v90, v69, v65, s2
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v17, v17, v18
+  ; GCN-NEXT:    v_mul_f32_e32 v18, 0x3fb8aa3b, v26
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v86, v81
+  ; GCN-NEXT:    v_fma_f32 v23, s4, v30, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v30, v18
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v82
+  ; GCN-NEXT:    v_fma_f32 v18, s4, v31, -v72
+  ; GCN-NEXT:    v_perm_b32 v31, v68, v64, s2
+  ; GCN-NEXT:    v_perm_b32 v64, v68, v64, s3
+  ; GCN-NEXT:    v_perm_b32 v65, v69, v65, s3
+  ; GCN-NEXT:    ds_read_b128 v[26:29], v91
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[72:75], v103 offset:576
+  ; GCN-NEXT:    ds_read_b128 v[68:71], v91 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
-  ; GCN-NEXT:    ds_write_b32 v77, v99
-  ; GCN-NEXT:    v_exp_f32_e32 v67, v67
-  ; GCN-NEXT:    v_pack_b32_f16 v76, v76, v90
-  ; GCN-NEXT:    v_pack_b32_f16 v77, v94, v97
+  ; GCN-NEXT:    ds_write_b32 v76, v31
+  ; GCN-NEXT:    v_mul_f32_e32 v31, 0x3fb8aa3b, v67
+  ; GCN-NEXT:    v_exp_f32_e32 v31, v31
+  ; GCN-NEXT:    v_mul_f32_e32 v67, 0x3fb8aa3b, v18
+  ; GCN-NEXT:    v_pack_b32_f16 v18, v19, v86
+  ; GCN-NEXT:    v_pack_b32_f16 v19, v22, v89
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v78, v100
+  ; GCN-NEXT:    ds_write_b32 v77, v64
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v79, v101
-  ; GCN-NEXT:    v_mul_f32_e32 v78, 0x3fb8aa3b, v88
-  ; GCN-NEXT:    v_mul_f32_e32 v79, 0x3fb8aa3b, v91
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[76:77], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v81, v81, v93
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v90, v89
-  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v91, v78
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v78, v92
-  ; GCN-NEXT:    v_fma_f32 v1, s4, v1, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v93, v79
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[76:77], v[32:47]
+  ; GCN-NEXT:    ds_write_b32 v78, v90
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v80, v102
-  ; GCN-NEXT:    v_mul_f32_e32 v80, 0x3fb8aa3b, v95
-  ; GCN-NEXT:    v_add_f32_e32 v76, v82, v81
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v77, v96
-  ; GCN-NEXT:    v_fma_f32 v2, s4, v2, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v80, v80
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v79, v67
-  ; GCN-NEXT:    v_mul_f32_e32 v88, 0x3fb8aa3b, v98
-  ; GCN-NEXT:    v_fma_f32 v81, s4, v3, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v82, v88
+  ; GCN-NEXT:    ds_write_b32 v79, v65
+  ; GCN-NEXT:    v_mul_f32_e32 v64, 0x3fb8aa3b, v73
+  ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v87
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v17, v74, v17
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v85
+  ; GCN-NEXT:    v_fma_f32 v2, s4, v2, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v22, v64
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v88
+  ; GCN-NEXT:    v_exp_f32_e32 v64, v65
+  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v17, v75, v17
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v30
+  ; GCN-NEXT:    v_fma_f32 v24, s4, v3, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v23, v23
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v31
   ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v0
-  ; GCN-NEXT:    v_mul_f32_e32 v88, 0x3fb8aa3b, v1
-  ; GCN-NEXT:    v_pack_b32_f16 v0, v90, v78
-  ; GCN-NEXT:    v_pack_b32_f16 v1, v77, v79
+  ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v1
+  ; GCN-NEXT:    v_pack_b32_f16 v0, v20, v21
+  ; GCN-NEXT:    v_pack_b32_f16 v1, v18, v19
+  ; GCN-NEXT:    v_fma_f32 v6, s4, v6, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v25, v67
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[26:27], v[0:1], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v17, v80, v17
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v22
+  ; GCN-NEXT:    v_fma_f32 v26, s4, v4, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v27, v3
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v64
+  ; GCN-NEXT:    v_fma_f32 v67, s4, v5, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v65, v65
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[0:1], v[32:47]
   ; GCN-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+  ; GCN-NEXT:    v_add_f32_e32 v17, v81, v17
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v23
+  ; GCN-NEXT:    v_fma_f32 v7, s4, v7, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v68, v2
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v25
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    ; implicit-def: $sgpr2
-  ; GCN-NEXT:    s_nop 0
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[68:69], v[0:1], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v68, v83, v76
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v91
-  ; GCN-NEXT:    v_fma_f32 v83, s4, v4, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v90, v3
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v93
-  ; GCN-NEXT:    v_fma_f32 v94, s4, v5, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v88, v88
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[0:1], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v68, v84, v68
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v80
-  ; GCN-NEXT:    v_fma_f32 v6, s4, v6, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v72, v2
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v73, v82
-  ; GCN-NEXT:    v_pack_b32_f16 v4, v69, v4
-  ; GCN-NEXT:    v_mul_f32_e32 v69, 0x3fb8aa3b, v81
+  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v24
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v87
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v84
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v73
-  ; GCN-NEXT:    v_fma_f32 v7, s4, v7, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v73, v69
-  ; GCN-NEXT:    ds_read_b128 v[76:79], v87 offset:576
+  ; GCN-NEXT:    v_pack_b32_f16 v4, v18, v4
+  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v19
+  ; GCN-NEXT:    v_exp_f32_e32 v24, v24
+  ; GCN-NEXT:    ds_read_b128 v[18:21], v84 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mul_f32_e32 v69, 0x3fb8aa3b, v83
-  ; GCN-NEXT:    v_mul_f32_e32 v81, 0x3fb8aa3b, v94
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[70:71], v[4:5], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v68, v85, v68
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v70, v90
-  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v71, v69
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v88
-  ; GCN-NEXT:    v_fma_f32 v9, s4, v9, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v81, v81
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[4:5], v[32:47]
+  ; GCN-NEXT:    v_mul_f32_e32 v26, 0x3fb8aa3b, v26
+  ; GCN-NEXT:    v_mul_f32_e32 v67, 0x3fb8aa3b, v67
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[4:5], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v17, v82, v17
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v27
+  ; GCN-NEXT:    v_exp_f32_e32 v26, v26
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v29, v65
+  ; GCN-NEXT:    v_fma_f32 v10, s4, v10, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v67, v67
   ; GCN-NEXT:    v_mul_f32_e32 v6, 0x3fb8aa3b, v6
-  ; GCN-NEXT:    v_add_f32_e32 v68, v86, v68
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v72
-  ; GCN-NEXT:    v_fma_f32 v10, s4, v10, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v74, v6
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v73
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[4:5], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v17, v83, v17
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v68
+  ; GCN-NEXT:    v_exp_f32_e32 v6, v6
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v24
   ; GCN-NEXT:    v_mul_f32_e32 v7, 0x3fb8aa3b, v7
-  ; GCN-NEXT:    v_fma_f32 v75, s4, v11, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v83, v7
-  ; GCN-NEXT:    v_pack_b32_f16 v4, v70, v69
-  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v6
-  ; GCN-NEXT:    v_mul_f32_e32 v7, 0x3fb8aa3b, v8
-  ; GCN-NEXT:    v_mul_f32_e32 v8, 0x3fb8aa3b, v9
+  ; GCN-NEXT:    v_exp_f32_e32 v7, v7
+  ; GCN-NEXT:    v_pack_b32_f16 v4, v28, v29
+  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v69
+  ; GCN-NEXT:    ; implicit-def: $sgpr2
+  ; GCN-NEXT:    s_nop 1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[4:5], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v0, v89, v68
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v68, v71
-  ; GCN-NEXT:    v_fma_f32 v70, s4, v12, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v84, v7
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v85, v81
-  ; GCN-NEXT:    v_fma_f32 v86, s4, v13, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v87, v8
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[76:77], v[4:5], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v76, v92, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v85, v17
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v26
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v67
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[18:19], v[4:5], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v4, v88, v0
   ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v10
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v74
-  ; GCN-NEXT:    v_fma_f32 v77, s4, v14, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v89, v0
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v92, v83
-  ; GCN-NEXT:    v_pack_b32_f16 v68, v68, v85
-  ; GCN-NEXT:    v_mul_f32_e32 v75, 0x3fb8aa3b, v75
-  ; GCN-NEXT:    v_mul_f32_e32 v70, 0x3fb8aa3b, v70
-  ; GCN-NEXT:    v_pack_b32_f16 v69, v69, v92
-  ; GCN-NEXT:    v_fma_f32 v65, s4, v15, -v65
-  ; GCN-NEXT:    v_exp_f32_e32 v75, v75
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[68:69], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v76, v96, v76
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v85, v84
-  ; GCN-NEXT:    v_exp_f32_e32 v92, v70
-  ; GCN-NEXT:    v_mul_f32_e32 v70, 0x3fb8aa3b, v86
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v86, v87
-  ; GCN-NEXT:    v_exp_f32_e32 v94, v70
-  ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v65
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[78:79], v[68:69], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v67, v67, v76
-  ; GCN-NEXT:    v_add_f32_e32 v67, v91, v67
-  ; GCN-NEXT:    v_add_f32_e32 v67, v93, v67
-  ; GCN-NEXT:    v_add_f32_e32 v67, v80, v67
-  ; GCN-NEXT:    v_add_f32_e32 v67, v82, v67
-  ; GCN-NEXT:    v_add_f32_e32 v67, v90, v67
-  ; GCN-NEXT:    v_add_f32_e32 v67, v88, v67
-  ; GCN-NEXT:    v_add_f32_e32 v67, v72, v67
-  ; GCN-NEXT:    v_mul_f32_e32 v68, 0x3fb8aa3b, v77
-  ; GCN-NEXT:    v_add_f32_e32 v67, v73, v67
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v76, v89
-  ; GCN-NEXT:    v_exp_f32_e32 v78, v68
-  ; GCN-NEXT:    v_add_f32_e32 v67, v71, v67
-  ; GCN-NEXT:    ds_read_b128 v[68:71], v103
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v6
+  ; GCN-NEXT:    v_exp_f32_e32 v10, v0
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v7
+  ; GCN-NEXT:    v_pack_b32_f16 v1, v1, v0
+  ; GCN-NEXT:    v_pack_b32_f16 v0, v17, v28
+  ; GCN-NEXT:    s_nop 1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v2, v30, v4
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[0:1], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v0, v31, v2
+  ; GCN-NEXT:    v_add_f32_e32 v0, v22, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v64, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v23, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v25, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v27, v0
+  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v72
+  ; GCN-NEXT:    v_add_f32_e32 v0, v65, v0
+  ; GCN-NEXT:    v_fma_f32 v9, s4, v9, -v72
+  ; GCN-NEXT:    v_mul_f32_e32 v8, 0x3fb8aa3b, v8
+  ; GCN-NEXT:    v_add_f32_e32 v0, v68, v0
+  ; GCN-NEXT:    v_fma_f32 v11, s4, v11, -v72
+  ; GCN-NEXT:    v_mul_f32_e32 v9, 0x3fb8aa3b, v9
+  ; GCN-NEXT:    v_fma_f32 v12, s4, v12, -v72
+  ; GCN-NEXT:    v_fma_f32 v13, s4, v13, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v8, v8
+  ; GCN-NEXT:    v_add_f32_e32 v0, v24, v0
+  ; GCN-NEXT:    v_fma_f32 v5, s4, v14, -v72
+  ; GCN-NEXT:    v_exp_f32_e32 v9, v9
+  ; GCN-NEXT:    v_add_f32_e32 v0, v26, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v67, v0
+  ; GCN-NEXT:    v_fma_f32 v14, s4, v15, -v72
+  ; GCN-NEXT:    v_mul_f32_e32 v11, 0x3fb8aa3b, v11
+  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v12
+  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v5
+  ; GCN-NEXT:    v_add_f32_e32 v0, v6, v0
+  ; GCN-NEXT:    v_exp_f32_e32 v11, v11
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v8
+  ; GCN-NEXT:    v_exp_f32_e32 v12, v3
+  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v13
+  ; GCN-NEXT:    v_exp_f32_e32 v17, v1
+  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v14
+  ; GCN-NEXT:    v_add_f32_e32 v0, v7, v0
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v9
+  ; GCN-NEXT:    v_exp_f32_e32 v15, v3
+  ; GCN-NEXT:    v_exp_f32_e32 v18, v1
+  ; GCN-NEXT:    v_add_f32_e32 v6, v8, v0
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v91
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v77, v75
-  ; GCN-NEXT:    v_exp_f32_e32 v65, v65
-  ; GCN-NEXT:    v_add_f32_e32 v67, v81, v67
-  ; GCN-NEXT:    v_add_f32_e32 v67, v74, v67
-  ; GCN-NEXT:    v_pack_b32_f16 v77, v76, v77
-  ; GCN-NEXT:    v_pack_b32_f16 v76, v85, v86
-  ; GCN-NEXT:    v_add_f32_e32 v67, v83, v67
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v72, v65
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v73, v94
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[68:69], v[76:77], v[48:63]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v68, v78
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v74, v92
-  ; GCN-NEXT:    v_add_f32_e32 v67, v84, v67
-  ; GCN-NEXT:    v_add_f32_e32 v67, v87, v67
-  ; GCN-NEXT:    v_add_f32_e32 v67, v89, v67
-  ; GCN-NEXT:    v_add_f32_e32 v67, v75, v67
-  ; GCN-NEXT:    v_pack_b32_f16 v69, v68, v72
-  ; GCN-NEXT:    v_pack_b32_f16 v68, v74, v73
-  ; GCN-NEXT:    ds_read_b128 v[72:75], v103 offset:576
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v10
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v11
+  ; GCN-NEXT:    v_add_f32_e32 v6, v9, v6
+  ; GCN-NEXT:    v_pack_b32_f16 v8, v4, v13
+  ; GCN-NEXT:    v_add_f32_e32 v6, v10, v6
+  ; GCN-NEXT:    v_pack_b32_f16 v9, v5, v14
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v18
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v15
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v17
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v12
+  ; GCN-NEXT:    v_add_f32_e32 v6, v11, v6
+  ; GCN-NEXT:    v_add_f32_e32 v6, v12, v6
+  ; GCN-NEXT:    v_add_f32_e32 v1, v15, v6
+  ; GCN-NEXT:    v_add_f32_e32 v11, v17, v1
+  ; GCN-NEXT:    v_pack_b32_f16 v1, v0, v7
+  ; GCN-NEXT:    v_pack_b32_f16 v0, v4, v10
+  ; GCN-NEXT:    ds_read_b128 v[4:7], v91 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_add_f32_e32 v67, v92, v67
-  ; GCN-NEXT:    v_add_f32_e32 v67, v94, v67
-  ; GCN-NEXT:    v_add_f32_e32 v67, v78, v67
-  ; GCN-NEXT:    v_add_f32_e32 v65, v65, v67
-  ; GCN-NEXT:    ds_bpermute_b32 v67, v66, v65
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_add_f32_e32 v65, v65, v67
-  ; GCN-NEXT:    ds_bpermute_b32 v66, v66, v65
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], v[32:47]
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_mov_b32_e32 v67, 0
+  ; GCN-NEXT:    v_mov_b32_e32 v4, 0
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v2, v18, v11
+  ; GCN-NEXT:    ds_bpermute_b32 v3, v66, v2
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    v_add_f32_e32 v2, v2, v3
+  ; GCN-NEXT:    ds_bpermute_b32 v3, v66, v2
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_cndmask_b32_e64 v65, v66, v65, s[0:1]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[70:71], v[68:69], v[48:63]
-  ; GCN-NEXT:    v_fmac_f32_e32 v65, v67, v64
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[68:69], v[32:47]
+  ; GCN-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[0:1]
+  ; GCN-NEXT:    v_fmac_f32_e32 v2, v4, v16
   ; GCN-NEXT:    s_endpgm
   attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index c48f3ee00130a..5ab8706f28f5f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -427,37 +427,37 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0
 ; GFX90A-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k:
 ; GFX90A-VGPR:       ; %bb.0: ; %bb
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v9, 0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 1
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v9
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v8, 2
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, 1
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v5
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 2
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    v_mfma_f32_4x4x4bf16_1k v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT:    v_mfma_f32_4x4x4bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX90A-VGPR-NEXT:    s_nop 4
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, 0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v9
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 2
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_bf16 v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 4
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -647,10 +647,10 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
 ; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k:
 ; GFX90A-VGPR:       ; %bb.0: ; %bb
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v9, 0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 1
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v9
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v8, 2
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, 1
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v5
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 2
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
@@ -665,19 +665,19 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
 ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, 0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v9
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 2
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_bf16 v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 6
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -1298,26 +1298,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v1, 64
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[16:17], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v1
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[18:19], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[12:13], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v1
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3
 ; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits:
@@ -1326,26 +1326,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 64
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[16:17], s[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v1
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[18:19], s[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], s[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0]
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (i64 274877906944 to double)), i32 0, i32 0, i32 0)
@@ -1627,26 +1627,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, 0x3ff00000
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v18, s2
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v19, s3
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v12, s2
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v13, s3
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v0
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[16:17], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[12:13], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v1, v0
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9]
 ; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_imm:
@@ -1655,26 +1655,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, 0x3ff00000
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v18, s2
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v19, s3
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v12, s2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v13, s3
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v0
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[16:17], s[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0)
@@ -1741,26 +1741,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v1, 0x405ec000
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v18, s2
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v19, s3
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v12, s2
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v13, s3
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v1
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[16:17], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[12:13], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v1
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
 ; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9]
 ; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
@@ -1769,26 +1769,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 0x405ec000
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v18, s2
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v19, s3
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v12, s2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v13, s3
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v1
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[16:17], s[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 123.0, double 123.0, double 123.0, double 123.0>, i32 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
index e11050ccce746..033a35f69a0bd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
@@ -269,27 +269,28 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg
 ; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31]
 ; GCN-NEXT:    v_mov_b32_e32 v42, s22
 ; GCN-NEXT:    v_mov_b32_e32 v43, s23
-; GCN-NEXT:    v_mov_b32_e32 v32, s16
-; GCN-NEXT:    v_mov_b32_e32 v33, s17
-; GCN-NEXT:    v_mov_b32_e32 v34, s18
-; GCN-NEXT:    v_mov_b32_e32 v35, s19
 ; GCN-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    v_mov_b32_e32 v16, s16
+; GCN-NEXT:    v_mov_b32_e32 v17, s17
+; GCN-NEXT:    v_mov_b32_e32 v18, s18
+; GCN-NEXT:    v_mov_b32_e32 v19, s19
+; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mov_b32_e32 v32, s12
-; GCN-NEXT:    v_mov_b32_e32 v33, s13
-; GCN-NEXT:    v_mov_b32_e32 v34, s14
-; GCN-NEXT:    v_mov_b32_e32 v35, s15
-; GCN-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT:    v_mov_b32_e32 v16, s12
+; GCN-NEXT:    v_mov_b32_e32 v17, s13
+; GCN-NEXT:    v_mov_b32_e32 v18, s14
+; GCN-NEXT:    v_mov_b32_e32 v19, s15
+; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mov_b32_e32 v32, s8
-; GCN-NEXT:    v_mov_b32_e32 v33, s9
-; GCN-NEXT:    v_mov_b32_e32 v34, s10
-; GCN-NEXT:    v_mov_b32_e32 v35, s11
-; GCN-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
+; GCN-NEXT:    v_mov_b32_e32 v16, s8
+; GCN-NEXT:    v_mov_b32_e32 v17, s9
+; GCN-NEXT:    v_mov_b32_e32 v18, s10
+; GCN-NEXT:    v_mov_b32_e32 v19, s11
+; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -331,27 +332,28 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa
 ; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
 ; GCN-NEXT:    v_mov_b32_e32 v42, s22
 ; GCN-NEXT:    v_mov_b32_e32 v43, s23
-; GCN-NEXT:    v_mov_b32_e32 v32, s16
-; GCN-NEXT:    v_mov_b32_e32 v33, s17
-; GCN-NEXT:    v_mov_b32_e32 v34, s18
-; GCN-NEXT:    v_mov_b32_e32 v35, s19
 ; GCN-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT:    s_nop 2
+; GCN-NEXT:    v_mov_b32_e32 v16, s16
+; GCN-NEXT:    v_mov_b32_e32 v17, s17
+; GCN-NEXT:    v_mov_b32_e32 v18, s18
+; GCN-NEXT:    v_mov_b32_e32 v19, s19
+; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mov_b32_e32 v32, s12
-; GCN-NEXT:    v_mov_b32_e32 v33, s13
-; GCN-NEXT:    v_mov_b32_e32 v34, s14
-; GCN-NEXT:    v_mov_b32_e32 v35, s15
-; GCN-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT:    v_mov_b32_e32 v16, s12
+; GCN-NEXT:    v_mov_b32_e32 v17, s13
+; GCN-NEXT:    v_mov_b32_e32 v18, s14
+; GCN-NEXT:    v_mov_b32_e32 v19, s15
+; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mov_b32_e32 v32, s8
-; GCN-NEXT:    v_mov_b32_e32 v33, s9
-; GCN-NEXT:    v_mov_b32_e32 v34, s10
-; GCN-NEXT:    v_mov_b32_e32 v35, s11
-; GCN-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
+; GCN-NEXT:    v_mov_b32_e32 v16, s8
+; GCN-NEXT:    v_mov_b32_e32 v17, s9
+; GCN-NEXT:    v_mov_b32_e32 v18, s10
+; GCN-NEXT:    v_mov_b32_e32 v19, s11
+; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index ebab4891d7da6..753206206180a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -1508,27 +1508,28 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
 ; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
 ; SDAG-NEXT:    v_mov_b32_e32 v42, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v43, s23
-; SDAG-NEXT:    v_mov_b32_e32 v32, s16
-; SDAG-NEXT:    v_mov_b32_e32 v33, s17
-; SDAG-NEXT:    v_mov_b32_e32 v34, s18
-; SDAG-NEXT:    v_mov_b32_e32 v35, s19
 ; SDAG-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    v_mov_b32_e32 v16, s16
+; SDAG-NEXT:    v_mov_b32_e32 v17, s17
+; SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v32, s12
-; SDAG-NEXT:    v_mov_b32_e32 v33, s13
-; SDAG-NEXT:    v_mov_b32_e32 v34, s14
-; SDAG-NEXT:    v_mov_b32_e32 v35, s15
-; SDAG-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v16, s12
+; SDAG-NEXT:    v_mov_b32_e32 v17, s13
+; SDAG-NEXT:    v_mov_b32_e32 v18, s14
+; SDAG-NEXT:    v_mov_b32_e32 v19, s15
+; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v32, s8
-; SDAG-NEXT:    v_mov_b32_e32 v33, s9
-; SDAG-NEXT:    v_mov_b32_e32 v34, s10
-; SDAG-NEXT:    v_mov_b32_e32 v35, s11
-; SDAG-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v16, s8
+; SDAG-NEXT:    v_mov_b32_e32 v17, s9
+; SDAG-NEXT:    v_mov_b32_e32 v18, s10
+; SDAG-NEXT:    v_mov_b32_e32 v19, s11
+; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -1610,27 +1611,28 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
 ; HEURRC-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
 ; HEURRC-NEXT:    v_mov_b32_e32 v42, s22
 ; HEURRC-NEXT:    v_mov_b32_e32 v43, s23
-; HEURRC-NEXT:    v_mov_b32_e32 v32, s16
-; HEURRC-NEXT:    v_mov_b32_e32 v33, s17
-; HEURRC-NEXT:    v_mov_b32_e32 v34, s18
-; HEURRC-NEXT:    v_mov_b32_e32 v35, s19
 ; HEURRC-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT:    s_nop 2
+; HEURRC-NEXT:    v_mov_b32_e32 v16, s16
+; HEURRC-NEXT:    v_mov_b32_e32 v17, s17
+; HEURRC-NEXT:    v_mov_b32_e32 v18, s18
+; HEURRC-NEXT:    v_mov_b32_e32 v19, s19
+; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v32, s12
-; HEURRC-NEXT:    v_mov_b32_e32 v33, s13
-; HEURRC-NEXT:    v_mov_b32_e32 v34, s14
-; HEURRC-NEXT:    v_mov_b32_e32 v35, s15
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v16, s12
+; HEURRC-NEXT:    v_mov_b32_e32 v17, s13
+; HEURRC-NEXT:    v_mov_b32_e32 v18, s14
+; HEURRC-NEXT:    v_mov_b32_e32 v19, s15
+; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v32, s8
-; HEURRC-NEXT:    v_mov_b32_e32 v33, s9
-; HEURRC-NEXT:    v_mov_b32_e32 v34, s10
-; HEURRC-NEXT:    v_mov_b32_e32 v35, s11
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v16, s8
+; HEURRC-NEXT:    v_mov_b32_e32 v17, s9
+; HEURRC-NEXT:    v_mov_b32_e32 v18, s10
+; HEURRC-NEXT:    v_mov_b32_e32 v19, s11
+; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
@@ -1666,27 +1668,28 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
 ; VGPRRC-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
 ; VGPRRC-NEXT:    v_mov_b32_e32 v42, s22
 ; VGPRRC-NEXT:    v_mov_b32_e32 v43, s23
-; VGPRRC-NEXT:    v_mov_b32_e32 v32, s16
-; VGPRRC-NEXT:    v_mov_b32_e32 v33, s17
-; VGPRRC-NEXT:    v_mov_b32_e32 v34, s18
-; VGPRRC-NEXT:    v_mov_b32_e32 v35, s19
 ; VGPRRC-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT:    s_nop 2
+; VGPRRC-NEXT:    v_mov_b32_e32 v16, s16
+; VGPRRC-NEXT:    v_mov_b32_e32 v17, s17
+; VGPRRC-NEXT:    v_mov_b32_e32 v18, s18
+; VGPRRC-NEXT:    v_mov_b32_e32 v19, s19
+; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v32, s12
-; VGPRRC-NEXT:    v_mov_b32_e32 v33, s13
-; VGPRRC-NEXT:    v_mov_b32_e32 v34, s14
-; VGPRRC-NEXT:    v_mov_b32_e32 v35, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v16, s12
+; VGPRRC-NEXT:    v_mov_b32_e32 v17, s13
+; VGPRRC-NEXT:    v_mov_b32_e32 v18, s14
+; VGPRRC-NEXT:    v_mov_b32_e32 v19, s15
+; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v32, s8
-; VGPRRC-NEXT:    v_mov_b32_e32 v33, s9
-; VGPRRC-NEXT:    v_mov_b32_e32 v34, s10
-; VGPRRC-NEXT:    v_mov_b32_e32 v35, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v16, s8
+; VGPRRC-NEXT:    v_mov_b32_e32 v17, s9
+; VGPRRC-NEXT:    v_mov_b32_e32 v18, s10
+; VGPRRC-NEXT:    v_mov_b32_e32 v19, s11
+; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
@@ -1847,27 +1850,28 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
 ; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
 ; SDAG-NEXT:    v_mov_b32_e32 v42, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v43, s23
-; SDAG-NEXT:    v_mov_b32_e32 v32, s16
-; SDAG-NEXT:    v_mov_b32_e32 v33, s17
-; SDAG-NEXT:    v_mov_b32_e32 v34, s18
-; SDAG-NEXT:    v_mov_b32_e32 v35, s19
 ; SDAG-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT:    s_nop 2
+; SDAG-NEXT:    v_mov_b32_e32 v16, s16
+; SDAG-NEXT:    v_mov_b32_e32 v17, s17
+; SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v32, s12
-; SDAG-NEXT:    v_mov_b32_e32 v33, s13
-; SDAG-NEXT:    v_mov_b32_e32 v34, s14
-; SDAG-NEXT:    v_mov_b32_e32 v35, s15
-; SDAG-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v16, s12
+; SDAG-NEXT:    v_mov_b32_e32 v17, s13
+; SDAG-NEXT:    v_mov_b32_e32 v18, s14
+; SDAG-NEXT:    v_mov_b32_e32 v19, s15
+; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v32, s8
-; SDAG-NEXT:    v_mov_b32_e32 v33, s9
-; SDAG-NEXT:    v_mov_b32_e32 v34, s10
-; SDAG-NEXT:    v_mov_b32_e32 v35, s11
-; SDAG-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v16, s8
+; SDAG-NEXT:    v_mov_b32_e32 v17, s9
+; SDAG-NEXT:    v_mov_b32_e32 v18, s10
+; SDAG-NEXT:    v_mov_b32_e32 v19, s11
+; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -1949,27 +1953,28 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
 ; HEURRC-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
 ; HEURRC-NEXT:    v_mov_b32_e32 v42, s22
 ; HEURRC-NEXT:    v_mov_b32_e32 v43, s23
-; HEURRC-NEXT:    v_mov_b32_e32 v32, s16
-; HEURRC-NEXT:    v_mov_b32_e32 v33, s17
-; HEURRC-NEXT:    v_mov_b32_e32 v34, s18
-; HEURRC-NEXT:    v_mov_b32_e32 v35, s19
 ; HEURRC-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT:    s_nop 2
+; HEURRC-NEXT:    v_mov_b32_e32 v16, s16
+; HEURRC-NEXT:    v_mov_b32_e32 v17, s17
+; HEURRC-NEXT:    v_mov_b32_e32 v18, s18
+; HEURRC-NEXT:    v_mov_b32_e32 v19, s19
+; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v32, s12
-; HEURRC-NEXT:    v_mov_b32_e32 v33, s13
-; HEURRC-NEXT:    v_mov_b32_e32 v34, s14
-; HEURRC-NEXT:    v_mov_b32_e32 v35, s15
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v16, s12
+; HEURRC-NEXT:    v_mov_b32_e32 v17, s13
+; HEURRC-NEXT:    v_mov_b32_e32 v18, s14
+; HEURRC-NEXT:    v_mov_b32_e32 v19, s15
+; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v32, s8
-; HEURRC-NEXT:    v_mov_b32_e32 v33, s9
-; HEURRC-NEXT:    v_mov_b32_e32 v34, s10
-; HEURRC-NEXT:    v_mov_b32_e32 v35, s11
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v16, s8
+; HEURRC-NEXT:    v_mov_b32_e32 v17, s9
+; HEURRC-NEXT:    v_mov_b32_e32 v18, s10
+; HEURRC-NEXT:    v_mov_b32_e32 v19, s11
+; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
@@ -2005,27 +2010,28 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
 ; VGPRRC-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
 ; VGPRRC-NEXT:    v_mov_b32_e32 v42, s22
 ; VGPRRC-NEXT:    v_mov_b32_e32 v43, s23
-; VGPRRC-NEXT:    v_mov_b32_e32 v32, s16
-; VGPRRC-NEXT:    v_mov_b32_e32 v33, s17
-; VGPRRC-NEXT:    v_mov_b32_e32 v34, s18
-; VGPRRC-NEXT:    v_mov_b32_e32 v35, s19
 ; VGPRRC-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT:    s_nop 2
+; VGPRRC-NEXT:    v_mov_b32_e32 v16, s16
+; VGPRRC-NEXT:    v_mov_b32_e32 v17, s17
+; VGPRRC-NEXT:    v_mov_b32_e32 v18, s18
+; VGPRRC-NEXT:    v_mov_b32_e32 v19, s19
+; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v32, s12
-; VGPRRC-NEXT:    v_mov_b32_e32 v33, s13
-; VGPRRC-NEXT:    v_mov_b32_e32 v34, s14
-; VGPRRC-NEXT:    v_mov_b32_e32 v35, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v16, s12
+; VGPRRC-NEXT:    v_mov_b32_e32 v17, s13
+; VGPRRC-NEXT:    v_mov_b32_e32 v18, s14
+; VGPRRC-NEXT:    v_mov_b32_e32 v19, s15
+; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v32, s8
-; VGPRRC-NEXT:    v_mov_b32_e32 v33, s9
-; VGPRRC-NEXT:    v_mov_b32_e32 v34, s10
-; VGPRRC-NEXT:    v_mov_b32_e32 v35, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v16, s8
+; VGPRRC-NEXT:    v_mov_b32_e32 v17, s9
+; VGPRRC-NEXT:    v_mov_b32_e32 v18, s10
+; VGPRRC-NEXT:    v_mov_b32_e32 v19, s11
+; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
@@ -3185,9 +3191,13 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[4:7], off sc0 sc1
+; VGPRRC-NEXT:    s_nop 0
+; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], 16
+; VGPRRC-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    s_nop 0
+; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], 0
+; VGPRRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s16
@@ -3208,14 +3218,14 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s9
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s10
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s12
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s13
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s14
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_endpgm
 ; AGPR-LABEL: test_mfma_i32_32x32x32_i8:
@@ -3593,9 +3603,13 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[4:7], off sc0 sc1
+; VGPRRC-NEXT:    s_nop 0
+; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], 16
+; VGPRRC-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    s_nop 0
+; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], 0
+; VGPRRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s16
@@ -3616,14 +3630,14 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s9
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s10
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s12
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s13
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s14
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_endpgm
 ; AGPR-LABEL: test_mfma_i32_32x32x32_i8__flags:
@@ -4136,32 +4150,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
-; SDAG-NEXT:    v_mov_b32_e32 v32, s20
-; SDAG-NEXT:    v_mov_b32_e32 v33, s21
-; SDAG-NEXT:    v_mov_b32_e32 v34, s22
-; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    v_mov_b32_e32 v16, s20
+; SDAG-NEXT:    v_mov_b32_e32 v17, s21
+; SDAG-NEXT:    v_mov_b32_e32 v18, s22
+; SDAG-NEXT:    v_mov_b32_e32 v19, s23
+; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v32, s16
-; SDAG-NEXT:    v_mov_b32_e32 v33, s17
-; SDAG-NEXT:    v_mov_b32_e32 v34, s18
-; SDAG-NEXT:    v_mov_b32_e32 v35, s19
-; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v16, s16
+; SDAG-NEXT:    v_mov_b32_e32 v17, s17
+; SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v32, s12
-; SDAG-NEXT:    v_mov_b32_e32 v33, s13
-; SDAG-NEXT:    v_mov_b32_e32 v34, s14
-; SDAG-NEXT:    v_mov_b32_e32 v35, s15
-; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v16, s12
+; SDAG-NEXT:    v_mov_b32_e32 v17, s13
+; SDAG-NEXT:    v_mov_b32_e32 v18, s14
+; SDAG-NEXT:    v_mov_b32_e32 v19, s15
+; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v32, s8
-; SDAG-NEXT:    v_mov_b32_e32 v33, s9
-; SDAG-NEXT:    v_mov_b32_e32 v34, s10
-; SDAG-NEXT:    v_mov_b32_e32 v35, s11
-; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v16, s8
+; SDAG-NEXT:    v_mov_b32_e32 v17, s9
+; SDAG-NEXT:    v_mov_b32_e32 v18, s10
+; SDAG-NEXT:    v_mov_b32_e32 v19, s11
+; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -4245,32 +4260,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
 ; HEURRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; HEURRC-NEXT:    s_nop 1
 ; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
-; HEURRC-NEXT:    v_mov_b32_e32 v32, s20
-; HEURRC-NEXT:    v_mov_b32_e32 v33, s21
-; HEURRC-NEXT:    v_mov_b32_e32 v34, s22
-; HEURRC-NEXT:    v_mov_b32_e32 v35, s23
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT:    s_nop 6
+; HEURRC-NEXT:    v_mov_b32_e32 v16, s20
+; HEURRC-NEXT:    v_mov_b32_e32 v17, s21
+; HEURRC-NEXT:    v_mov_b32_e32 v18, s22
+; HEURRC-NEXT:    v_mov_b32_e32 v19, s23
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v32, s16
-; HEURRC-NEXT:    v_mov_b32_e32 v33, s17
-; HEURRC-NEXT:    v_mov_b32_e32 v34, s18
-; HEURRC-NEXT:    v_mov_b32_e32 v35, s19
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v16, s16
+; HEURRC-NEXT:    v_mov_b32_e32 v17, s17
+; HEURRC-NEXT:    v_mov_b32_e32 v18, s18
+; HEURRC-NEXT:    v_mov_b32_e32 v19, s19
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v32, s12
-; HEURRC-NEXT:    v_mov_b32_e32 v33, s13
-; HEURRC-NEXT:    v_mov_b32_e32 v34, s14
-; HEURRC-NEXT:    v_mov_b32_e32 v35, s15
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v16, s12
+; HEURRC-NEXT:    v_mov_b32_e32 v17, s13
+; HEURRC-NEXT:    v_mov_b32_e32 v18, s14
+; HEURRC-NEXT:    v_mov_b32_e32 v19, s15
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v32, s8
-; HEURRC-NEXT:    v_mov_b32_e32 v33, s9
-; HEURRC-NEXT:    v_mov_b32_e32 v34, s10
-; HEURRC-NEXT:    v_mov_b32_e32 v35, s11
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v16, s8
+; HEURRC-NEXT:    v_mov_b32_e32 v17, s9
+; HEURRC-NEXT:    v_mov_b32_e32 v18, s10
+; HEURRC-NEXT:    v_mov_b32_e32 v19, s11
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
@@ -4308,32 +4324,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
-; VGPRRC-NEXT:    v_mov_b32_e32 v32, s20
-; VGPRRC-NEXT:    v_mov_b32_e32 v33, s21
-; VGPRRC-NEXT:    v_mov_b32_e32 v34, s22
-; VGPRRC-NEXT:    v_mov_b32_e32 v35, s23
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT:    s_nop 6
+; VGPRRC-NEXT:    v_mov_b32_e32 v16, s20
+; VGPRRC-NEXT:    v_mov_b32_e32 v17, s21
+; VGPRRC-NEXT:    v_mov_b32_e32 v18, s22
+; VGPRRC-NEXT:    v_mov_b32_e32 v19, s23
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v32, s16
-; VGPRRC-NEXT:    v_mov_b32_e32 v33, s17
-; VGPRRC-NEXT:    v_mov_b32_e32 v34, s18
-; VGPRRC-NEXT:    v_mov_b32_e32 v35, s19
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v16, s16
+; VGPRRC-NEXT:    v_mov_b32_e32 v17, s17
+; VGPRRC-NEXT:    v_mov_b32_e32 v18, s18
+; VGPRRC-NEXT:    v_mov_b32_e32 v19, s19
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v32, s12
-; VGPRRC-NEXT:    v_mov_b32_e32 v33, s13
-; VGPRRC-NEXT:    v_mov_b32_e32 v34, s14
-; VGPRRC-NEXT:    v_mov_b32_e32 v35, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v16, s12
+; VGPRRC-NEXT:    v_mov_b32_e32 v17, s13
+; VGPRRC-NEXT:    v_mov_b32_e32 v18, s14
+; VGPRRC-NEXT:    v_mov_b32_e32 v19, s15
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v32, s8
-; VGPRRC-NEXT:    v_mov_b32_e32 v33, s9
-; VGPRRC-NEXT:    v_mov_b32_e32 v34, s10
-; VGPRRC-NEXT:    v_mov_b32_e32 v35, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v16, s8
+; VGPRRC-NEXT:    v_mov_b32_e32 v17, s9
+; VGPRRC-NEXT:    v_mov_b32_e32 v18, s10
+; VGPRRC-NEXT:    v_mov_b32_e32 v19, s11
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
@@ -4510,32 +4527,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; SDAG-NEXT:    v_mov_b32_e32 v32, s20
-; SDAG-NEXT:    v_mov_b32_e32 v33, s21
-; SDAG-NEXT:    v_mov_b32_e32 v34, s22
-; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT:    s_nop 6
+; SDAG-NEXT:    v_mov_b32_e32 v16, s20
+; SDAG-NEXT:    v_mov_b32_e32 v17, s21
+; SDAG-NEXT:    v_mov_b32_e32 v18, s22
+; SDAG-NEXT:    v_mov_b32_e32 v19, s23
+; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v32, s16
-; SDAG-NEXT:    v_mov_b32_e32 v33, s17
-; SDAG-NEXT:    v_mov_b32_e32 v34, s18
-; SDAG-NEXT:    v_mov_b32_e32 v35, s19
-; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v16, s16
+; SDAG-NEXT:    v_mov_b32_e32 v17, s17
+; SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v32, s12
-; SDAG-NEXT:    v_mov_b32_e32 v33, s13
-; SDAG-NEXT:    v_mov_b32_e32 v34, s14
-; SDAG-NEXT:    v_mov_b32_e32 v35, s15
-; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v16, s12
+; SDAG-NEXT:    v_mov_b32_e32 v17, s13
+; SDAG-NEXT:    v_mov_b32_e32 v18, s14
+; SDAG-NEXT:    v_mov_b32_e32 v19, s15
+; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v32, s8
-; SDAG-NEXT:    v_mov_b32_e32 v33, s9
-; SDAG-NEXT:    v_mov_b32_e32 v34, s10
-; SDAG-NEXT:    v_mov_b32_e32 v35, s11
-; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v16, s8
+; SDAG-NEXT:    v_mov_b32_e32 v17, s9
+; SDAG-NEXT:    v_mov_b32_e32 v18, s10
+; SDAG-NEXT:    v_mov_b32_e32 v19, s11
+; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -4619,32 +4637,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
 ; HEURRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; HEURRC-NEXT:    s_nop 1
 ; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; HEURRC-NEXT:    v_mov_b32_e32 v32, s20
-; HEURRC-NEXT:    v_mov_b32_e32 v33, s21
-; HEURRC-NEXT:    v_mov_b32_e32 v34, s22
-; HEURRC-NEXT:    v_mov_b32_e32 v35, s23
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT:    s_nop 6
+; HEURRC-NEXT:    v_mov_b32_e32 v16, s20
+; HEURRC-NEXT:    v_mov_b32_e32 v17, s21
+; HEURRC-NEXT:    v_mov_b32_e32 v18, s22
+; HEURRC-NEXT:    v_mov_b32_e32 v19, s23
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v32, s16
-; HEURRC-NEXT:    v_mov_b32_e32 v33, s17
-; HEURRC-NEXT:    v_mov_b32_e32 v34, s18
-; HEURRC-NEXT:    v_mov_b32_e32 v35, s19
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v16, s16
+; HEURRC-NEXT:    v_mov_b32_e32 v17, s17
+; HEURRC-NEXT:    v_mov_b32_e32 v18, s18
+; HEURRC-NEXT:    v_mov_b32_e32 v19, s19
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v32, s12
-; HEURRC-NEXT:    v_mov_b32_e32 v33, s13
-; HEURRC-NEXT:    v_mov_b32_e32 v34, s14
-; HEURRC-NEXT:    v_mov_b32_e32 v35, s15
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v16, s12
+; HEURRC-NEXT:    v_mov_b32_e32 v17, s13
+; HEURRC-NEXT:    v_mov_b32_e32 v18, s14
+; HEURRC-NEXT:    v_mov_b32_e32 v19, s15
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v32, s8
-; HEURRC-NEXT:    v_mov_b32_e32 v33, s9
-; HEURRC-NEXT:    v_mov_b32_e32 v34, s10
-; HEURRC-NEXT:    v_mov_b32_e32 v35, s11
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v16, s8
+; HEURRC-NEXT:    v_mov_b32_e32 v17, s9
+; HEURRC-NEXT:    v_mov_b32_e32 v18, s10
+; HEURRC-NEXT:    v_mov_b32_e32 v19, s11
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
@@ -4682,32 +4701,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; VGPRRC-NEXT:    v_mov_b32_e32 v32, s20
-; VGPRRC-NEXT:    v_mov_b32_e32 v33, s21
-; VGPRRC-NEXT:    v_mov_b32_e32 v34, s22
-; VGPRRC-NEXT:    v_mov_b32_e32 v35, s23
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT:    s_nop 6
+; VGPRRC-NEXT:    v_mov_b32_e32 v16, s20
+; VGPRRC-NEXT:    v_mov_b32_e32 v17, s21
+; VGPRRC-NEXT:    v_mov_b32_e32 v18, s22
+; VGPRRC-NEXT:    v_mov_b32_e32 v19, s23
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v32, s16
-; VGPRRC-NEXT:    v_mov_b32_e32 v33, s17
-; VGPRRC-NEXT:    v_mov_b32_e32 v34, s18
-; VGPRRC-NEXT:    v_mov_b32_e32 v35, s19
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v16, s16
+; VGPRRC-NEXT:    v_mov_b32_e32 v17, s17
+; VGPRRC-NEXT:    v_mov_b32_e32 v18, s18
+; VGPRRC-NEXT:    v_mov_b32_e32 v19, s19
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v32, s12
-; VGPRRC-NEXT:    v_mov_b32_e32 v33, s13
-; VGPRRC-NEXT:    v_mov_b32_e32 v34, s14
-; VGPRRC-NEXT:    v_mov_b32_e32 v35, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v16, s12
+; VGPRRC-NEXT:    v_mov_b32_e32 v17, s13
+; VGPRRC-NEXT:    v_mov_b32_e32 v18, s14
+; VGPRRC-NEXT:    v_mov_b32_e32 v19, s15
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v32, s8
-; VGPRRC-NEXT:    v_mov_b32_e32 v33, s9
-; VGPRRC-NEXT:    v_mov_b32_e32 v34, s10
-; VGPRRC-NEXT:    v_mov_b32_e32 v35, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v16, s8
+; VGPRRC-NEXT:    v_mov_b32_e32 v17, s9
+; VGPRRC-NEXT:    v_mov_b32_e32 v18, s10
+; VGPRRC-NEXT:    v_mov_b32_e32 v19, s11
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index d9f1b542e4cb4..7e30af96bb8b9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -799,17 +799,17 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 {
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 3
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -1155,8 +1155,8 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2005,21 +2005,21 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr add
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4f16:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, s4
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, s5
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, s4
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, s5
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, s6
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, s7
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, s6
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, s7
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_f16 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 4
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -2395,21 +2395,21 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr
 ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16f16:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, s4
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, s5
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, s4
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, s5
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, s6
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, s7
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, s6
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, s7
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_f16 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 6
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -3304,17 +3304,17 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 {
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_i32_4x4x4_16b_i8 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_i32_4x4x4_16b_i8 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 4
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x i32>, ptr addrspace(1) %arg
@@ -3494,19 +3494,19 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_k_src2_1(ptr addrspace(1)
 ;
 ; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
 ; GFX942-VGPR:       ; %bb.0:
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 1
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0x41
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_i32_4x4x4_16b_i8 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_i32_4x4x4_16b_i8 v[0:3], v5, v6, v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 3
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
   %in.1 = load <4 x i32>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> splat (i32 65), i32 1, i32 2, i32 3)
@@ -4309,7 +4309,7 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) %
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4318,9 +4318,9 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) %
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3]
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3]
 ; GFX942-VGPR-NEXT:    s_nop 3
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v6, v[0:3], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -5017,12 +5017,12 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(ptr addrspace(1) %arg) #0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v0, v1, v[0:3]
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[0:3]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 2
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 2.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
@@ -5542,8 +5542,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v30, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v31, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v1
@@ -5572,37 +5570,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v27, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v28, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v29, v1
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[62:63], v[30:31]
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v64, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[60:61], v[28:29]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[58:59], v[26:27]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[56:57], v[24:25]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[54:55], v[22:23]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[52:53], v[20:21]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[50:51], v[18:19]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[48:49], v[16:17]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[46:47], v[14:15]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[44:45], v[12:13]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[42:43], v[10:11]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[40:41], v[8:9]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[38:39], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[36:37], v[4:5]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[34:35], v[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[32:33], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v30, v1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v31, v1
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[32:33], v[30:31]
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v34, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[30:31], v[28:29]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[28:29], v[26:27]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[26:27], v[24:25]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[24:25], v[22:23]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[22:23], v[20:21]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[20:21], v[18:19]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[18:19], v[16:17]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[16:17], v[14:15]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], v[12:13]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], v[10:11]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], v[8:9]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[32:63], v0, v64, v[32:63]
+; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[2:33], v0, v34, v[2:33]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[60:63], s[0:1] offset:112
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[56:59], s[0:1] offset:96
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[52:55], s[0:1] offset:80
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[48:51], s[0:1] offset:64
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[44:47], s[0:1] offset:48
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[40:43], s[0:1] offset:32
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[36:39], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[32:35], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[30:33], s[0:1] offset:112
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[26:29], s[0:1] offset:96
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[22:25], s[0:1] offset:80
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[18:21], s[0:1] offset:64
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[14:17], s[0:1] offset:48
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[10:13], s[0:1] offset:32
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[6:9], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[2:5], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
@@ -5695,20 +5695,20 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(ptr addrspace(1) %ar
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 1.0
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-VGPR-NEXT:    v_lshlrev_b32_e32 v8, 4, v0
+; GFX942-VGPR-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0x42f60000
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2.0
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3]
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 2
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -5804,19 +5804,19 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(ptr addrspa
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 1.0
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0x42f60000
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3]
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 2
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index f4f1ca024b7d6..f0205a3a788ed 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -5101,35 +5101,35 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
 ; SDAG-NEXT:    v_mov_b64_e32 v[20:21], 48
 ; SDAG-NEXT:    global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[38:39], 32
-; SDAG-NEXT:    v_mov_b64_e32 v[40:41], 16
-; SDAG-NEXT:    v_mov_b32_e32 v32, s16
-; SDAG-NEXT:    v_mov_b32_e32 v33, s17
-; SDAG-NEXT:    v_mov_b32_e32 v34, s18
-; SDAG-NEXT:    v_mov_b32_e32 v35, s19
-; SDAG-NEXT:    global_store_dwordx4 v[38:39], v[32:35], off sc0 sc1
+; SDAG-NEXT:    v_mov_b64_e32 v[22:23], 32
+; SDAG-NEXT:    v_mov_b64_e32 v[24:25], 16
+; SDAG-NEXT:    v_mov_b32_e32 v16, s16
+; SDAG-NEXT:    v_mov_b32_e32 v17, s17
+; SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; SDAG-NEXT:    global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[42:43], 0
-; SDAG-NEXT:    v_mov_b32_e32 v32, s12
-; SDAG-NEXT:    v_mov_b32_e32 v33, s13
-; SDAG-NEXT:    v_mov_b32_e32 v34, s14
-; SDAG-NEXT:    v_mov_b32_e32 v35, s15
-; SDAG-NEXT:    global_store_dwordx4 v[40:41], v[32:35], off sc0 sc1
+; SDAG-NEXT:    v_mov_b64_e32 v[26:27], 0
+; SDAG-NEXT:    v_mov_b32_e32 v16, s12
+; SDAG-NEXT:    v_mov_b32_e32 v17, s13
+; SDAG-NEXT:    v_mov_b32_e32 v18, s14
+; SDAG-NEXT:    v_mov_b32_e32 v19, s15
+; SDAG-NEXT:    global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v32, s8
-; SDAG-NEXT:    v_mov_b32_e32 v33, s9
-; SDAG-NEXT:    v_mov_b32_e32 v34, s10
-; SDAG-NEXT:    v_mov_b32_e32 v35, s11
-; SDAG-NEXT:    global_store_dwordx4 v[42:43], v[32:35], off sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v16, s8
+; SDAG-NEXT:    v_mov_b32_e32 v17, s9
+; SDAG-NEXT:    v_mov_b32_e32 v18, s10
+; SDAG-NEXT:    v_mov_b32_e32 v19, s11
+; SDAG-NEXT:    global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[38:39], v[8:11], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[36:37], v[12:15], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[40:41], v[4:7], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -5137,9 +5137,6 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x0
 ; GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
-; GISEL-NEXT:    v_mov_b64_e32 v[48:49], 0
-; GISEL-NEXT:    v_mov_b64_e32 v[50:51], 16
-; GISEL-NEXT:    v_mov_b64_e32 v[52:53], 32
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[36:37]
 ; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[38:39]
@@ -5157,33 +5154,28 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
 ; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
 ; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
 ; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
-; GISEL-NEXT:    v_mov_b64_e32 v[54:55], 48
-; GISEL-NEXT:    s_nop 0
+; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
-; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[10:11]
-; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[8:9]
-; GISEL-NEXT:    v_mov_b64_e32 v[38:39], s[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[42:43], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[46:47], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[40:41], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[44:45], s[20:21]
-; GISEL-NEXT:    global_store_dwordx4 v[48:49], v[32:35], off sc0 sc1
+; GISEL-NEXT:    v_mov_b64_e32 v[32:33], 0
+; GISEL-NEXT:    v_mov_b64_e32 v[34:35], 16
+; GISEL-NEXT:    v_mov_b64_e32 v[36:37], 32
+; GISEL-NEXT:    v_mov_b64_e32 v[38:39], 48
+; GISEL-NEXT:    global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[50:51], v[36:39], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[52:53], v[40:43], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[54:55], v[44:47], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    s_nop 3
-; GISEL-NEXT:    global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[50:51], v[4:7], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[52:53], v[8:11], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[54:55], v[12:15], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_endpgm
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0)
@@ -5199,23 +5191,23 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
 ; SDAG-NEXT:    v_mov_b32_e32 v32, 42
 ; SDAG-NEXT:    v_mov_b32_e32 v33, 25
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v0, s12
-; SDAG-NEXT:    v_mov_b32_e32 v1, s13
-; SDAG-NEXT:    v_mov_b32_e32 v2, s14
-; SDAG-NEXT:    v_mov_b32_e32 v3, s15
-; SDAG-NEXT:    v_mov_b32_e32 v4, s16
-; SDAG-NEXT:    v_mov_b32_e32 v5, s17
-; SDAG-NEXT:    v_mov_b32_e32 v6, s18
-; SDAG-NEXT:    v_mov_b32_e32 v7, s19
-; SDAG-NEXT:    v_mov_b32_e32 v8, s20
-; SDAG-NEXT:    v_mov_b32_e32 v9, s21
-; SDAG-NEXT:    v_mov_b32_e32 v10, s22
-; SDAG-NEXT:    v_mov_b32_e32 v11, s23
+; SDAG-NEXT:    v_mov_b32_e32 v16, s12
+; SDAG-NEXT:    v_mov_b32_e32 v17, s13
+; SDAG-NEXT:    v_mov_b32_e32 v18, s14
+; SDAG-NEXT:    v_mov_b32_e32 v19, s15
+; SDAG-NEXT:    v_mov_b32_e32 v20, s16
+; SDAG-NEXT:    v_mov_b32_e32 v21, s17
+; SDAG-NEXT:    v_mov_b32_e32 v22, s18
+; SDAG-NEXT:    v_mov_b32_e32 v23, s19
+; SDAG-NEXT:    v_mov_b32_e32 v24, s20
+; SDAG-NEXT:    v_mov_b32_e32 v25, s21
+; SDAG-NEXT:    v_mov_b32_e32 v26, s22
+; SDAG-NEXT:    v_mov_b32_e32 v27, s23
 ; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v13, s25
-; SDAG-NEXT:    v_mov_b32_e32 v14, s26
-; SDAG-NEXT:    v_mov_b32_e32 v15, s27
+; SDAG-NEXT:    v_mov_b32_e32 v28, s24
+; SDAG-NEXT:    v_mov_b32_e32 v29, s25
+; SDAG-NEXT:    v_mov_b32_e32 v30, s26
+; SDAG-NEXT:    v_mov_b32_e32 v31, s27
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
 ; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
@@ -5250,33 +5242,19 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
 ; SDAG-NEXT:    global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v32, s16
-; SDAG-NEXT:    v_mov_b32_e32 v33, s17
-; SDAG-NEXT:    v_mov_b32_e32 v34, s18
-; SDAG-NEXT:    v_mov_b32_e32 v35, s19
-; SDAG-NEXT:    global_store_dwordx4 v[38:39], v[32:35], off sc0 sc1
-; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v32, s12
-; SDAG-NEXT:    v_mov_b32_e32 v33, s13
-; SDAG-NEXT:    v_mov_b32_e32 v34, s14
-; SDAG-NEXT:    v_mov_b32_e32 v35, s15
-; SDAG-NEXT:    global_store_dwordx4 v[40:41], v[32:35], off sc0 sc1
-; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v32, s8
-; SDAG-NEXT:    v_mov_b32_e32 v33, s9
-; SDAG-NEXT:    v_mov_b32_e32 v34, s10
-; SDAG-NEXT:    v_mov_b32_e32 v35, s11
-; SDAG-NEXT:    global_store_dwordx4 v[42:43], v[32:35], off sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v16, s8
+; SDAG-NEXT:    v_mov_b32_e32 v17, s9
+; SDAG-NEXT:    v_mov_b32_e32 v18, s10
+; SDAG-NEXT:    v_mov_b32_e32 v19, s11
+; SDAG-NEXT:    global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[38:39], v[8:11], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[36:37], v[12:15], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[40:41], v[4:7], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -5287,9 +5265,6 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
 ; GISEL-NEXT:    v_mov_b32_e32 v32, 25
 ; GISEL-NEXT:    v_mov_b32_e32 v33, 42
 ; GISEL-NEXT:    v_mov_b64_e32 v[34:35], 16
-; GISEL-NEXT:    v_mov_b64_e32 v[48:49], 0
-; GISEL-NEXT:    v_mov_b64_e32 v[50:51], 16
-; GISEL-NEXT:    v_mov_b64_e32 v[52:53], 32
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[36:37]
 ; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[38:39]
@@ -5321,20 +5296,20 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
 ; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
 ; GISEL-NEXT:    global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[50:51], v[36:39], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[52:53], v[40:43], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[54:55], v[44:47], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 2
 ; GISEL-NEXT:    global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[50:51], v[4:7], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[52:53], v[8:11], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[54:55], v[12:15], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_endpgm
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
index ef3bb0cb5f4f1..5475fa2ae5c6e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
@@ -71,9 +71,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32_vgprcd(ptr addrspace(1) %ar
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-SDAG-NEXT:    s_nop 1
-; GFX942-SDAG-NEXT:    v_mfma_f32_16x16x8_xf32 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT:    v_mfma_f32_16x16x8_xf32 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-SDAG-NEXT:    s_nop 6
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v8, v[0:3], s[6:7]
 ; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x8xf32_vgprcd:
@@ -87,14 +87,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32_vgprcd(ptr addrspace(1) %ar
 ; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-GISEL-NEXT:    s_mov_b32 s5, 4.0
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-GISEL-NEXT:    s_nop 1
-; GFX942-GISEL-NEXT:    v_mfma_f32_16x16x8_xf32 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT:    s_nop 6
-; GFX942-GISEL-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
+; GFX942-GISEL-NEXT:    v_mfma_f32_16x16x8_xf32 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-GISEL-NEXT:    s_nop 5
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GFX942-GISEL-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index e7c8465b9fbe3..6eb9449069a52 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -440,13 +440,11 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0,
 ; SDAG-NEXT:    v_mov_b32_e32 v37, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v38, s2
 ; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v26, v10
-; SDAG-NEXT:    v_mov_b32_e32 v10, s24
-; SDAG-NEXT:    v_mov_b32_e32 v11, s25
-; SDAG-NEXT:    v_mov_b32_e32 v12, s26
-; SDAG-NEXT:    v_mov_b32_e32 v13, s27
-; SDAG-NEXT:    v_mov_b32_e32 v14, s28
-; SDAG-NEXT:    v_mov_b32_e32 v15, s29
+; SDAG-NEXT:    v_mov_b32_e32 v13, s25
+; SDAG-NEXT:    v_mov_b32_e32 v14, s26
+; SDAG-NEXT:    v_mov_b32_e32 v15, s27
+; SDAG-NEXT:    v_mov_b32_e32 v16, s28
+; SDAG-NEXT:    v_mov_b32_e32 v17, s29
 ; SDAG-NEXT:    v_mov_b32_e32 v28, s16
 ; SDAG-NEXT:    v_mov_b32_e32 v29, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v30, s18
@@ -455,16 +453,17 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0,
 ; SDAG-NEXT:    v_mov_b32_e32 v33, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v34, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v16, v0
-; SDAG-NEXT:    v_mov_b32_e32 v17, v1
-; SDAG-NEXT:    v_mov_b32_e32 v18, v2
-; SDAG-NEXT:    v_mov_b32_e32 v19, v3
-; SDAG-NEXT:    v_mov_b32_e32 v20, v4
-; SDAG-NEXT:    v_mov_b32_e32 v21, v5
-; SDAG-NEXT:    v_mov_b32_e32 v22, v6
-; SDAG-NEXT:    v_mov_b32_e32 v23, v7
-; SDAG-NEXT:    v_mov_b32_e32 v24, v8
-; SDAG-NEXT:    v_mov_b32_e32 v25, v9
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
+; SDAG-NEXT:    v_mov_b32_e32 v18, v0
+; SDAG-NEXT:    v_mov_b32_e32 v19, v1
+; SDAG-NEXT:    v_mov_b32_e32 v20, v2
+; SDAG-NEXT:    v_mov_b32_e32 v21, v3
+; SDAG-NEXT:    v_mov_b32_e32 v22, v4
+; SDAG-NEXT:    v_mov_b32_e32 v23, v5
+; SDAG-NEXT:    v_mov_b32_e32 v24, v6
+; SDAG-NEXT:    v_mov_b32_e32 v25, v7
+; SDAG-NEXT:    v_mov_b32_e32 v26, v8
+; SDAG-NEXT:    v_mov_b32_e32 v27, v9
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[36:39], v[28:35], v10
 ; SDAG-NEXT:    s_nop 11
@@ -784,13 +783,11 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg
 ; GCN-NEXT:    v_mov_b32_e32 v37, s1
 ; GCN-NEXT:    v_mov_b32_e32 v38, s2
 ; GCN-NEXT:    v_mov_b32_e32 v39, s3
-; GCN-NEXT:    v_mov_b32_e32 v26, v10
-; GCN-NEXT:    v_mov_b32_e32 v10, s24
-; GCN-NEXT:    v_mov_b32_e32 v11, s25
-; GCN-NEXT:    v_mov_b32_e32 v12, s26
-; GCN-NEXT:    v_mov_b32_e32 v13, s27
-; GCN-NEXT:    v_mov_b32_e32 v14, s28
-; GCN-NEXT:    v_mov_b32_e32 v15, s29
+; GCN-NEXT:    v_mov_b32_e32 v13, s25
+; GCN-NEXT:    v_mov_b32_e32 v14, s26
+; GCN-NEXT:    v_mov_b32_e32 v15, s27
+; GCN-NEXT:    v_mov_b32_e32 v16, s28
+; GCN-NEXT:    v_mov_b32_e32 v17, s29
 ; GCN-NEXT:    v_mov_b32_e32 v28, s16
 ; GCN-NEXT:    v_mov_b32_e32 v29, s17
 ; GCN-NEXT:    v_mov_b32_e32 v30, s18
@@ -799,16 +796,17 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg
 ; GCN-NEXT:    v_mov_b32_e32 v33, s21
 ; GCN-NEXT:    v_mov_b32_e32 v34, s22
 ; GCN-NEXT:    v_mov_b32_e32 v35, s23
-; GCN-NEXT:    v_mov_b32_e32 v16, v0
-; GCN-NEXT:    v_mov_b32_e32 v17, v1
-; GCN-NEXT:    v_mov_b32_e32 v18, v2
-; GCN-NEXT:    v_mov_b32_e32 v19, v3
-; GCN-NEXT:    v_mov_b32_e32 v20, v4
-; GCN-NEXT:    v_mov_b32_e32 v21, v5
-; GCN-NEXT:    v_mov_b32_e32 v22, v6
-; GCN-NEXT:    v_mov_b32_e32 v23, v7
-; GCN-NEXT:    v_mov_b32_e32 v24, v8
-; GCN-NEXT:    v_mov_b32_e32 v25, v9
+; GCN-NEXT:    v_mov_b32_e32 v12, s24
+; GCN-NEXT:    v_mov_b32_e32 v18, v0
+; GCN-NEXT:    v_mov_b32_e32 v19, v1
+; GCN-NEXT:    v_mov_b32_e32 v20, v2
+; GCN-NEXT:    v_mov_b32_e32 v21, v3
+; GCN-NEXT:    v_mov_b32_e32 v22, v4
+; GCN-NEXT:    v_mov_b32_e32 v23, v5
+; GCN-NEXT:    v_mov_b32_e32 v24, v6
+; GCN-NEXT:    v_mov_b32_e32 v25, v7
+; GCN-NEXT:    v_mov_b32_e32 v26, v8
+; GCN-NEXT:    v_mov_b32_e32 v27, v9
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[36:39], v[28:35], v10
 ; GCN-NEXT:    s_nop 11
@@ -1281,13 +1279,11 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x
 ; SDAG-NEXT:    v_mov_b32_e32 v37, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v38, s2
 ; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v26, v10
-; SDAG-NEXT:    v_mov_b32_e32 v10, s24
-; SDAG-NEXT:    v_mov_b32_e32 v11, s25
-; SDAG-NEXT:    v_mov_b32_e32 v12, s26
-; SDAG-NEXT:    v_mov_b32_e32 v13, s27
-; SDAG-NEXT:    v_mov_b32_e32 v14, s28
-; SDAG-NEXT:    v_mov_b32_e32 v15, s29
+; SDAG-NEXT:    v_mov_b32_e32 v13, s25
+; SDAG-NEXT:    v_mov_b32_e32 v14, s26
+; SDAG-NEXT:    v_mov_b32_e32 v15, s27
+; SDAG-NEXT:    v_mov_b32_e32 v16, s28
+; SDAG-NEXT:    v_mov_b32_e32 v17, s29
 ; SDAG-NEXT:    v_mov_b32_e32 v28, s16
 ; SDAG-NEXT:    v_mov_b32_e32 v29, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v30, s18
@@ -1296,16 +1292,17 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x
 ; SDAG-NEXT:    v_mov_b32_e32 v33, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v34, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v16, v0
-; SDAG-NEXT:    v_mov_b32_e32 v17, v1
-; SDAG-NEXT:    v_mov_b32_e32 v18, v2
-; SDAG-NEXT:    v_mov_b32_e32 v19, v3
-; SDAG-NEXT:    v_mov_b32_e32 v20, v4
-; SDAG-NEXT:    v_mov_b32_e32 v21, v5
-; SDAG-NEXT:    v_mov_b32_e32 v22, v6
-; SDAG-NEXT:    v_mov_b32_e32 v23, v7
-; SDAG-NEXT:    v_mov_b32_e32 v24, v8
-; SDAG-NEXT:    v_mov_b32_e32 v25, v9
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
+; SDAG-NEXT:    v_mov_b32_e32 v18, v0
+; SDAG-NEXT:    v_mov_b32_e32 v19, v1
+; SDAG-NEXT:    v_mov_b32_e32 v20, v2
+; SDAG-NEXT:    v_mov_b32_e32 v21, v3
+; SDAG-NEXT:    v_mov_b32_e32 v22, v4
+; SDAG-NEXT:    v_mov_b32_e32 v23, v5
+; SDAG-NEXT:    v_mov_b32_e32 v24, v6
+; SDAG-NEXT:    v_mov_b32_e32 v25, v7
+; SDAG-NEXT:    v_mov_b32_e32 v26, v8
+; SDAG-NEXT:    v_mov_b32_e32 v27, v9
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[36:39], v[28:35], v10
 ; SDAG-NEXT:    s_nop 11
@@ -2325,13 +2322,11 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v37, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v38, s2
 ; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v26, v10
-; SDAG-NEXT:    v_mov_b32_e32 v10, s24
-; SDAG-NEXT:    v_mov_b32_e32 v11, s25
-; SDAG-NEXT:    v_mov_b32_e32 v12, s26
-; SDAG-NEXT:    v_mov_b32_e32 v13, s27
-; SDAG-NEXT:    v_mov_b32_e32 v14, s28
-; SDAG-NEXT:    v_mov_b32_e32 v15, s29
+; SDAG-NEXT:    v_mov_b32_e32 v13, s25
+; SDAG-NEXT:    v_mov_b32_e32 v14, s26
+; SDAG-NEXT:    v_mov_b32_e32 v15, s27
+; SDAG-NEXT:    v_mov_b32_e32 v16, s28
+; SDAG-NEXT:    v_mov_b32_e32 v17, s29
 ; SDAG-NEXT:    v_mov_b32_e32 v28, s16
 ; SDAG-NEXT:    v_mov_b32_e32 v29, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v30, s18
@@ -2340,16 +2335,17 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v33, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v34, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v16, v0
-; SDAG-NEXT:    v_mov_b32_e32 v17, v1
-; SDAG-NEXT:    v_mov_b32_e32 v18, v2
-; SDAG-NEXT:    v_mov_b32_e32 v19, v3
-; SDAG-NEXT:    v_mov_b32_e32 v20, v4
-; SDAG-NEXT:    v_mov_b32_e32 v21, v5
-; SDAG-NEXT:    v_mov_b32_e32 v22, v6
-; SDAG-NEXT:    v_mov_b32_e32 v23, v7
-; SDAG-NEXT:    v_mov_b32_e32 v24, v8
-; SDAG-NEXT:    v_mov_b32_e32 v25, v9
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
+; SDAG-NEXT:    v_mov_b32_e32 v18, v0
+; SDAG-NEXT:    v_mov_b32_e32 v19, v1
+; SDAG-NEXT:    v_mov_b32_e32 v20, v2
+; SDAG-NEXT:    v_mov_b32_e32 v21, v3
+; SDAG-NEXT:    v_mov_b32_e32 v22, v4
+; SDAG-NEXT:    v_mov_b32_e32 v23, v5
+; SDAG-NEXT:    v_mov_b32_e32 v24, v6
+; SDAG-NEXT:    v_mov_b32_e32 v25, v7
+; SDAG-NEXT:    v_mov_b32_e32 v26, v8
+; SDAG-NEXT:    v_mov_b32_e32 v27, v9
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[36:39], v[28:35], v10
 ; SDAG-NEXT:    s_nop 11
@@ -2693,13 +2689,11 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v37, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v38, s2
 ; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v26, v10
-; SDAG-NEXT:    v_mov_b32_e32 v10, s24
-; SDAG-NEXT:    v_mov_b32_e32 v11, s25
-; SDAG-NEXT:    v_mov_b32_e32 v12, s26
-; SDAG-NEXT:    v_mov_b32_e32 v13, s27
-; SDAG-NEXT:    v_mov_b32_e32 v14, s28
-; SDAG-NEXT:    v_mov_b32_e32 v15, s29
+; SDAG-NEXT:    v_mov_b32_e32 v13, s25
+; SDAG-NEXT:    v_mov_b32_e32 v14, s26
+; SDAG-NEXT:    v_mov_b32_e32 v15, s27
+; SDAG-NEXT:    v_mov_b32_e32 v16, s28
+; SDAG-NEXT:    v_mov_b32_e32 v17, s29
 ; SDAG-NEXT:    v_mov_b32_e32 v28, s16
 ; SDAG-NEXT:    v_mov_b32_e32 v29, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v30, s18
@@ -2708,16 +2702,17 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v33, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v34, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v16, v0
-; SDAG-NEXT:    v_mov_b32_e32 v17, v1
-; SDAG-NEXT:    v_mov_b32_e32 v18, v2
-; SDAG-NEXT:    v_mov_b32_e32 v19, v3
-; SDAG-NEXT:    v_mov_b32_e32 v20, v4
-; SDAG-NEXT:    v_mov_b32_e32 v21, v5
-; SDAG-NEXT:    v_mov_b32_e32 v22, v6
-; SDAG-NEXT:    v_mov_b32_e32 v23, v7
-; SDAG-NEXT:    v_mov_b32_e32 v24, v8
-; SDAG-NEXT:    v_mov_b32_e32 v25, v9
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
+; SDAG-NEXT:    v_mov_b32_e32 v18, v0
+; SDAG-NEXT:    v_mov_b32_e32 v19, v1
+; SDAG-NEXT:    v_mov_b32_e32 v20, v2
+; SDAG-NEXT:    v_mov_b32_e32 v21, v3
+; SDAG-NEXT:    v_mov_b32_e32 v22, v4
+; SDAG-NEXT:    v_mov_b32_e32 v23, v5
+; SDAG-NEXT:    v_mov_b32_e32 v24, v6
+; SDAG-NEXT:    v_mov_b32_e32 v25, v7
+; SDAG-NEXT:    v_mov_b32_e32 v26, v8
+; SDAG-NEXT:    v_mov_b32_e32 v27, v9
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[36:39], v[28:35], v10
 ; SDAG-NEXT:    s_nop 11
@@ -3061,13 +3056,11 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v37, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v38, s2
 ; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v26, v10
-; SDAG-NEXT:    v_mov_b32_e32 v10, s24
-; SDAG-NEXT:    v_mov_b32_e32 v11, s25
-; SDAG-NEXT:    v_mov_b32_e32 v12, s26
-; SDAG-NEXT:    v_mov_b32_e32 v13, s27
-; SDAG-NEXT:    v_mov_b32_e32 v14, s28
-; SDAG-NEXT:    v_mov_b32_e32 v15, s29
+; SDAG-NEXT:    v_mov_b32_e32 v13, s25
+; SDAG-NEXT:    v_mov_b32_e32 v14, s26
+; SDAG-NEXT:    v_mov_b32_e32 v15, s27
+; SDAG-NEXT:    v_mov_b32_e32 v16, s28
+; SDAG-NEXT:    v_mov_b32_e32 v17, s29
 ; SDAG-NEXT:    v_mov_b32_e32 v28, s16
 ; SDAG-NEXT:    v_mov_b32_e32 v29, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v30, s18
@@ -3076,16 +3069,17 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v33, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v34, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v16, v0
-; SDAG-NEXT:    v_mov_b32_e32 v17, v1
-; SDAG-NEXT:    v_mov_b32_e32 v18, v2
-; SDAG-NEXT:    v_mov_b32_e32 v19, v3
-; SDAG-NEXT:    v_mov_b32_e32 v20, v4
-; SDAG-NEXT:    v_mov_b32_e32 v21, v5
-; SDAG-NEXT:    v_mov_b32_e32 v22, v6
-; SDAG-NEXT:    v_mov_b32_e32 v23, v7
-; SDAG-NEXT:    v_mov_b32_e32 v24, v8
-; SDAG-NEXT:    v_mov_b32_e32 v25, v9
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
+; SDAG-NEXT:    v_mov_b32_e32 v18, v0
+; SDAG-NEXT:    v_mov_b32_e32 v19, v1
+; SDAG-NEXT:    v_mov_b32_e32 v20, v2
+; SDAG-NEXT:    v_mov_b32_e32 v21, v3
+; SDAG-NEXT:    v_mov_b32_e32 v22, v4
+; SDAG-NEXT:    v_mov_b32_e32 v23, v5
+; SDAG-NEXT:    v_mov_b32_e32 v24, v6
+; SDAG-NEXT:    v_mov_b32_e32 v25, v7
+; SDAG-NEXT:    v_mov_b32_e32 v26, v8
+; SDAG-NEXT:    v_mov_b32_e32 v27, v9
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[36:39], v[28:35], v10
 ; SDAG-NEXT:    s_nop 11
@@ -3429,13 +3423,11 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v37, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v38, s2
 ; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v26, v10
-; SDAG-NEXT:    v_mov_b32_e32 v10, s24
-; SDAG-NEXT:    v_mov_b32_e32 v11, s25
-; SDAG-NEXT:    v_mov_b32_e32 v12, s26
-; SDAG-NEXT:    v_mov_b32_e32 v13, s27
-; SDAG-NEXT:    v_mov_b32_e32 v14, s28
-; SDAG-NEXT:    v_mov_b32_e32 v15, s29
+; SDAG-NEXT:    v_mov_b32_e32 v13, s25
+; SDAG-NEXT:    v_mov_b32_e32 v14, s26
+; SDAG-NEXT:    v_mov_b32_e32 v15, s27
+; SDAG-NEXT:    v_mov_b32_e32 v16, s28
+; SDAG-NEXT:    v_mov_b32_e32 v17, s29
 ; SDAG-NEXT:    v_mov_b32_e32 v28, s16
 ; SDAG-NEXT:    v_mov_b32_e32 v29, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v30, s18
@@ -3444,16 +3436,17 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v33, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v34, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v16, v0
-; SDAG-NEXT:    v_mov_b32_e32 v17, v1
-; SDAG-NEXT:    v_mov_b32_e32 v18, v2
-; SDAG-NEXT:    v_mov_b32_e32 v19, v3
-; SDAG-NEXT:    v_mov_b32_e32 v20, v4
-; SDAG-NEXT:    v_mov_b32_e32 v21, v5
-; SDAG-NEXT:    v_mov_b32_e32 v22, v6
-; SDAG-NEXT:    v_mov_b32_e32 v23, v7
-; SDAG-NEXT:    v_mov_b32_e32 v24, v8
-; SDAG-NEXT:    v_mov_b32_e32 v25, v9
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
+; SDAG-NEXT:    v_mov_b32_e32 v18, v0
+; SDAG-NEXT:    v_mov_b32_e32 v19, v1
+; SDAG-NEXT:    v_mov_b32_e32 v20, v2
+; SDAG-NEXT:    v_mov_b32_e32 v21, v3
+; SDAG-NEXT:    v_mov_b32_e32 v22, v4
+; SDAG-NEXT:    v_mov_b32_e32 v23, v5
+; SDAG-NEXT:    v_mov_b32_e32 v24, v6
+; SDAG-NEXT:    v_mov_b32_e32 v25, v7
+; SDAG-NEXT:    v_mov_b32_e32 v26, v8
+; SDAG-NEXT:    v_mov_b32_e32 v27, v9
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[36:39], v[28:35], v10
 ; SDAG-NEXT:    s_nop 11
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
index e422e90ea0271..b9e9893ede4e2 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
@@ -373,7 +373,7 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 {
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v22, 0x7fc00000
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    v_mov_b64_e32 v[6:7], s[2:3]
 ; CHECK-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
 ; CHECK-NEXT:    s_mov_b32 s0, 0x3c003c00
@@ -382,65 +382,69 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 {
 ; CHECK-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
 ; CHECK-NEXT:    s_mov_b32 s0, 0x7e007e00
 ; CHECK-NEXT:    s_mov_b32 s1, s0
+; CHECK-NEXT:    v_mov_b64_e32 v[10:11], s[0:1]
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[12:13], v[4:7]
-; CHECK-NEXT:    v_mov_b32_e32 v23, v22
-; CHECK-NEXT:    v_mov_b32_e32 v24, v22
-; CHECK-NEXT:    v_mov_b32_e32 v25, v22
+; CHECK-NEXT:    s_nop 1
 ; CHECK-NEXT:    v_accvgpr_write_b32 a0, v0
-; CHECK-NEXT:    v_mov_b64_e32 v[10:11], s[0:1]
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[22:25], v[8:9], v[8:9], v[22:25]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[18:21], v[8:9], v[10:11], v[4:7]
 ; CHECK-NEXT:    v_accvgpr_write_b32 a1, v1
 ; CHECK-NEXT:    v_accvgpr_write_b32 a2, v2
 ; CHECK-NEXT:    v_accvgpr_write_b32 a3, v3
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[18:21], v[8:9], v[10:11], v[4:7]
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; CHECK-NEXT:    v_mov_b32_e32 v5, v4
+; CHECK-NEXT:    v_mov_b32_e32 v6, v4
+; CHECK-NEXT:    v_mov_b32_e32 v7, v4
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[8:9], v[14:17]
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[22:25], v[8:9], v[8:9], v[4:7]
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def v[4:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b64_e32 v[30:31], 0
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[26:29], v[8:9], v[8:9], v[4:7]
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[12:13], v[4:7]
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[8:9], v[14:17]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[26:29], v[8:9], v[8:9], v[4:7]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[8:9], v[0:3]
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[22:25], v[8:9], v[8:9], v[22:25]
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[4:7], v[8:9], v[8:9], v[26:29]
 ; CHECK-NEXT:    s_nop 5
 ; CHECK-NEXT:    v_cvt_f16_f32_e32 v23, v14
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[8:9], v[18:21]
-; CHECK-NEXT:    global_store_short v[30:31], v23, off
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[8:9], v[0:3]
-; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[12:13], v[8:9], v[0:3]
+; CHECK-NEXT:    s_nop 1
 ; CHECK-NEXT:    v_accvgpr_read_b32 v19, a3
 ; CHECK-NEXT:    v_accvgpr_read_b32 v18, a2
-; CHECK-NEXT:    buffer_wbl2 sc0 sc1
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_inv sc0 sc1
+; CHECK-NEXT:    v_mov_b64_e32 v[20:21], 0
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    v_accvgpr_read_b32 v17, a1
 ; CHECK-NEXT:    v_accvgpr_read_b32 v16, a0
 ; CHECK-NEXT:    v_cvt_f16_f32_e32 v15, v22
 ; CHECK-NEXT:    v_cvt_f16_f32_e32 v14, v14
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[16:19], v[8:9], v[8:9], v[16:19]
-; CHECK-NEXT:    global_store_short v[30:31], v15, off
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v12, v0
+; CHECK-NEXT:    global_store_short v[20:21], v23, off
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[12:13], v[8:9], v[0:3]
-; CHECK-NEXT:    global_store_short v[30:31], v14, off
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[10:11], v[8:9], v[4:7]
+; CHECK-NEXT:    global_store_short v[20:21], v15, off
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[8:11], v[10:11], v[8:9], v[4:7]
+; CHECK-NEXT:    global_store_short v[20:21], v14, off
 ; CHECK-NEXT:    v_cvt_f16_f32_e32 v14, v16
-; CHECK-NEXT:    global_store_short v[30:31], v14, off
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v12, v0
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v13, v8
-; CHECK-NEXT:    global_store_short v[30:31], v12, off
+; CHECK-NEXT:    global_store_short v[20:21], v14, off
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CHECK-NEXT:    buffer_wbl2 sc0 sc1
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    buffer_inv sc0 sc1
+; CHECK-NEXT:    global_store_short v[20:21], v12, off
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    global_store_short v[30:31], v13, off
+; CHECK-NEXT:    global_store_short v[20:21], v0, off
 ; CHECK-NEXT:    s_endpgm
 entry:
   %k0 = call <4 x float> asm sideeffect "; def $0", "=s"()
@@ -510,13 +514,13 @@ define void @test_rewrite_mfma_subreg_insert1(float %arg0, float %arg1, ptr addr
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[6:9], v0, v1, v[2:5]
+; CHECK-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[2:5]
 ; CHECK-NEXT:    s_nop 3
-; CHECK-NEXT:    v_pk_mov_b32 v[0:1], v[6:7], v[8:9] op_sel:[1,0]
+; CHECK-NEXT:    v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    v_accvgpr_write_b32 a0, v0
 ; CHECK-NEXT:    v_accvgpr_write_b32 a1, v1
-; CHECK-NEXT:    v_accvgpr_write_b32 a2, v9
+; CHECK-NEXT:    v_accvgpr_write_b32 a2, v3
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use a[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
@@ -638,14 +642,46 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add
 ; CHECK-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
 ; CHECK-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1]
 ; CHECK-NEXT:    s_nop 7
-; CHECK-NEXT:    global_store_dwordx4 v32, a[24:27], s[2:3] offset:96
-; CHECK-NEXT:    global_store_dwordx4 v32, a[28:31], s[2:3] offset:112
-; CHECK-NEXT:    global_store_dwordx4 v32, a[16:19], s[2:3] offset:64
-; CHECK-NEXT:    global_store_dwordx4 v32, a[20:23], s[2:3] offset:80
-; CHECK-NEXT:    global_store_dwordx4 v32, a[8:11], s[2:3] offset:32
-; CHECK-NEXT:    global_store_dwordx4 v32, a[12:15], s[2:3] offset:48
-; CHECK-NEXT:    global_store_dwordx4 v32, a[0:3], s[2:3]
-; CHECK-NEXT:    global_store_dwordx4 v32, a[4:7], s[2:3] offset:16
+; CHECK-NEXT:    v_accvgpr_read_b32 v0, a0
+; CHECK-NEXT:    v_accvgpr_read_b32 v24, a24
+; CHECK-NEXT:    v_accvgpr_read_b32 v25, a25
+; CHECK-NEXT:    v_accvgpr_read_b32 v26, a26
+; CHECK-NEXT:    v_accvgpr_read_b32 v27, a27
+; CHECK-NEXT:    v_accvgpr_read_b32 v1, a1
+; CHECK-NEXT:    v_accvgpr_read_b32 v2, a2
+; CHECK-NEXT:    v_accvgpr_read_b32 v3, a3
+; CHECK-NEXT:    v_accvgpr_read_b32 v4, a4
+; CHECK-NEXT:    v_accvgpr_read_b32 v5, a5
+; CHECK-NEXT:    v_accvgpr_read_b32 v6, a6
+; CHECK-NEXT:    v_accvgpr_read_b32 v7, a7
+; CHECK-NEXT:    v_accvgpr_read_b32 v8, a8
+; CHECK-NEXT:    v_accvgpr_read_b32 v9, a9
+; CHECK-NEXT:    v_accvgpr_read_b32 v10, a10
+; CHECK-NEXT:    v_accvgpr_read_b32 v11, a11
+; CHECK-NEXT:    v_accvgpr_read_b32 v12, a12
+; CHECK-NEXT:    v_accvgpr_read_b32 v13, a13
+; CHECK-NEXT:    v_accvgpr_read_b32 v14, a14
+; CHECK-NEXT:    v_accvgpr_read_b32 v15, a15
+; CHECK-NEXT:    v_accvgpr_read_b32 v16, a16
+; CHECK-NEXT:    v_accvgpr_read_b32 v17, a17
+; CHECK-NEXT:    v_accvgpr_read_b32 v18, a18
+; CHECK-NEXT:    v_accvgpr_read_b32 v19, a19
+; CHECK-NEXT:    v_accvgpr_read_b32 v20, a20
+; CHECK-NEXT:    v_accvgpr_read_b32 v21, a21
+; CHECK-NEXT:    v_accvgpr_read_b32 v22, a22
+; CHECK-NEXT:    v_accvgpr_read_b32 v23, a23
+; CHECK-NEXT:    v_accvgpr_read_b32 v28, a28
+; CHECK-NEXT:    v_accvgpr_read_b32 v29, a29
+; CHECK-NEXT:    v_accvgpr_read_b32 v30, a30
+; CHECK-NEXT:    v_accvgpr_read_b32 v31, a31
+; CHECK-NEXT:    global_store_dwordx4 v32, v[24:27], s[2:3] offset:96
+; CHECK-NEXT:    global_store_dwordx4 v32, v[28:31], s[2:3] offset:112
+; CHECK-NEXT:    global_store_dwordx4 v32, v[16:19], s[2:3] offset:64
+; CHECK-NEXT:    global_store_dwordx4 v32, v[20:23], s[2:3] offset:80
+; CHECK-NEXT:    global_store_dwordx4 v32, v[8:11], s[2:3] offset:32
+; CHECK-NEXT:    global_store_dwordx4 v32, v[12:15], s[2:3] offset:48
+; CHECK-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3]
+; CHECK-NEXT:    global_store_dwordx4 v32, v[4:7], s[2:3] offset:16
 ; CHECK-NEXT:    s_endpgm
   %src2 = call <32 x float> asm sideeffect "; def $0", "=a"()
   %mai0 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 2.0, float 4.0, <32 x float> %src2, i32 0, i32 0, i32 0)
@@ -727,18 +763,15 @@ define void @test_rewrite_mfma_copy_from_agpr_class_f64_4x4x4f64_chain(double %a
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v12, v31
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], v[2:3], a[0:1]
-; CHECK-NEXT:    v_and_b32_e32 v12, 0x3ff, v12
-; CHECK-NEXT:    s_nop 2
+; CHECK-NEXT:    v_and_b32_e32 v2, 0x3ff, v31
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    v_lshl_add_u64 v[2:3], v[8:9], 0, v[2:3]
 ; CHECK-NEXT:    v_mfma_f64_4x4x4_4b_f64 a[0:1], v[4:5], v[6:7], a[0:1]
 ; CHECK-NEXT:    s_nop 8
 ; CHECK-NEXT:    global_store_dwordx2 v[2:3], a[0:1], off
-; CHECK-NEXT:    v_lshlrev_b32_e32 v4, 3, v12
-; CHECK-NEXT:    v_mov_b32_e32 v5, 0
-; CHECK-NEXT:    v_lshl_add_u64 v[4:5], v[8:9], 0, v[4:5]
-; CHECK-NEXT:    s_nop 5
-; CHECK-NEXT:    global_store_dwordx2 v[4:5], a[0:1], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %src2 = call double asm sideeffect "; def $0", "=a"()
diff --git a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
index e77856d073a0b..a81d9a458e23a 100644
--- a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
@@ -311,44 +311,43 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def v[12:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_mov_b32_e32 v6, 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:31]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:31]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    global_store_dwordx4 v6, v[60:63], s[16:17] offset:112
+; CHECK-NEXT:    global_store_dwordx4 v0, v[60:63], s[16:17] offset:112
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, v[56:59], s[16:17] offset:96
+; CHECK-NEXT:    global_store_dwordx4 v0, v[56:59], s[16:17] offset:96
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, v[52:55], s[16:17] offset:80
+; CHECK-NEXT:    global_store_dwordx4 v0, v[52:55], s[16:17] offset:80
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, v[48:51], s[16:17] offset:64
+; CHECK-NEXT:    global_store_dwordx4 v0, v[48:51], s[16:17] offset:64
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, v[44:47], s[16:17] offset:48
+; CHECK-NEXT:    global_store_dwordx4 v0, v[44:47], s[16:17] offset:48
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, v[40:43], s[16:17] offset:32
+; CHECK-NEXT:    global_store_dwordx4 v0, v[40:43], s[16:17] offset:32
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, v[36:39], s[16:17] offset:16
+; CHECK-NEXT:    global_store_dwordx4 v0, v[36:39], s[16:17] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, v[32:35], s[16:17]
+; CHECK-NEXT:    global_store_dwordx4 v0, v[32:35], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, a[56:59], s[16:17] offset:96
+; CHECK-NEXT:    global_store_dwordx4 v0, a[56:59], s[16:17] offset:96
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, a[60:63], s[16:17] offset:112
+; CHECK-NEXT:    global_store_dwordx4 v0, a[60:63], s[16:17] offset:112
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, a[48:51], s[16:17] offset:64
+; CHECK-NEXT:    global_store_dwordx4 v0, a[48:51], s[16:17] offset:64
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, a[52:55], s[16:17] offset:80
+; CHECK-NEXT:    global_store_dwordx4 v0, a[52:55], s[16:17] offset:80
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, a[40:43], s[16:17] offset:32
+; CHECK-NEXT:    global_store_dwordx4 v0, a[40:43], s[16:17] offset:32
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, a[44:47], s[16:17] offset:48
+; CHECK-NEXT:    global_store_dwordx4 v0, a[44:47], s[16:17] offset:48
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, a[32:35], s[16:17]
+; CHECK-NEXT:    global_store_dwordx4 v0, a[32:35], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v6, a[36:39], s[16:17] offset:16
+; CHECK-NEXT:    global_store_dwordx4 v0, a[36:39], s[16:17] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx4 v0, v[8:11], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)

>From 04ef9a6352da4bacd8a793b86e6b2db61f4bff30 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Fri, 19 Sep 2025 23:31:46 -0400
Subject: [PATCH 04/18] Restored SIRegisterInfo files

---
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 32 -----------------------
 llvm/lib/Target/AMDGPU/SIRegisterInfo.h   |  4 +--
 2 files changed, 1 insertion(+), 35 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index d5ac52997dc57..205237fefe785 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3839,38 +3839,6 @@ bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
     }
     return false;
   }
-  case AMDGPURI::HasRegisterAvoidanceList: {
-    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-    ArrayRef<Register> AvoidRegs = MFI->getRegistersToAvoid(VirtReg);
-
-    if (AvoidRegs.empty())
-      return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
-                                                       MF, VRM);
-    // Collect physical registers to avoid
-    SmallSet<MCPhysReg, 32> AvoidPhysRegs;
-    for (Register AvoidReg : AvoidRegs) {
-      if (VRM && VRM->hasPhys(AvoidReg)) {
-        // Virtual register already mapped - try to avoid its physical register
-        MCPhysReg AvoidPhys = VRM->getPhys(AvoidReg);
-        for (MCRegAliasIterator AI(AvoidPhys, this, true); AI.isValid(); ++AI)
-          AvoidPhysRegs.insert(*AI);
-      }
-    }
-
-    if (AvoidPhysRegs.empty()) {
-      // No physical registers added yet - use default order
-      return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
-                                                       MF, VRM);
-    }
-
-    // Prioritize registers that don't conflict with avoided registers
-    for (MCPhysReg PhysReg : Order) {
-      if (!AvoidPhysRegs.count(PhysReg) && !MRI.isReserved(PhysReg))
-        Hints.push_back(PhysReg);
-    }
-
-    return false;
-  }
   default:
     return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
                                                      VRM);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index ed0c580abc952..7b91ba7bc581f 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -31,11 +31,9 @@ class RegisterBank;
 struct SGPRSpillBuilder;
 
 /// Register allocation hint types. Helps eliminate unneeded COPY with True16
-/// HasRegisterAvoidanceList helps with minimizing usage of conflicting physical
-/// registers
 namespace AMDGPURI {
 
-enum { Size16 = 1, Size32 = 2, HasRegisterAvoidanceList = 3 };
+enum { Size16 = 1, Size32 = 2 };
 
 } // end namespace AMDGPURI
 

>From 1ef544dfc7fa2bbb9615d01ed012e2a6c7d7c668 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Fri, 19 Sep 2025 23:33:47 -0400
Subject: [PATCH 05/18] Restored SIMachineFunctionInfo files

---
 llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 8df4c12b5a77d..45606153db58e 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -1212,20 +1212,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   unsigned getMaxNumWorkGroupsZ() const { return MaxNumWorkGroups[2]; }
 
   AMDGPU::ClusterDimsAttr getClusterDims() const { return ClusterDims; }
-
-  // Map of registers to avoid for a given register
-  DenseMap<Register, SmallVector<Register, 8>> RegisterAvoidanceMap;
-
-  void addRegisterToAvoid(Register VirtReg, Register AvoidReg) {
-    RegisterAvoidanceMap[VirtReg].push_back(AvoidReg);
-  }
-
-  ArrayRef<Register> getRegistersToAvoid(Register VirtReg) const {
-    auto It = RegisterAvoidanceMap.find(VirtReg);
-    if (It != RegisterAvoidanceMap.end())
-      return It->second;
-    return ArrayRef<Register>();
-  }
 };
 
 } // end namespace llvm

>From dac1e7676c91ed60dd97aea9ef1a6d6f5e32f1f5 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Sun, 21 Sep 2025 00:17:36 -0400
Subject: [PATCH 06/18] Updated sources to support anti-hint mechanism

---
 .../include/llvm/CodeGen/MIRParser/MIParser.h |  1 +
 llvm/include/llvm/CodeGen/MIRYamlMapping.h    |  3 +
 .../llvm/CodeGen/MachineRegisterInfo.h        | 56 +++++++++++++++
 llvm/lib/CodeGen/AllocationOrder.cpp          | 68 ++++++++++++++++++-
 llvm/lib/CodeGen/AllocationOrder.h            |  7 ++
 llvm/lib/CodeGen/MIRParser/MIRParser.cpp      | 19 ++++++
 llvm/lib/CodeGen/MIRPrinter.cpp               | 11 +++
 llvm/lib/CodeGen/MachineRegisterInfo.cpp      | 27 ++++++++
 .../Target/AMDGPU/GCNPreRAOptimizations.cpp   | 14 +---
 9 files changed, 192 insertions(+), 14 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h
index 0f2898d3554d0..1d0a745d5f983 100644
--- a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h
+++ b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h
@@ -45,6 +45,7 @@ struct VRegInfo {
   } D;
   Register VReg;
   Register PreferredReg;
+  SmallVector<Register, 4> AntiHints;  // Anti-hints
   uint8_t Flags = 0;
 };
 
diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
index a91c26ee1122a..a700fa29d573a 100644
--- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h
+++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
@@ -192,6 +192,7 @@ struct VirtualRegisterDefinition {
   StringValue Class;
   StringValue PreferredRegister;
   std::vector<FlowStringValue> RegisterFlags;
+  std::vector<FlowStringValue> AntiHints;
 
   // TODO: Serialize the target specific register hints.
 
@@ -209,6 +210,8 @@ template <> struct MappingTraits<VirtualRegisterDefinition> {
                        StringValue()); // Don't print out when it's empty.
     YamlIO.mapOptional("flags", Reg.RegisterFlags,
                        std::vector<FlowStringValue>());
+    YamlIO.mapOptional("anti-hints", Reg.AntiHints,
+                       std::vector<FlowStringValue>());  // For anti-hints.
   }
 
   static const bool flow = true;
diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index 27b30bd5929ff..bcee5d6b30439 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -42,6 +42,7 @@
 namespace llvm {
 
 class PSetIterator;
+class VirtRegMap;
 
 /// Convenient type to represent either a register class or a register bank.
 using RegClassOrRegBank =
@@ -107,6 +108,12 @@ class MachineRegisterInfo {
              VirtReg2IndexFunctor>
       RegAllocHints;
 
+  /// AntiHintRegs - This vector records register anti-hints for
+  /// virtual registers. For each virtual register, it keeps a vector of virtual
+  /// registers that should NOT be allocated to the same or overlapping physical
+  /// registers.
+  IndexedMap<SmallVector<Register, 4>, VirtReg2IndexFunctor> AntiHintRegs;
+
   /// PhysRegUseDefLists - This is an array of the head of the use/def list for
   /// physical registers.
   std::unique_ptr<MachineOperand *[]> PhysRegUseDefLists;
@@ -860,6 +867,55 @@ class MachineRegisterInfo {
     return RegAllocHints.inBounds(VReg) ? &RegAllocHints[VReg] : nullptr;
   }
 
+  /// setRegAllocationAntiHint - Add a register allocation anti-hint for the
+  /// specified virtual register. This tells the allocator to avoid allocating
+  /// VReg to the same physical register as AntiHintVReg (or overlapping ones).
+  void setRegAllocationAntiHint(Register VReg, Register AntiHintVReg) {
+    assert(VReg.isVirtual() && "Anti-hints are only for virtual registers");
+    assert(AntiHintVReg.isVirtual() && "Anti-hint target must be virtual");
+    AntiHintRegs.grow(Register::index2VirtReg(getNumVirtRegs()));
+    auto &AntiHints = AntiHintRegs[VReg];
+    // Avoid duplicates
+    if (llvm::find(AntiHints, AntiHintVReg) == AntiHints.end())
+      AntiHints.push_back(AntiHintVReg);
+  }
+
+  /// addRegAllocationAntiHints - Add multiple anti-hints at once
+  void addRegAllocationAntiHints(Register VReg, ArrayRef<Register> AntiHintVRegs) {
+    for (Register AntiHint : AntiHintVRegs)
+      setRegAllocationAntiHint(VReg, AntiHint);
+  }
+
+  /// clearRegAllocationAntiHints - Clear all anti-hints for a register
+  void clearRegAllocationAntiHints(Register VReg) {
+    assert(VReg.isVirtual());
+    if (AntiHintRegs.inBounds(VReg))
+      AntiHintRegs[VReg].clear();
+  }
+
+  /// getRegAllocationAntiHints - Return the vector of anti-hints for VReg
+  ArrayRef<Register> getRegAllocationAntiHints(Register VReg) const {
+    assert(VReg.isVirtual());
+    if (!AntiHintRegs.inBounds(VReg))
+      return ArrayRef<Register>();
+    return AntiHintRegs[VReg];
+  }
+
+  /// hasRegAllocationAntiHint - Check if VReg has AntiHintVReg as an anti-hint
+  bool hasRegAllocationAntiHint(Register VReg, Register AntiHintVReg) const {
+    assert(VReg.isVirtual() && AntiHintVReg.isVirtual());
+    if (!AntiHintRegs.inBounds(VReg))
+      return false;
+    const auto &AntiHints = AntiHintRegs[VReg];
+    return llvm::find(AntiHints, AntiHintVReg) != AntiHints.end();
+  }
+
+  /// getPhysRegAntiHints - Get the set of physical registers to avoid based on
+  /// anti-hints and current allocations. This is called during allocation.
+  /// VRM is the current virtual register map showing allocations made so far.
+  void getPhysRegAntiHints(Register VReg, SmallVectorImpl<MCPhysReg> &PhysAntiHints,
+                          const VirtRegMap *VRM) const;
+
   /// markUsesInDebugValueAsUndef - Mark every DBG_VALUE referencing the
   /// specified register as undefined which causes the DBG_VALUE to be
   /// deleted during LiveDebugVariables analysis.
diff --git a/llvm/lib/CodeGen/AllocationOrder.cpp b/llvm/lib/CodeGen/AllocationOrder.cpp
index 183dc8af1b91b..f57df79128c64 100644
--- a/llvm/lib/CodeGen/AllocationOrder.cpp
+++ b/llvm/lib/CodeGen/AllocationOrder.cpp
@@ -31,6 +31,7 @@ AllocationOrder AllocationOrder::create(Register VirtReg, const VirtRegMap &VRM,
                                         const LiveRegMatrix *Matrix) {
   const MachineFunction &MF = VRM.getMachineFunction();
   const TargetRegisterInfo *TRI = &VRM.getTargetRegInfo();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
   auto Order = RegClassInfo.getOrder(MF.getRegInfo().getRegClass(VirtReg));
   SmallVector<MCPhysReg, 16> Hints;
   bool HardHints =
@@ -44,8 +45,69 @@ AllocationOrder AllocationOrder::create(Register VirtReg, const VirtRegMap &VRM,
       dbgs() << '\n';
     }
   });
-  assert(all_of(Hints,
-                [&](MCPhysReg Hint) { return is_contained(Order, Hint); }) &&
+
+  // Get anti-hints
+  SmallVector<MCPhysReg, 16> AntiHintedPhysRegs;
+  MRI.getPhysRegAntiHints(VirtReg, AntiHintedPhysRegs, &VRM);
+  
+  LLVM_DEBUG({
+    if (!AntiHintedPhysRegs.empty()) {
+      dbgs() << "anti-hints:";
+      for (MCPhysReg AntiHint : AntiHintedPhysRegs)
+        dbgs() << ' ' << printReg(AntiHint, TRI);
+      dbgs() << '\n';
+    }
+  });
+  
+  // Create allocation order object
+  AllocationOrder AO(std::move(Hints), Order, HardHints);
+  
+  // Apply anti-hint filtering if needed
+  if (!AntiHintedPhysRegs.empty()) {
+    AO.applyAntiHints(AntiHintedPhysRegs, TRI);
+    
+    LLVM_DEBUG({
+      if (!AO.Hints.empty()) {
+        dbgs() << "filtered hints:";
+        for (MCPhysReg Hint : AO.Hints)
+          dbgs() << ' ' << printReg(Hint, TRI);
+        dbgs() << '\n';
+      }
+    });
+  }
+
+
+  assert(all_of(AO.Hints,
+                [&](MCPhysReg Hint) { return is_contained(AO.Order, Hint); }) &&
          "Target hint is outside allocation order.");
-  return AllocationOrder(std::move(Hints), Order, HardHints);
+  return AO;
+}
+
+/// Demote every register of Order that appears in AntiHintedPhysRegs to the
+/// tail of the allocation order, keeping the relative order inside both the
+/// preferred and the demoted group. Anti-hinted registers stay allocatable as
+/// a last resort. Order is left untouched when nothing needs demoting.
+void AllocationOrder::applyAntiHints(ArrayRef<MCPhysReg> AntiHintedPhysRegs,
+                                     const TargetRegisterInfo *TRI) {
+  (void)TRI; // Not needed for the reordering; kept for interface stability.
+  // Build the result in locals: Order may already alias FilteredOrderStorage
+  // from an earlier call, so that storage must not be cleared while reading.
+  SmallVector<MCPhysReg, 16> Preferred, Demoted;
+  for (MCPhysReg PhysReg : Order) {
+    if (is_contained(AntiHintedPhysRegs, PhysReg))
+      Demoted.push_back(PhysReg);
+    else
+      Preferred.push_back(PhysReg);
+  }
+  if (Demoted.empty())
+    return; // No anti-hinted register is part of Order.
+
+  // Report the number of registers actually moved, not the anti-hint count.
+  LLVM_DEBUG(dbgs() << "moved " << Demoted.size()
+                    << " anti-hinted registers to end of allocation order\n");
+  Preferred.append(Demoted.begin(), Demoted.end());
+  FilteredOrderStorage = std::move(Preferred);
+  // NOTE(review): Order now views storage owned by *this; confirm a copied or
+  // moved AllocationOrder does not keep pointing at the source's storage.
+  Order = FilteredOrderStorage;
 }
diff --git a/llvm/lib/CodeGen/AllocationOrder.h b/llvm/lib/CodeGen/AllocationOrder.h
index 3dd02c3b14d3a..842f83d957a6d 100644
--- a/llvm/lib/CodeGen/AllocationOrder.h
+++ b/llvm/lib/CodeGen/AllocationOrder.h
@@ -20,6 +20,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/Register.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
 
 namespace llvm {
 
@@ -29,6 +30,7 @@ class LiveRegMatrix;
 
 class LLVM_LIBRARY_VISIBILITY AllocationOrder {
   const SmallVector<MCPhysReg, 16> Hints;
+  SmallVector<MCPhysReg, 16> FilteredOrderStorage;
   ArrayRef<MCPhysReg> Order;
   // How far into the Order we can iterate. This is 0 if the AllocationOrder is
   // constructed with HardHints = true, Order.size() otherwise. While
@@ -117,6 +119,11 @@ class LLVM_LIBRARY_VISIBILITY AllocationOrder {
                static_cast<uint32_t>(std::numeric_limits<MCPhysReg>::max()));
     return Reg.isPhysical() && is_contained(Hints, Reg.id());
   }
+  
+  /// Apply anti-hints: move the anti-hinted physical registers to the end of the allocation order.
+  void applyAntiHints(ArrayRef<MCPhysReg> AntiHintedPhysRegs, 
+                      const TargetRegisterInfo *TRI);
+
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
index bb70e7805e818..1110823a2ca5a 100644
--- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -735,6 +735,20 @@ bool MIRParserImpl::parseRegisterInfo(PerFunctionMIParsingState &PFS,
                          FlagStringValue.Value + "'");
       Info.Flags |= FlagValue;
     }
+
+    for (const auto &AntiHintValue : VReg.AntiHints) {
+      if (Info.Kind != VRegInfo::NORMAL)
+        return error(VReg.Class.SourceRange.Start,
+              Twine("anti-hints can only be set for normal vregs"));
+
+      Register AntiHintReg;
+      if (parseRegisterReference(PFS, AntiHintReg,
+                                 AntiHintValue.Value, Error))
+        return error(Error, AntiHintValue.SourceRange);
+      
+      Info.AntiHints.push_back(AntiHintReg);
+    }
+
     RegInfo.noteNewVirtualRegister(Info.VReg);
   }
 
@@ -801,6 +815,11 @@ bool MIRParserImpl::setupRegisterInfo(const PerFunctionMIParsingState &PFS,
       MRI.setRegClass(Reg, Info.D.RC);
       if (Info.PreferredReg != 0)
         MRI.setSimpleHint(Reg, Info.PreferredReg);
+
+      for (Register AntiHint : Info.AntiHints) {
+        if (AntiHint != 0)
+          MRI.setRegAllocationAntiHint(Reg, AntiHint);
+      }
       break;
     case VRegInfo::GENERIC:
       break;
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index 91a21a4adf4eb..51f7d506b3e99 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -313,6 +313,17 @@ static void convertMRI(yaml::MachineFunction &YamlMF, const MachineFunction &MF,
     if (PreferredReg)
       printRegMIR(PreferredReg, VReg.PreferredRegister, TRI);
     printRegFlags(Reg, VReg.RegisterFlags, MF, TRI);
+    // Print the anti-hints.
+    const auto &AntiHints = RegInfo.getRegAllocationAntiHints(Reg);
+    if (!AntiHints.empty()) {
+      std::vector<yaml::FlowStringValue> AntiHintStrings;
+      for (Register AntiHint : AntiHints) {
+        yaml::FlowStringValue AntiHintStr;
+        printRegMIR(AntiHint, AntiHintStr, TRI);
+        AntiHintStrings.push_back(std::move(AntiHintStr));
+      }
+      VReg.AntiHints = std::move(AntiHintStrings);
+    }
     YamlMF.VirtualRegisters.push_back(std::move(VReg));
   }
 
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index abb3f3e612000..c169315c555d5 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/DebugLoc.h"
@@ -676,3 +677,29 @@ bool MachineRegisterInfo::isReservedRegUnit(unsigned Unit) const {
   }
   return false;
 }
+
+void MachineRegisterInfo::getPhysRegAntiHints(Register VReg, 
+                                             SmallVectorImpl<MCPhysReg> &PhysAntiHints,
+                                             const VirtRegMap *VRM) const {
+  assert(VReg.isVirtual());
+  if (!AntiHintRegs.inBounds(VReg) || !VRM) // no slot for VReg, or no assignments to consult
+    return;
+  
+  const auto &AntiHints = AntiHintRegs[VReg];
+  const TargetRegisterInfo *TRI = getTargetRegisterInfo();
+  
+  for (Register AntiHintVReg : AntiHints) {
+    // Only anti-hinted vregs that VRM has already assigned contribute registers.
+    if (VRM->hasPhys(AntiHintVReg)) {
+      MCPhysReg PhysReg = VRM->getPhys(AntiHintVReg);
+      // Add the assigned physical register and all of its aliases.
+      for (MCRegAliasIterator AI(PhysReg, TRI, true); AI.isValid(); ++AI) {
+        PhysAntiHints.push_back(*AI);
+      }
+    }
+  }
+  
+  // Sort, then drop duplicates; this also reorders entries the caller passed in.
+  llvm::sort(PhysAntiHints);
+  PhysAntiHints.erase(llvm::unique(PhysAntiHints), PhysAntiHints.end());
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index ed349fccfa3e4..1a8cd84f7640a 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -323,17 +323,9 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
               // Check if MFMA register is dead at current instruction
               const LiveInterval &MFMAInterval = LIS->getInterval(MFMAReg);
               if (!MFMAInterval.liveAt(CurrentSlot)) {
-
-                // Add bidirectional avoidance hint
-                MFI->addRegisterToAvoid(CandidateReg, MFMAReg);
-                MFI->addRegisterToAvoid(MFMAReg, CandidateReg);
-
-                // Set hint if we found registers to avoid
-                MRI->setRegAllocationHint(
-                    MFMAReg, AMDGPURI::HasRegisterAvoidanceList, Register());
-                MRI->setRegAllocationHint(CandidateReg,
-                                          AMDGPURI::HasRegisterAvoidanceList,
-                                          Register());
+                // Add bidirectional antihints
+                MRI->addRegAllocationAntiHints(CandidateReg, MFMARegs);
+                MRI->addRegAllocationAntiHints(MFMAReg, CandidateReg);
               }
             }
           }

>From f41a126c6c6342fa6fb8c3ed8c4c83ad50bd505a Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Sun, 21 Sep 2025 01:58:40 -0400
Subject: [PATCH 07/18] Made anti-hints map conditional in MIRYamlMapping

---
 llvm/include/llvm/CodeGen/MIRYamlMapping.h       | 6 ++++--
 llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
index a700fa29d573a..3e4b57da91479 100644
--- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h
+++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
@@ -210,8 +210,10 @@ template <> struct MappingTraits<VirtualRegisterDefinition> {
                        StringValue()); // Don't print out when it's empty.
     YamlIO.mapOptional("flags", Reg.RegisterFlags,
                        std::vector<FlowStringValue>());
-    YamlIO.mapOptional("anti-hints", Reg.AntiHints,
-                       std::vector<FlowStringValue>());  // For anti-hints.
+    if(!YamlIO.outputting() || !Reg.AntiHints.empty()) {  // Only map when parsing or anti-hints present
+      YamlIO.mapOptional("anti-hints", Reg.AntiHints,
+                       std::vector<FlowStringValue>());  // for anti-hints
+    }
   }
 
   static const bool flow = true;
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 1a8cd84f7640a..f63eea716d68b 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -48,7 +48,7 @@ static cl::opt<bool> EnableRegisterAvoidListForMFMARegs(
     "amdgpu-avoid-hazard-hint-for-mfma", cl::Hidden,
     cl::desc("Enable Register Avoidance for "
              "MFMA in GCNPreRAOptimizations stage."),
-    cl::init(false));
+    cl::init(true));
 
 namespace {
 

>From c819285cd35d7f49c95494ec681c1b100a1160cd Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Sun, 21 Sep 2025 02:14:53 -0400
Subject: [PATCH 08/18] Updated tests

---
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir |  538 +++--
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir |  542 ++---
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll |  204 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll |  120 +-
 .../AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll    |   62 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll |  464 ++--
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll  |  150 +-
 ...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll |  271 +--
 .../AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll    |   12 +-
 .../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll       | 1897 ++++++++++-------
 .../AMDGPU/rewrite-vgpr-mfma-to-agpr.ll       |  124 +-
 .../unspill-vgpr-after-rewrite-vgpr-mfma.ll   |  170 +-
 12 files changed, 2477 insertions(+), 2077 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
index b07dec326327e..3d9be93573ac9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
@@ -15,9 +15,12 @@
   ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
   ; GCN-NEXT:    ; implicit-def: $vgpr106
   ; GCN-NEXT:    ; implicit-def: $vgpr132
+  ; GCN-NEXT:    ; implicit-def: $vgpr112
+  ; GCN-NEXT:    ; implicit-def: $vgpr113
+  ; GCN-NEXT:    ; implicit-def: $vgpr114
+  ; GCN-NEXT:    ; implicit-def: $vgpr115
   ; GCN-NEXT:    ; implicit-def: $vgpr133
   ; GCN-NEXT:    ; implicit-def: $vgpr139
-  ; GCN-NEXT:    ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127
   ; GCN-NEXT:    ; iglp_opt mask(0x00000002)
   ; GCN-NEXT:    ; implicit-def: $sgpr0
   ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -167,46 +170,45 @@
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b128 v95, v[68:71] offset:1024
-  ; GCN-NEXT:    ; implicit-def: $vgpr64
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
-  ; GCN-NEXT:    v_add_u32_e32 v72, 0xc0, v93
-  ; GCN-NEXT:    ; implicit-def: $vgpr73
-  ; GCN-NEXT:    v_add_u32_e32 v76, v132, v64
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:192 sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
+  ; GCN-NEXT:    v_add_u32_e32 v72, 0xc0, v93
+  ; GCN-NEXT:    v_add_u32_e32 v73, v132, v112
   ; GCN-NEXT:    buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ; kill: killed $vgpr72
-  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v73
-  ; GCN-NEXT:    buffer_load_dwordx2 v[98:99], v76, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v113
+  ; GCN-NEXT:    buffer_load_dwordx2 v[98:99], v73, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    buffer_load_dwordx2 v[102:103], v72, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
-  ; GCN-NEXT:    ; implicit-def: $vgpr74
-  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v74
-  ; GCN-NEXT:    ; implicit-def: $vgpr75
+  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v114
   ; GCN-NEXT:    buffer_load_dwordx2 v[100:101], v72, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v75
+  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v115
   ; GCN-NEXT:    buffer_load_dwordx2 v[104:105], v72, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
+  ; GCN-NEXT:    ; kill: killed $vgpr73
   ; GCN-NEXT:    ds_read_b128 v[72:75], v94
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ; kill: killed $vgpr76
   ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
   ; GCN-NEXT:    ; implicit-def: $sgpr8
+  ; GCN-NEXT:    ; implicit-def: $vgpr112
+  ; GCN-NEXT:    ; implicit-def: $vgpr113
+  ; GCN-NEXT:    ; implicit-def: $vgpr114
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
   ; GCN-NEXT:    ds_read_b128 v[72:75], v94 offset:512
@@ -411,8 +413,6 @@
   ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v66
   ; GCN-NEXT:    ; implicit-def: $vgpr65
   ; GCN-NEXT:    ; implicit-def: $vgpr66
-  ; GCN-NEXT:    ; implicit-def: $vgpr68
-  ; GCN-NEXT:    ; implicit-def: $vgpr67
   ; GCN-NEXT:    v_add_u32_e32 v65, s7, v65
   ; GCN-NEXT:    v_and_b32_e32 v65, 0x1fffffff, v65
   ; GCN-NEXT:    v_mul_lo_u32 v65, v65, s6
@@ -440,40 +440,36 @@
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b64 v138, v[96:97]
-  ; GCN-NEXT:    v_add_u32_e32 v68, v132, v68
+  ; GCN-NEXT:    ; implicit-def: $vgpr96
   ; GCN-NEXT:    v_cndmask_b32_e64 v64, v65, v64, s[6:7]
   ; GCN-NEXT:    v_max_f32_e32 v64, v64, v64
   ; GCN-NEXT:    ; implicit-def: $vgpr65
   ; GCN-NEXT:    v_max_f32_e32 v66, v65, v65
   ; GCN-NEXT:    v_max_f32_e32 v134, v66, v64
-  ; GCN-NEXT:    ; implicit-def: $vgpr64
+  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v96
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_load_dwordx2 v[156:157], v68, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[160:161], v64, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v64
-  ; GCN-NEXT:    buffer_load_dwordx2 v[158:159], v64, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v112
+  ; GCN-NEXT:    buffer_load_dwordx2 v[162:163], v64, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ; implicit-def: $vgpr66
-  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v66
+  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v113
   ; GCN-NEXT:    buffer_load_dwordx2 v[128:129], v64, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v67
+  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v114
   ; GCN-NEXT:    buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_fma_f32 v57, s4, v57, -v134
   ; GCN-NEXT:    v_fma_f32 v48, s4, v48, -v134
-  ; GCN-NEXT:    v_fma_f32 v96, s4, v58, -v134
-  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v57
+  ; GCN-NEXT:    v_fma_f32 v57, s4, v57, -v134
   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v48
   ; GCN-NEXT:    v_fma_f32 v64, s4, v49, -v134
-  ; GCN-NEXT:    v_exp_f32_e32 v163, v57
-  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v96
+  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v57
   ; GCN-NEXT:    v_fma_f32 v66, s4, v50, -v134
-  ; GCN-NEXT:    v_exp_f32_e32 v164, v57
+  ; GCN-NEXT:    v_exp_f32_e32 v165, v57
   ; GCN-NEXT:    v_exp_f32_e32 v49, v48
   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v64
   ; GCN-NEXT:    v_fma_f32 v67, s4, v51, -v134
@@ -499,31 +495,30 @@
   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v70
   ; GCN-NEXT:    v_exp_f32_e32 v55, v48
   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v71
-  ; GCN-NEXT:    ds_read_b128 v[144:147], v139 offset:576
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_fma_f32 v66, s4, v56, -v134
   ; GCN-NEXT:    v_exp_f32_e32 v56, v48
   ; GCN-NEXT:    v_sub_f32_e32 v48, v65, v134
+  ; GCN-NEXT:    ds_read_b128 v[144:147], v139 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v64, v49
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v67, v50
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v68, v51
+  ; GCN-NEXT:    v_fma_f32 v96, s4, v58, -v134
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v58, v52
   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v48
   ; GCN-NEXT:    ds_read_b128 v[148:151], v139 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_exp_f32_e32 v48, v48
-  ; GCN-NEXT:    v_pack_b32_f16 v161, v68, v58
-  ; GCN-NEXT:    v_pack_b32_f16 v160, v64, v67
-  ; GCN-NEXT:    v_mul_f32_e32 v58, 0x3fb8aa3b, v66
+  ; GCN-NEXT:    v_fma_f32 v156, s4, v59, -v134
+  ; GCN-NEXT:    v_pack_b32_f16 v59, v68, v58
+  ; GCN-NEXT:    v_pack_b32_f16 v58, v64, v67
+  ; GCN-NEXT:    v_mul_f32_e32 v80, 0x3fb8aa3b, v66
   ; GCN-NEXT:    ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79
   ; GCN-NEXT:    ds_read_b128 v[152:155], v139 offset:1728
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_fma_f32 v162, s4, v61, -v134
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v61, v55
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v57, v56
   ; GCN-NEXT:    v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0]
@@ -532,288 +527,287 @@
   ; GCN-NEXT:    v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v96
+  ; GCN-NEXT:    ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111
+  ; GCN-NEXT:    v_fma_f32 v157, s4, v60, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[58:59], v[64:79]
+  ; GCN-NEXT:    v_exp_f32_e32 v141, v80
   ; GCN-NEXT:    ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
-  ; GCN-NEXT:    v_fma_f32 v59, s4, v59, -v134
+  ; GCN-NEXT:    v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79]
-  ; GCN-NEXT:    v_mul_f32_e64 v82, v82, v48
-  ; GCN-NEXT:    v_mul_f32_e64 v83, v83, v48
-  ; GCN-NEXT:    v_mul_f32_e64 v84, v84, v48
-  ; GCN-NEXT:    v_mul_f32_e64 v85, v85, v48
-  ; GCN-NEXT:    v_mul_f32_e64 v86, v86, v48
-  ; GCN-NEXT:    v_mul_f32_e64 v87, v87, v48
+  ; GCN-NEXT:    v_pk_mul_f32 v[82:83], v[82:83], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[84:85], v[84:85], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[86:87], v[86:87], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0]
-  ; GCN-NEXT:    ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111
-  ; GCN-NEXT:    v_exp_f32_e32 v58, v58
-  ; GCN-NEXT:    v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[160:161], v[80:95]
-  ; GCN-NEXT:    v_mul_f32_e64 v98, v98, v48
-  ; GCN-NEXT:    v_mul_f32_e64 v99, v99, v48
-  ; GCN-NEXT:    v_mul_f32_e64 v100, v100, v48
-  ; GCN-NEXT:    v_mul_f32_e64 v101, v101, v48
-  ; GCN-NEXT:    v_mul_f32_e64 v102, v102, v48
-  ; GCN-NEXT:    v_mul_f32_e64 v103, v103, v48
+  ; GCN-NEXT:    v_pk_mul_f32 v[98:99], v[98:99], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[100:101], v[100:101], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[102:103], v[102:103], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pack_b32_f16 v145, v61, v57
-  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v59
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v140, v53
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v141, v54
-  ; GCN-NEXT:    v_exp_f32_e32 v59, v57
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111]
-  ; GCN-NEXT:    v_fma_f32 v60, s4, v60, -v134
-  ; GCN-NEXT:    v_mul_f32_e64 v112, v112, v48
-  ; GCN-NEXT:    v_mul_f32_e64 v113, v113, v48
-  ; GCN-NEXT:    v_mul_f32_e64 v114, v114, v48
-  ; GCN-NEXT:    v_mul_f32_e64 v115, v115, v48
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[58:59], v[80:95]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v144, v54
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v145, v55
+  ; GCN-NEXT:    v_exp_f32_e32 v167, v57
+  ; GCN-NEXT:    ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127
+  ; GCN-NEXT:    v_mul_f32_e32 v168, 0x3fb8aa3b, v157
+  ; GCN-NEXT:    v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[114:115], v[114:115], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[58:59], v[96:111]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v148, v56
+  ; GCN-NEXT:    v_mul_f32_e64 v118, v118, v48
+  ; GCN-NEXT:    v_mul_f32_e64 v119, v119, v48
+  ; GCN-NEXT:    v_mul_f32_e64 v120, v120, v48
+  ; GCN-NEXT:    v_mul_f32_e64 v121, v121, v48
   ; GCN-NEXT:    v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_fma_f32 v148, s4, v62, -v134
-  ; GCN-NEXT:    v_pack_b32_f16 v144, v140, v141
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[160:161], v[112:127]
+  ; GCN-NEXT:    v_pack_b32_f16 v149, v145, v148
+  ; GCN-NEXT:    v_pack_b32_f16 v148, v140, v144
+  ; GCN-NEXT:    v_mul_f32_e32 v140, 0x3fb8aa3b, v156
+  ; GCN-NEXT:    v_exp_f32_e32 v168, v168
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[58:59], v[112:127]
+  ; GCN-NEXT:    v_exp_f32_e32 v153, v140
+  ; GCN-NEXT:    ; implicit-def: $vgpr140
+  ; GCN-NEXT:    v_fma_f32 v164, s4, v61, -v134
+  ; GCN-NEXT:    v_fma_f32 v166, s4, v62, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v169, v141
   ; GCN-NEXT:    v_fma_f32 v152, s4, v63, -v134
-  ; GCN-NEXT:    v_mul_f32_e32 v149, 0x3fb8aa3b, v60
-  ; GCN-NEXT:    ; implicit-def: $vgpr57
-  ; GCN-NEXT:    ds_read_b128 v[60:63], v57
+  ; GCN-NEXT:    v_fma_f32 v32, s4, v32, -v134
+  ; GCN-NEXT:    v_fma_f32 v57, s4, v35, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[148:149], v[64:79]
+  ; GCN-NEXT:    ds_read_b128 v[142:145], v140
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_exp_f32_e32 v160, v149
-  ; GCN-NEXT:    v_fma_f32 v161, s4, v33, -v134
-  ; GCN-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v148
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v153, v58
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79]
-  ; GCN-NEXT:    v_fma_f32 v32, s4, v32, -v134
-  ; GCN-NEXT:    ds_read_b128 v[140:143], v57 offset:576
+  ; GCN-NEXT:    ds_read_b128 v[156:159], v140 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_fma_f32 v40, s4, v40, -v134
   ; GCN-NEXT:    v_fma_f32 v44, s4, v44, -v134
   ; GCN-NEXT:    v_fma_f32 v16, s4, v16, -v134
-  ; GCN-NEXT:    v_fma_f32 v166, s4, v20, -v134
   ; GCN-NEXT:    v_fma_f32 v24, s4, v24, -v134
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95]
-  ; GCN-NEXT:    v_mul_f32_e32 v146, 0x3fb8aa3b, v162
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v147, v163
-  ; GCN-NEXT:    v_exp_f32_e32 v162, v146
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v146, v164
   ; GCN-NEXT:    v_fma_f32 v28, s4, v28, -v134
-  ; GCN-NEXT:    v_pack_b32_f16 v148, v153, v147
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[148:149], v[80:95]
+  ; GCN-NEXT:    v_mul_f32_e32 v146, 0x3fb8aa3b, v164
+  ; GCN-NEXT:    v_fma_f32 v164, s4, v33, -v134
+  ; GCN-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v166
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v147, v165
+  ; GCN-NEXT:    v_exp_f32_e32 v170, v146
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v146, v167
   ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v134
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[144:145], v[96:111]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[148:149], v[96:111]
   ; GCN-NEXT:    v_exp_f32_e32 v151, v33
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v59
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v153
+  ; GCN-NEXT:    v_pack_b32_f16 v62, v169, v147
   ; GCN-NEXT:    v_fma_f32 v150, s4, v34, -v134
-  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v134
-  ; GCN-NEXT:    v_fma_f32 v12, s4, v12, -v134
-  ; GCN-NEXT:    v_pack_b32_f16 v149, v146, v33
+  ; GCN-NEXT:    v_perm_b32 v147, v131, v129, s8
+  ; GCN-NEXT:    v_pack_b32_f16 v63, v146, v33
   ; GCN-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v152
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[144:145], v[112:127]
-  ; GCN-NEXT:    v_fma_f32 v152, s4, v35, -v134
-  ; GCN-NEXT:    v_exp_f32_e32 v153, v33
-  ; GCN-NEXT:    v_fma_f32 v155, s4, v36, -v134
-  ; GCN-NEXT:    v_perm_b32 v36, v158, v156, s5
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v154, v160
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[60:61], v[148:149], v[64:79]
-  ; GCN-NEXT:    v_mul_f32_e32 v60, 0x3fb8aa3b, v32
-  ; GCN-NEXT:    ds_read_b128 v[32:35], v57 offset:1152
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[144:147], v57 offset:1728
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mul_f32_e32 v61, 0x3fb8aa3b, v161
-  ; GCN-NEXT:    v_exp_f32_e32 v165, v60
-  ; GCN-NEXT:    v_perm_b32 v60, v158, v156, s8
-  ; GCN-NEXT:    v_fma_f32 v158, s4, v37, -v134
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[148:149], v[80:95]
-  ; GCN-NEXT:    v_exp_f32_e32 v161, v61
-  ; GCN-NEXT:    v_perm_b32 v140, v159, v157, s8
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[148:149], v[112:127]
+  ; GCN-NEXT:    v_exp_f32_e32 v148, v33
+  ; GCN-NEXT:    v_fma_f32 v152, s4, v36, -v134
+  ; GCN-NEXT:    v_perm_b32 v36, v162, v160, s5
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v149, v168
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v155, v170
+  ; GCN-NEXT:    v_perm_b32 v146, v163, v161, s8
+  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[62:63], v[64:79]
+  ; GCN-NEXT:    v_mul_f32_e32 v142, 0x3fb8aa3b, v32
+  ; GCN-NEXT:    ds_read_b128 v[32:35], v140 offset:1152
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[58:61], v140 offset:1728
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mul_f32_e32 v143, 0x3fb8aa3b, v164
+  ; GCN-NEXT:    v_exp_f32_e32 v154, v142
+  ; GCN-NEXT:    v_perm_b32 v142, v162, v160, s8
+  ; GCN-NEXT:    v_fma_f32 v160, s4, v38, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[156:157], v[62:63], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v157, v143
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v38, v148
+  ; GCN-NEXT:    v_fma_f32 v156, s4, v37, -v134
   ; GCN-NEXT:    v_perm_b32 v37, v130, v128, s5
-  ; GCN-NEXT:    v_perm_b32 v61, v130, v128, s8
-  ; GCN-NEXT:    v_perm_b32 v141, v131, v129, s8
+  ; GCN-NEXT:    v_perm_b32 v143, v130, v128, s8
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    ds_write_b64 v135, v[36:37]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[148:149], v[96:111]
-  ; GCN-NEXT:    v_perm_b32 v32, v159, v157, s5
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[62:63], v[96:111]
   ; GCN-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v150
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v150, v151
-  ; GCN-NEXT:    v_fma_f32 v157, s4, v38, -v134
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v38, v153
-  ; GCN-NEXT:    v_exp_f32_e32 v159, v33
+  ; GCN-NEXT:    v_perm_b32 v32, v163, v161, s5
+  ; GCN-NEXT:    v_exp_f32_e32 v161, v33
   ; GCN-NEXT:    v_perm_b32 v33, v131, v129, s5
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[144:145], v[148:149], v[112:127]
-  ; GCN-NEXT:    v_pack_b32_f16 v129, v150, v38
-  ; GCN-NEXT:    v_mul_f32_e32 v38, 0x3fb8aa3b, v152
-  ; GCN-NEXT:    v_exp_f32_e32 v152, v38
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b64 v136, v[60:61]
+  ; GCN-NEXT:    ds_write_b64 v136, v[142:143]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b64 v137, v[32:33]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[58:59], v[62:63], v[112:127]
+  ; GCN-NEXT:    v_pack_b32_f16 v59, v150, v38
+  ; GCN-NEXT:    v_mul_f32_e32 v38, 0x3fb8aa3b, v57
+  ; GCN-NEXT:    v_pack_b32_f16 v58, v149, v155
+  ; GCN-NEXT:    v_exp_f32_e32 v149, v38
   ; GCN-NEXT:    ; implicit-def: $vgpr33
   ; GCN-NEXT:    ; implicit-def: $vgpr38
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b64 v138, v[140:141]
+  ; GCN-NEXT:    ds_write_b64 v138, v[146:147]
   ; GCN-NEXT:    v_add_u32_e32 v38, v132, v38
   ; GCN-NEXT:    v_add_u32_e32 v33, v132, v33
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_load_dwordx2 v[130:131], v38, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[62:63], v38, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx2 v[140:141], v33, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[142:143], v33, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ; implicit-def: $vgpr36
   ; GCN-NEXT:    v_add_u32_e32 v33, v132, v36
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[144:145], v[58:59], v[64:79]
   ; GCN-NEXT:    ; implicit-def: $vgpr37
   ; GCN-NEXT:    buffer_load_dwordx2 v[144:145], v33, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_add_u32_e32 v33, v132, v37
-  ; GCN-NEXT:    buffer_load_dwordx2 v[148:149], v33, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[146:147], v33, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v156, v162
-  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v155
+  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v152
+  ; GCN-NEXT:    v_exp_f32_e32 v150, v32
+  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v156
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v165
-  ; GCN-NEXT:    v_pack_b32_f16 v128, v154, v156
-  ; GCN-NEXT:    v_fma_f32 v150, s4, v39, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[158:159], v[58:59], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v156, v32
+  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v160
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v154
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v152, v157
+  ; GCN-NEXT:    v_fma_f32 v57, s4, v39, -v134
   ; GCN-NEXT:    ds_read_b128 v[36:39], v139
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[62:63], v[128:129], v[64:79]
-  ; GCN-NEXT:    v_exp_f32_e32 v154, v32
-  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v158
-  ; GCN-NEXT:    ds_read_b128 v[60:63], v139 offset:576
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_fma_f32 v156, s4, v42, -v134
-  ; GCN-NEXT:    v_perm_b32 v20, v140, v130, s5
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[142:143], v[128:129], v[80:95]
-  ; GCN-NEXT:    v_exp_f32_e32 v155, v32
-  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v157
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v142, v161
-  ; GCN-NEXT:    v_fma_f32 v143, s4, v41, -v134
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[128:129], v[96:111]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v34, v159
-  ; GCN-NEXT:    v_exp_f32_e32 v157, v32
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v32, v152
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[146:147], v[128:129], v[112:127]
-  ; GCN-NEXT:    v_pack_b32_f16 v129, v34, v32
-  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v150
-  ; GCN-NEXT:    v_pack_b32_f16 v128, v33, v142
-  ; GCN-NEXT:    v_exp_f32_e32 v146, v32
+  ; GCN-NEXT:    ds_read_b128 v[128:131], v139 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[58:59], v[96:111]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v34, v161
+  ; GCN-NEXT:    v_exp_f32_e32 v159, v32
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v32, v149
+  ; GCN-NEXT:    v_fma_f32 v155, s4, v41, -v134
+  ; GCN-NEXT:    v_fma_f32 v158, s4, v42, -v134
+  ; GCN-NEXT:    v_fma_f32 v162, s4, v20, -v134
+  ; GCN-NEXT:    v_fma_f32 v12, s4, v12, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[60:61], v[58:59], v[112:127]
+  ; GCN-NEXT:    v_pack_b32_f16 v59, v34, v32
+  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v57
+  ; GCN-NEXT:    v_pack_b32_f16 v58, v33, v152
+  ; GCN-NEXT:    v_exp_f32_e32 v60, v32
   ; GCN-NEXT:    ds_read_b128 v[32:35], v139 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_fma_f32 v142, s4, v43, -v134
-  ; GCN-NEXT:    v_fma_f32 v150, s4, v46, -v134
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[128:129], v[64:79]
+  ; GCN-NEXT:    v_fma_f32 v57, s4, v43, -v134
+  ; GCN-NEXT:    v_perm_b32 v20, v142, v62, s5
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[58:59], v[64:79]
   ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v40
   ; GCN-NEXT:    ds_read_b128 v[40:43], v139 offset:1728
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_exp_f32_e32 v147, v36
-  ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v143
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v37, v154
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[60:61], v[128:129], v[80:95]
-  ; GCN-NEXT:    v_exp_f32_e32 v143, v36
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v60, v155
-  ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v142
-  ; GCN-NEXT:    v_fma_f32 v61, s4, v45, -v134
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[128:129], v[96:111]
-  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v156
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v157
-  ; GCN-NEXT:    v_exp_f32_e32 v156, v32
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v32, v146
+  ; GCN-NEXT:    v_exp_f32_e32 v61, v36
+  ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v155
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v37, v150
+  ; GCN-NEXT:    v_fma_f32 v155, s4, v46, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[128:129], v[58:59], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v152, v36
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v128, v156
+  ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v57
+  ; GCN-NEXT:    v_fma_f32 v129, s4, v45, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[58:59], v[96:111]
+  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v158
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v159
+  ; GCN-NEXT:    v_exp_f32_e32 v158, v32
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v32, v60
   ; GCN-NEXT:    v_pack_b32_f16 v33, v33, v32
-  ; GCN-NEXT:    v_pack_b32_f16 v32, v37, v60
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[128:129], v[112:127]
-  ; GCN-NEXT:    v_exp_f32_e32 v129, v36
+  ; GCN-NEXT:    v_pack_b32_f16 v32, v37, v128
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[58:59], v[112:127]
+  ; GCN-NEXT:    v_exp_f32_e32 v57, v36
   ; GCN-NEXT:    v_mul_f32_e32 v40, 0x3fb8aa3b, v44
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v60, v147
-  ; GCN-NEXT:    v_fma_f32 v128, s4, v47, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v59, v61
+  ; GCN-NEXT:    v_fma_f32 v58, s4, v47, -v134
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79]
-  ; GCN-NEXT:    ds_read_b128 v[36:39], v57
+  ; GCN-NEXT:    ds_read_b128 v[36:39], v140
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_exp_f32_e32 v142, v40
-  ; GCN-NEXT:    v_mul_f32_e32 v40, 0x3fb8aa3b, v61
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v61, v143
-  ; GCN-NEXT:    ds_read_b128 v[44:47], v57 offset:576
+  ; GCN-NEXT:    v_exp_f32_e32 v128, v40
+  ; GCN-NEXT:    v_mul_f32_e32 v40, 0x3fb8aa3b, v129
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v129, v152
+  ; GCN-NEXT:    ds_read_b128 v[44:47], v140 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[62:63], v[32:33], v[80:95]
-  ; GCN-NEXT:    v_fma_f32 v62, s4, v17, -v134
-  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v150
-  ; GCN-NEXT:    v_exp_f32_e32 v63, v40
-  ; GCN-NEXT:    v_pack_b32_f16 v40, v60, v61
-  ; GCN-NEXT:    v_fma_f32 v150, s4, v18, -v134
-  ; GCN-NEXT:    v_fma_f32 v60, s4, v19, -v134
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v61, v142
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[130:131], v[32:33], v[80:95]
+  ; GCN-NEXT:    v_fma_f32 v130, s4, v17, -v134
+  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v155
+  ; GCN-NEXT:    v_exp_f32_e32 v131, v40
+  ; GCN-NEXT:    v_pack_b32_f16 v40, v59, v129
+  ; GCN-NEXT:    v_fma_f32 v155, s4, v18, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v59, v128
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[32:33], v[96:111]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v34, v156
-  ; GCN-NEXT:    v_exp_f32_e32 v158, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v129
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v34, v158
+  ; GCN-NEXT:    v_exp_f32_e32 v160, v17
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v57
   ; GCN-NEXT:    v_pack_b32_f16 v41, v34, v17
-  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v128
+  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v58
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[42:43], v[32:33], v[112:127]
-  ; GCN-NEXT:    v_exp_f32_e32 v128, v17
-  ; GCN-NEXT:    v_perm_b32 v42, v141, v131, s8
-  ; GCN-NEXT:    v_perm_b32 v43, v149, v145, s8
+  ; GCN-NEXT:    v_fma_f32 v58, s4, v19, -v134
+  ; GCN-NEXT:    v_exp_f32_e32 v129, v17
+  ; GCN-NEXT:    v_perm_b32 v42, v143, v63, s8
+  ; GCN-NEXT:    v_perm_b32 v43, v147, v145, s8
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[40:41], v[64:79]
   ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v16
-  ; GCN-NEXT:    ds_read_b128 v[16:19], v57 offset:1152
+  ; GCN-NEXT:    ds_read_b128 v[16:19], v140 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[32:35], v57 offset:1728
+  ; GCN-NEXT:    ds_read_b128 v[32:35], v140 offset:1728
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mul_f32_e32 v37, 0x3fb8aa3b, v62
-  ; GCN-NEXT:    v_exp_f32_e32 v167, v36
-  ; GCN-NEXT:    v_perm_b32 v36, v140, v130, s8
+  ; GCN-NEXT:    v_mul_f32_e32 v37, 0x3fb8aa3b, v130
+  ; GCN-NEXT:    v_exp_f32_e32 v163, v36
+  ; GCN-NEXT:    v_perm_b32 v36, v142, v62, s8
   ; GCN-NEXT:    v_fma_f32 v62, s4, v21, -v134
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[44:45], v[40:41], v[80:95]
   ; GCN-NEXT:    v_exp_f32_e32 v130, v37
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v45, v158
-  ; GCN-NEXT:    v_perm_b32 v21, v148, v144, s5
-  ; GCN-NEXT:    v_perm_b32 v37, v148, v144, s8
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v44, v63
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v45, v160
+  ; GCN-NEXT:    v_perm_b32 v21, v146, v144, s5
+  ; GCN-NEXT:    v_perm_b32 v37, v146, v144, s8
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v44, v131
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    ds_write_b64 v135, v[20:21]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[40:41], v[96:111]
-  ; GCN-NEXT:    v_perm_b32 v16, v141, v131, s5
-  ; GCN-NEXT:    v_fma_f32 v131, s4, v22, -v134
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v128
-  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v150
-  ; GCN-NEXT:    v_exp_f32_e32 v140, v17
-  ; GCN-NEXT:    v_perm_b32 v17, v149, v145, s5
+  ; GCN-NEXT:    v_perm_b32 v16, v143, v63, s5
+  ; GCN-NEXT:    v_fma_f32 v63, s4, v22, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v129
+  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v155
+  ; GCN-NEXT:    v_exp_f32_e32 v142, v17
+  ; GCN-NEXT:    v_perm_b32 v17, v147, v145, s5
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b64 v136, v[36:37]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127]
   ; GCN-NEXT:    v_pack_b32_f16 v33, v45, v22
-  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v60
+  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v58
   ; GCN-NEXT:    v_exp_f32_e32 v144, v22
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -836,22 +830,22 @@
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_add_u32_e32 v20, v132, v20
   ; GCN-NEXT:    v_add_u32_e32 v21, v132, v21
-  ; GCN-NEXT:    v_pack_b32_f16 v32, v61, v44
+  ; GCN-NEXT:    v_pack_b32_f16 v32, v59, v44
   ; GCN-NEXT:    buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx2 v[60:61], v21, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[58:59], v21, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v166
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v162
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79]
   ; GCN-NEXT:    v_exp_f32_e32 v132, v16
   ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v62
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v167
-  ; GCN-NEXT:    v_fma_f32 v141, s4, v23, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v163
+  ; GCN-NEXT:    v_fma_f32 v143, s4, v23, -v134
   ; GCN-NEXT:    ds_read_b128 v[20:23], v139
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
@@ -860,20 +854,20 @@
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[46:47], v[32:33], v[80:95]
   ; GCN-NEXT:    v_exp_f32_e32 v62, v16
-  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v131
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v63
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v46, v130
   ; GCN-NEXT:    v_fma_f32 v47, s4, v25, -v134
-  ; GCN-NEXT:    v_fma_f32 v131, s4, v26, -v134
-  ; GCN-NEXT:    v_fma_f32 v149, s4, v4, -v134
+  ; GCN-NEXT:    v_fma_f32 v63, s4, v26, -v134
+  ; GCN-NEXT:    v_fma_f32 v147, s4, v4, -v134
   ; GCN-NEXT:    ; implicit-def: $sgpr0
   ; GCN-NEXT:    v_perm_b32 v4, v42, v40, s5
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[32:33], v[96:111]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v140
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v142
   ; GCN-NEXT:    v_exp_f32_e32 v145, v16
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v144
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[34:35], v[32:33], v[112:127]
   ; GCN-NEXT:    v_pack_b32_f16 v33, v18, v16
-  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v141
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v143
   ; GCN-NEXT:    v_pack_b32_f16 v32, v17, v46
   ; GCN-NEXT:    v_exp_f32_e32 v35, v16
   ; GCN-NEXT:    ds_read_b128 v[16:19], v139 offset:1152
@@ -895,11 +889,11 @@
   ; GCN-NEXT:    v_fma_f32 v37, s4, v29, -v134
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v34, v46
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[32:33], v[96:111]
-  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v131
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v63
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v145
-  ; GCN-NEXT:    v_exp_f32_e32 v141, v16
+  ; GCN-NEXT:    v_exp_f32_e32 v143, v16
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v35
-  ; GCN-NEXT:    v_fma_f32 v131, s4, v30, -v134
+  ; GCN-NEXT:    v_fma_f32 v63, s4, v30, -v134
   ; GCN-NEXT:    v_pack_b32_f16 v17, v17, v16
   ; GCN-NEXT:    v_pack_b32_f16 v16, v21, v36
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[24:25], v[32:33], v[112:127]
@@ -907,25 +901,25 @@
   ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v28
   ; GCN-NEXT:    v_fma_f32 v32, s4, v31, -v134
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79]
-  ; GCN-NEXT:    ds_read_b128 v[20:23], v57
+  ; GCN-NEXT:    ds_read_b128 v[20:23], v140
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_exp_f32_e32 v36, v24
   ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v37
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v37, v47
-  ; GCN-NEXT:    ds_read_b128 v[28:31], v57 offset:576
+  ; GCN-NEXT:    ds_read_b128 v[28:31], v140 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[38:39], v[16:17], v[80:95]
   ; GCN-NEXT:    v_fma_f32 v38, s4, v1, -v134
-  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v131
+  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v63
   ; GCN-NEXT:    v_exp_f32_e32 v39, v24
   ; GCN-NEXT:    v_pack_b32_f16 v24, v34, v37
-  ; GCN-NEXT:    v_fma_f32 v131, s4, v2, -v134
+  ; GCN-NEXT:    v_fma_f32 v63, s4, v2, -v134
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v37, v36
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[16:17], v[96:111]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v141
-  ; GCN-NEXT:    v_exp_f32_e32 v148, v1
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v143
+  ; GCN-NEXT:    v_exp_f32_e32 v146, v1
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v33
   ; GCN-NEXT:    v_pack_b32_f16 v25, v18, v1
   ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v32
@@ -933,25 +927,25 @@
   ; GCN-NEXT:    v_fma_f32 v32, s4, v3, -v134
   ; GCN-NEXT:    v_exp_f32_e32 v34, v1
   ; GCN-NEXT:    v_perm_b32 v26, v43, v41, s8
-  ; GCN-NEXT:    v_perm_b32 v27, v61, v45, s8
+  ; GCN-NEXT:    v_perm_b32 v27, v59, v45, s8
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[24:25], v[64:79]
   ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v0
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v57 offset:1152
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v140 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[16:19], v57 offset:1728
+  ; GCN-NEXT:    ds_read_b128 v[16:19], v140 offset:1728
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mul_f32_e32 v21, 0x3fb8aa3b, v38
-  ; GCN-NEXT:    v_exp_f32_e32 v150, v20
+  ; GCN-NEXT:    v_exp_f32_e32 v155, v20
   ; GCN-NEXT:    v_perm_b32 v20, v42, v40, s8
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v40, v148
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v40, v146
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[28:29], v[24:25], v[80:95]
   ; GCN-NEXT:    v_exp_f32_e32 v38, v21
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v39
   ; GCN-NEXT:    v_fma_f32 v29, s4, v5, -v134
-  ; GCN-NEXT:    v_perm_b32 v5, v60, v44, s5
-  ; GCN-NEXT:    v_perm_b32 v21, v60, v44, s8
+  ; GCN-NEXT:    v_perm_b32 v5, v58, v44, s5
+  ; GCN-NEXT:    v_perm_b32 v21, v58, v44, s8
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
@@ -961,9 +955,9 @@
   ; GCN-NEXT:    v_perm_b32 v0, v43, v41, s5
   ; GCN-NEXT:    v_fma_f32 v41, s4, v6, -v134
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v34
-  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v131
+  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v63
   ; GCN-NEXT:    v_exp_f32_e32 v42, v1
-  ; GCN-NEXT:    v_perm_b32 v1, v61, v45, s5
+  ; GCN-NEXT:    v_perm_b32 v1, v59, v45, s5
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b64 v136, v[20:21]
@@ -987,10 +981,10 @@
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79]
-  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v149
+  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v147
   ; GCN-NEXT:    v_exp_f32_e32 v26, v0
   ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v29
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v150
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v155
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v27, v38
   ; GCN-NEXT:    ds_read_b128 v[20:23], v139 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1042,10 +1036,10 @@
   ; GCN-NEXT:    v_exp_f32_e32 v21, v9
   ; GCN-NEXT:    v_fma_f32 v8, s4, v15, -v134
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79]
-  ; GCN-NEXT:    ds_read_b128 v[4:7], v57
+  ; GCN-NEXT:    ds_read_b128 v[4:7], v140
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[12:15], v57 offset:576
+  ; GCN-NEXT:    ds_read_b128 v[12:15], v140 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v24
@@ -1071,33 +1065,33 @@
   ; GCN-NEXT:    v_add_f32_e32 v3, v54, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v55, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v56, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v58, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v163, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v164, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v59, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v160, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v162, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v151, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v153, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v141, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v165, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v161, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v159, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v152, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v167, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v153, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v168, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v170, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v151, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v148, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v154, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v155, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v157, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v146, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v147, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v143, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v161, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v149, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v150, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v156, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v129, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v142, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v63, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v159, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v60, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v61, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v152, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v158, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v57, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v128, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v167, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v131, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v160, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v129, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v163, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v130, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v140, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v142, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v144, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v132, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v62, v3
@@ -1105,14 +1099,14 @@
   ; GCN-NEXT:    v_add_f32_e32 v3, v35, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v46, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v47, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v141, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v143, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v33, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v36, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v39, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v148, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v146, v3
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95]
   ; GCN-NEXT:    v_add_f32_e32 v3, v34, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v150, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v155, v3
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v10
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v2
   ; GCN-NEXT:    v_add_f32_e32 v3, v38, v3
@@ -1137,7 +1131,7 @@
   ; GCN-NEXT:    v_add_f32_e32 v4, v10, v0
   ; GCN-NEXT:    ds_bpermute_b32 v5, v133, v4
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v57 offset:1152
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v140 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_add_f32_e32 v2, v4, v5
@@ -1147,7 +1141,7 @@
   ; GCN-NEXT:    v_cndmask_b32_e64 v0, v3, v2, s[6:7]
   ; GCN-NEXT:    ; implicit-def: $vgpr4
   ; GCN-NEXT:    v_fmac_f32_e32 v0, v4, v48
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v57 offset:1728
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v140 offset:1728
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
index 0887fdf0844b0..be97a1e82fcf2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
@@ -10,25 +10,24 @@
   ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
   ; GCN-NEXT:    v_readfirstlane_b32 s20, v2
   ; GCN-NEXT:    ; implicit-def: $sgpr4
-  ; GCN-NEXT:    ; implicit-def: $vgpr3
+  ; GCN-NEXT:    ; implicit-def: $vgpr64
   ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
   ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:    ; implicit-def: $vgpr50
+  ; GCN-NEXT:    ; implicit-def: $vgpr76
   ; GCN-NEXT:    ; implicit-def: $sgpr16_sgpr17_sgpr18_sgpr19
   ; GCN-NEXT:    ; implicit-def: $vgpr49
   ; GCN-NEXT:    ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43
-  ; GCN-NEXT:    ; implicit-def: $vgpr51
-  ; GCN-NEXT:    ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65
-  ; GCN-NEXT:    ; implicit-def: $vgpr76
+  ; GCN-NEXT:    ; implicit-def: $vgpr50
   ; GCN-NEXT:    ; implicit-def: $vgpr77
   ; GCN-NEXT:    ; implicit-def: $vgpr78
   ; GCN-NEXT:    ; implicit-def: $vgpr79
   ; GCN-NEXT:    ; implicit-def: $vgpr80
-  ; GCN-NEXT:    ; implicit-def: $vgpr91
+  ; GCN-NEXT:    ; implicit-def: $vgpr81
+  ; GCN-NEXT:    ; implicit-def: $vgpr103
   ; GCN-NEXT:    ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19
   ; GCN-NEXT:    ; iglp_opt mask(0x00000002)
   ; GCN-NEXT:    s_nop 1
-  ; GCN-NEXT:    v_lshl_add_u32 v2, s20, 4, v3
+  ; GCN-NEXT:    v_lshl_add_u32 v2, s20, 4, v64
   ; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s4, v2, v[0:1]
   ; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -36,8 +35,9 @@
   ; GCN-NEXT:    s_lshl_b32 s4, s20, 7
   ; GCN-NEXT:    ; implicit-def: $vgpr5
   ; GCN-NEXT:    v_add_lshl_u32 v48, v5, s4, 1
-  ; GCN-NEXT:    v_add_u32_e32 v76, s20, v76
-  ; GCN-NEXT:    v_and_b32_e32 v76, 0x1fffffff, v76
+  ; GCN-NEXT:    ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65
+  ; GCN-NEXT:    v_add_u32_e32 v77, s20, v77
+  ; GCN-NEXT:    v_and_b32_e32 v77, 0x1fffffff, v77
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    ds_write_b128 v48, v[0:3]
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -48,8 +48,8 @@
   ; GCN-NEXT:    ; implicit-def: $vgpr1
   ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
   ; GCN-NEXT:    ; implicit-def: $sgpr6
-  ; GCN-NEXT:    v_add_u32_e32 v0, v0, v50
-  ; GCN-NEXT:    v_add_u32_e32 v1, v1, v50
+  ; GCN-NEXT:    v_add_u32_e32 v0, v0, v76
+  ; GCN-NEXT:    v_add_u32_e32 v1, v1, v76
   ; GCN-NEXT:    buffer_load_dwordx2 v[72:73], v0, s[16:19], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
@@ -68,22 +68,22 @@
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], 0
   ; GCN-NEXT:    ; kill: killed $vgpr1
   ; GCN-NEXT:    ; kill: killed $vgpr0
-  ; GCN-NEXT:    v_mul_lo_u32 v76, v76, s6
-  ; GCN-NEXT:    v_add_lshl_u32 v76, v77, v76, 1
-  ; GCN-NEXT:    v_lshl_add_u32 v77, v78, 1, v76
-  ; GCN-NEXT:    ; implicit-def: $sgpr5
+  ; GCN-NEXT:    v_mul_lo_u32 v77, v77, s6
+  ; GCN-NEXT:    v_add_lshl_u32 v77, v78, v77, 1
   ; GCN-NEXT:    v_lshl_add_u32 v78, v79, 1, v77
+  ; GCN-NEXT:    ; implicit-def: $sgpr5
+  ; GCN-NEXT:    v_lshl_add_u32 v79, v80, 1, v78
   ; GCN-NEXT:    ; implicit-def: $sgpr2
   ; GCN-NEXT:    ; implicit-def: $sgpr3
-  ; GCN-NEXT:    v_lshl_add_u32 v79, v80, 1, v78
+  ; GCN-NEXT:    v_lshl_add_u32 v80, v81, 1, v79
   ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], 0
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31]
-  ; GCN-NEXT:    ds_read_b128 v[36:39], v51
+  ; GCN-NEXT:    ds_read_b128 v[36:39], v50
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15]
-  ; GCN-NEXT:    ds_read_b128 v[44:47], v51 offset:512
+  ; GCN-NEXT:    ds_read_b128 v[44:47], v50 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43
@@ -107,20 +107,20 @@
   ; GCN-NEXT:    ds_read_b128 v[40:43], v49 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[68:71], v51
+  ; GCN-NEXT:    ds_read_b128 v[68:71], v50
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[36:37], v[16:31]
   ; GCN-NEXT:    ; implicit-def: $vgpr32
   ; GCN-NEXT:    ; implicit-def: $vgpr33
-  ; GCN-NEXT:    v_add_u32_e32 v82, v32, v50
-  ; GCN-NEXT:    v_add_u32_e32 v83, v33, v50
-  ; GCN-NEXT:    ; kill: killed $vgpr82
+  ; GCN-NEXT:    v_add_u32_e32 v83, v32, v76
+  ; GCN-NEXT:    v_add_u32_e32 v76, v33, v76
   ; GCN-NEXT:    ; kill: killed $vgpr83
+  ; GCN-NEXT:    ; kill: killed $vgpr76
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[38:39], v[16:31]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[40:41], v[36:37], v[0:15]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[68:69], v[62:63], v[16:31]
-  ; GCN-NEXT:    ds_read_b128 v[66:69], v51 offset:512
+  ; GCN-NEXT:    ds_read_b128 v[66:69], v50 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
@@ -131,20 +131,20 @@
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[62:63], v[0:15]
   ; GCN-NEXT:    ; implicit-def: $vgpr66
   ; GCN-NEXT:    ; implicit-def: $vgpr67
-  ; GCN-NEXT:    v_max_f32_e32 v81, v67, v67
+  ; GCN-NEXT:    v_max_f32_e32 v82, v67, v67
   ; GCN-NEXT:    ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[70:71], v[64:65], v[16:31]
   ; GCN-NEXT:    v_perm_b32 v70, v74, v72, s2
   ; GCN-NEXT:    v_perm_b32 v71, v74, v72, s3
   ; GCN-NEXT:    v_perm_b32 v72, v75, v73, s2
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
-  ; GCN-NEXT:    ds_write_b32 v76, v70
+  ; GCN-NEXT:    ds_write_b32 v77, v70
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v77, v71
+  ; GCN-NEXT:    ds_write_b32 v78, v71
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v78, v72
+  ; GCN-NEXT:    ds_write_b32 v79, v72
   ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v20
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15]
   ; GCN-NEXT:    v_mul_f32_e32 v64, s4, v16
@@ -152,11 +152,11 @@
   ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v18
   ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v19
   ; GCN-NEXT:    v_max3_f32 v64, v64, s5, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v21
+  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v21
   ; GCN-NEXT:    v_max3_f32 v64, v64, v68, v69
   ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v22
   ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v23
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v80
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v81
   ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v24
   ; GCN-NEXT:    v_mul_f32_e32 v87, s4, v25
   ; GCN-NEXT:    v_max3_f32 v64, v64, v84, v85
@@ -166,12 +166,12 @@
   ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v28
   ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v29
   ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v68
-  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v30
+  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v30
   ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v31
   ; GCN-NEXT:    v_max3_f32 v64, v64, v69, v74
   ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v0
   ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v1
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v80, v84
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v81, v84
   ; GCN-NEXT:    v_mul_f32_e32 v87, s4, v2
   ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v3
   ; GCN-NEXT:    v_max3_f32 v64, v64, v85, v86
@@ -179,315 +179,315 @@
   ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v5
   ; GCN-NEXT:    v_max3_f32 v64, v64, v87, v65
   ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v6
-  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v7
+  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v7
   ; GCN-NEXT:    v_max3_f32 v64, v64, v68, v69
   ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v8
   ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v9
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v80
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v81
   ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v10
   ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v11
   ; GCN-NEXT:    v_max3_f32 v64, v64, v84, v85
   ; GCN-NEXT:    v_mul_f32_e32 v87, s4, v12
   ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v13
   ; GCN-NEXT:    v_max3_f32 v64, v64, v86, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v14
-  ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v15
   ; GCN-NEXT:    v_max3_f32 v64, v64, v87, v68
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v69, v74
-  ; GCN-NEXT:    ds_bpermute_b32 v65, v66, v64
   ; GCN-NEXT:    v_perm_b32 v68, v75, v73, s3
+  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v14
+  ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v15
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v79, v68
-  ; GCN-NEXT:    ; implicit-def: $vgpr84
-  ; GCN-NEXT:    v_max_f32_e32 v65, v65, v65
-  ; GCN-NEXT:    v_max_f32_e32 v70, v64, v65
+  ; GCN-NEXT:    ds_write_b32 v80, v68
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v69, v74
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_load_dwordx2 v[64:65], v82, s[16:19], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[70:71], v76, s[16:19], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_bpermute_b32 v71, v66, v70
+  ; GCN-NEXT:    ds_bpermute_b32 v65, v66, v64
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    ; implicit-def: $vgpr87
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_cndmask_b32_e64 v70, v71, v70, s[0:1]
-  ; GCN-NEXT:    v_max_f32_e32 v70, v70, v70
-  ; GCN-NEXT:    v_max_f32_e32 v72, v81, v70
-  ; GCN-NEXT:    v_fma_f32 v16, s4, v16, -v72
-  ; GCN-NEXT:    v_fma_f32 v18, s4, v18, -v72
-  ; GCN-NEXT:    v_fma_f32 v19, s4, v19, -v72
+  ; GCN-NEXT:    v_max_f32_e32 v65, v65, v65
+  ; GCN-NEXT:    v_max_f32_e32 v64, v64, v65
+  ; GCN-NEXT:    ds_bpermute_b32 v65, v66, v64
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    v_cndmask_b32_e64 v64, v65, v64, s[0:1]
+  ; GCN-NEXT:    v_max_f32_e32 v64, v64, v64
+  ; GCN-NEXT:    v_max_f32_e32 v65, v82, v64
+  ; GCN-NEXT:    v_fma_f32 v16, s4, v16, -v65
+  ; GCN-NEXT:    v_fma_f32 v17, s4, v17, -v65
+  ; GCN-NEXT:    v_fma_f32 v18, s4, v18, -v65
+  ; GCN-NEXT:    v_fma_f32 v19, s4, v19, -v65
   ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v17
   ; GCN-NEXT:    v_mul_f32_e32 v18, 0x3fb8aa3b, v18
   ; GCN-NEXT:    v_mul_f32_e32 v19, 0x3fb8aa3b, v19
-  ; GCN-NEXT:    v_fma_f32 v17, s4, v17, -v72
-  ; GCN-NEXT:    v_fma_f32 v20, s4, v20, -v72
-  ; GCN-NEXT:    v_fma_f32 v21, s4, v21, -v72
-  ; GCN-NEXT:    v_fma_f32 v22, s4, v22, -v72
-  ; GCN-NEXT:    v_fma_f32 v23, s4, v23, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v73, v16
-  ; GCN-NEXT:    v_exp_f32_e32 v74, v18
-  ; GCN-NEXT:    v_exp_f32_e32 v75, v19
+  ; GCN-NEXT:    v_fma_f32 v20, s4, v20, -v65
+  ; GCN-NEXT:    v_fma_f32 v21, s4, v21, -v65
+  ; GCN-NEXT:    v_fma_f32 v22, s4, v22, -v65
+  ; GCN-NEXT:    v_fma_f32 v23, s4, v23, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v72, v16
+  ; GCN-NEXT:    v_exp_f32_e32 v73, v17
+  ; GCN-NEXT:    v_exp_f32_e32 v81, v18
+  ; GCN-NEXT:    v_exp_f32_e32 v82, v19
   ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v20
   ; GCN-NEXT:    v_mul_f32_e32 v21, 0x3fb8aa3b, v21
   ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v22
-  ; GCN-NEXT:    v_exp_f32_e32 v80, v20
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v73
-  ; GCN-NEXT:    v_fma_f32 v18, s4, v24, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v81, v21
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v74
-  ; GCN-NEXT:    v_fma_f32 v20, s4, v25, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v82, v22
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v75
-  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v17
-  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
-  ; GCN-NEXT:    v_fma_f32 v26, s4, v26, -v72
-  ; GCN-NEXT:    v_pack_b32_f16 v71, v21, v22
-  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v18
-  ; GCN-NEXT:    v_sub_f32_e32 v24, v67, v72
-  ; GCN-NEXT:    v_exp_f32_e32 v83, v23
-  ; GCN-NEXT:    v_fma_f32 v67, s4, v27, -v72
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v72
+  ; GCN-NEXT:    v_fma_f32 v17, s4, v24, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v83, v20
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v73
+  ; GCN-NEXT:    v_fma_f32 v19, s4, v25, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v84, v21
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v81
+  ; GCN-NEXT:    v_fma_f32 v26, s4, v26, -v65
   ; GCN-NEXT:    v_exp_f32_e32 v85, v22
-  ; GCN-NEXT:    v_exp_f32_e32 v17, v17
-  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v24
-  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v20
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v17
-  ; GCN-NEXT:    v_fma_f32 v87, s4, v29, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v88, v23
-  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v72
-  ; GCN-NEXT:    v_pack_b32_f16 v70, v16, v19
-  ; GCN-NEXT:    ds_read_b128 v[18:21], v84
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v82
+  ; GCN-NEXT:    v_pack_b32_f16 v24, v16, v18
+  ; GCN-NEXT:    v_sub_f32_e32 v22, v67, v65
+  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
+  ; GCN-NEXT:    v_pack_b32_f16 v25, v20, v21
+  ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v17
+  ; GCN-NEXT:    v_mul_f32_e32 v21, 0x3fb8aa3b, v19
+  ; GCN-NEXT:    ds_read_b128 v[16:19], v87
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_exp_f32_e32 v16, v24
-  ; GCN-NEXT:    ds_read_b128 v[22:25], v84 offset:576
+  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v22
+  ; GCN-NEXT:    v_fma_f32 v67, s4, v27, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v86, v23
+  ; GCN-NEXT:    v_exp_f32_e32 v64, v22
+  ; GCN-NEXT:    v_mul_f32_e32 v67, 0x3fb8aa3b, v67
+  ; GCN-NEXT:    v_pk_mul_f32 v[48:49], v[48:49], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[50:51], v[50:51], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[52:53], v[52:53], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[54:55], v[54:55], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[56:57], v[56:57], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[58:59], v[58:59], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[60:61], v[60:61], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[62:63], v[62:63], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[32:33], v[32:33], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[34:35], v[34:35], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[16:17], v[24:25], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v16, 0, v72
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v76, v83
+  ; GCN-NEXT:    v_fma_f32 v88, s4, v28, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v89, v20
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v90, v84
+  ; GCN-NEXT:    v_fma_f32 v91, s4, v29, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v92, v21
+  ; GCN-NEXT:    ds_read_b128 v[20:23], v87 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_pk_mul_f32 v[48:49], v[48:49], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[50:51], v[50:51], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[52:53], v[52:53], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[54:55], v[54:55], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[56:57], v[56:57], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[58:59], v[58:59], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[60:61], v[60:61], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[62:63], v[62:63], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[32:33], v[32:33], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[34:35], v[34:35], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[36:37], v[36:37], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[38:39], v[38:39], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[40:41], v[40:41], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[42:43], v[42:43], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[44:45], v[44:45], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[46:47], v[46:47], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v18, 0, v73
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v89, v83
-  ; GCN-NEXT:    v_fma_f32 v73, s4, v28, -v72
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v80
-  ; GCN-NEXT:    v_fma_f32 v1, s4, v1, -v72
-  ; GCN-NEXT:    v_perm_b32 v90, v69, v65, s2
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v17, v18
-  ; GCN-NEXT:    v_mul_f32_e32 v18, 0x3fb8aa3b, v26
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v86, v81
-  ; GCN-NEXT:    v_fma_f32 v23, s4, v30, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v30, v18
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v82
-  ; GCN-NEXT:    v_fma_f32 v18, s4, v31, -v72
-  ; GCN-NEXT:    v_perm_b32 v31, v68, v64, s2
-  ; GCN-NEXT:    v_perm_b32 v64, v68, v64, s3
-  ; GCN-NEXT:    v_perm_b32 v65, v69, v65, s3
-  ; GCN-NEXT:    ds_read_b128 v[26:29], v91
+  ; GCN-NEXT:    v_pk_mul_f32 v[36:37], v[36:37], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[38:39], v[38:39], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[40:41], v[40:41], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[42:43], v[42:43], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[44:45], v[44:45], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[46:47], v[46:47], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_perm_b32 v99, v70, v68, s2
+  ; GCN-NEXT:    v_perm_b32 v100, v70, v68, s3
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[24:25], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v93, v73, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v26
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v94, v85
+  ; GCN-NEXT:    v_fma_f32 v95, s4, v30, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v96, v16
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v97, v86
+  ; GCN-NEXT:    v_fma_f32 v98, s4, v31, -v65
+  ; GCN-NEXT:    v_perm_b32 v101, v71, v69, s2
+  ; GCN-NEXT:    v_perm_b32 v102, v71, v69, s3
+  ; GCN-NEXT:    ds_read_b128 v[68:71], v103
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[68:71], v91 offset:576
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v103 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
-  ; GCN-NEXT:    ds_write_b32 v76, v31
-  ; GCN-NEXT:    v_mul_f32_e32 v31, 0x3fb8aa3b, v67
-  ; GCN-NEXT:    v_exp_f32_e32 v31, v31
-  ; GCN-NEXT:    v_mul_f32_e32 v67, 0x3fb8aa3b, v18
-  ; GCN-NEXT:    v_pack_b32_f16 v18, v19, v86
-  ; GCN-NEXT:    v_pack_b32_f16 v19, v22, v89
+  ; GCN-NEXT:    ds_write_b32 v77, v99
+  ; GCN-NEXT:    v_exp_f32_e32 v67, v67
+  ; GCN-NEXT:    v_pack_b32_f16 v76, v76, v90
+  ; GCN-NEXT:    v_pack_b32_f16 v77, v94, v97
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v77, v64
+  ; GCN-NEXT:    ds_write_b32 v78, v100
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v78, v90
+  ; GCN-NEXT:    ds_write_b32 v79, v101
+  ; GCN-NEXT:    v_mul_f32_e32 v78, 0x3fb8aa3b, v88
+  ; GCN-NEXT:    v_mul_f32_e32 v79, 0x3fb8aa3b, v91
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[76:77], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v81, v81, v93
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v90, v89
+  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v91, v78
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v78, v92
+  ; GCN-NEXT:    v_fma_f32 v1, s4, v1, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v93, v79
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[76:77], v[32:47]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v79, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v64, 0x3fb8aa3b, v73
-  ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v87
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v74, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v85
-  ; GCN-NEXT:    v_fma_f32 v2, s4, v2, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v22, v64
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v88
-  ; GCN-NEXT:    v_exp_f32_e32 v64, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v75, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v30
-  ; GCN-NEXT:    v_fma_f32 v24, s4, v3, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v23, v23
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v31
+  ; GCN-NEXT:    ds_write_b32 v80, v102
+  ; GCN-NEXT:    v_mul_f32_e32 v80, 0x3fb8aa3b, v95
+  ; GCN-NEXT:    v_add_f32_e32 v76, v82, v81
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v77, v96
+  ; GCN-NEXT:    v_fma_f32 v2, s4, v2, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v80, v80
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v79, v67
+  ; GCN-NEXT:    v_mul_f32_e32 v88, 0x3fb8aa3b, v98
+  ; GCN-NEXT:    v_fma_f32 v81, s4, v3, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v82, v88
   ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v0
-  ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v1
-  ; GCN-NEXT:    v_pack_b32_f16 v0, v20, v21
-  ; GCN-NEXT:    v_pack_b32_f16 v1, v18, v19
-  ; GCN-NEXT:    v_fma_f32 v6, s4, v6, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v25, v67
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[26:27], v[0:1], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v80, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v22
-  ; GCN-NEXT:    v_fma_f32 v26, s4, v4, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v27, v3
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v64
-  ; GCN-NEXT:    v_fma_f32 v67, s4, v5, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v65, v65
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[0:1], v[32:47]
+  ; GCN-NEXT:    v_mul_f32_e32 v88, 0x3fb8aa3b, v1
+  ; GCN-NEXT:    v_pack_b32_f16 v0, v90, v78
+  ; GCN-NEXT:    v_pack_b32_f16 v1, v77, v79
   ; GCN-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-  ; GCN-NEXT:    v_add_f32_e32 v17, v81, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v23
-  ; GCN-NEXT:    v_fma_f32 v7, s4, v7, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v68, v2
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v25
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v24
+  ; GCN-NEXT:    ; implicit-def: $sgpr2
+  ; GCN-NEXT:    s_nop 0
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[68:69], v[0:1], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v68, v83, v76
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v91
+  ; GCN-NEXT:    v_fma_f32 v83, s4, v4, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v90, v3
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v93
+  ; GCN-NEXT:    v_fma_f32 v94, s4, v5, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v88, v88
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[0:1], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v68, v84, v68
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v80
+  ; GCN-NEXT:    v_fma_f32 v6, s4, v6, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v72, v2
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v73, v82
+  ; GCN-NEXT:    v_pack_b32_f16 v4, v69, v4
+  ; GCN-NEXT:    v_mul_f32_e32 v69, 0x3fb8aa3b, v81
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v84
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v87
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_pack_b32_f16 v4, v18, v4
-  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v19
-  ; GCN-NEXT:    v_exp_f32_e32 v24, v24
-  ; GCN-NEXT:    ds_read_b128 v[18:21], v84 offset:576
+  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v73
+  ; GCN-NEXT:    v_fma_f32 v7, s4, v7, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v73, v69
+  ; GCN-NEXT:    ds_read_b128 v[76:79], v87 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mul_f32_e32 v26, 0x3fb8aa3b, v26
-  ; GCN-NEXT:    v_mul_f32_e32 v67, 0x3fb8aa3b, v67
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[4:5], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v82, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v27
-  ; GCN-NEXT:    v_exp_f32_e32 v26, v26
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v29, v65
-  ; GCN-NEXT:    v_fma_f32 v10, s4, v10, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v67, v67
+  ; GCN-NEXT:    v_mul_f32_e32 v69, 0x3fb8aa3b, v83
+  ; GCN-NEXT:    v_mul_f32_e32 v81, 0x3fb8aa3b, v94
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[70:71], v[4:5], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v68, v85, v68
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v70, v90
+  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v71, v69
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v88
+  ; GCN-NEXT:    v_fma_f32 v9, s4, v9, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v81, v81
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[4:5], v[32:47]
   ; GCN-NEXT:    v_mul_f32_e32 v6, 0x3fb8aa3b, v6
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[4:5], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v83, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v68
-  ; GCN-NEXT:    v_exp_f32_e32 v6, v6
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v24
+  ; GCN-NEXT:    v_add_f32_e32 v68, v86, v68
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v72
+  ; GCN-NEXT:    v_fma_f32 v10, s4, v10, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v74, v6
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v73
   ; GCN-NEXT:    v_mul_f32_e32 v7, 0x3fb8aa3b, v7
-  ; GCN-NEXT:    v_exp_f32_e32 v7, v7
-  ; GCN-NEXT:    v_pack_b32_f16 v4, v28, v29
-  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v69
-  ; GCN-NEXT:    ; implicit-def: $sgpr2
-  ; GCN-NEXT:    s_nop 1
+  ; GCN-NEXT:    v_fma_f32 v75, s4, v11, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v83, v7
+  ; GCN-NEXT:    v_pack_b32_f16 v4, v70, v69
+  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v6
+  ; GCN-NEXT:    v_mul_f32_e32 v7, 0x3fb8aa3b, v8
+  ; GCN-NEXT:    v_mul_f32_e32 v8, 0x3fb8aa3b, v9
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[4:5], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v0, v85, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v26
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v67
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[18:19], v[4:5], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v4, v88, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v89, v68
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v68, v71
+  ; GCN-NEXT:    v_fma_f32 v70, s4, v12, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v84, v7
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v85, v81
+  ; GCN-NEXT:    v_fma_f32 v86, s4, v13, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v87, v8
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[76:77], v[4:5], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v76, v92, v0
   ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v10
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v6
-  ; GCN-NEXT:    v_exp_f32_e32 v10, v0
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v7
-  ; GCN-NEXT:    v_pack_b32_f16 v1, v1, v0
-  ; GCN-NEXT:    v_pack_b32_f16 v0, v17, v28
-  ; GCN-NEXT:    s_nop 1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v2, v30, v4
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[0:1], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v0, v31, v2
-  ; GCN-NEXT:    v_add_f32_e32 v0, v22, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v64, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v23, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v25, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v27, v0
-  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v72
-  ; GCN-NEXT:    v_add_f32_e32 v0, v65, v0
-  ; GCN-NEXT:    v_fma_f32 v9, s4, v9, -v72
-  ; GCN-NEXT:    v_mul_f32_e32 v8, 0x3fb8aa3b, v8
-  ; GCN-NEXT:    v_add_f32_e32 v0, v68, v0
-  ; GCN-NEXT:    v_fma_f32 v11, s4, v11, -v72
-  ; GCN-NEXT:    v_mul_f32_e32 v9, 0x3fb8aa3b, v9
-  ; GCN-NEXT:    v_fma_f32 v12, s4, v12, -v72
-  ; GCN-NEXT:    v_fma_f32 v13, s4, v13, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v8, v8
-  ; GCN-NEXT:    v_add_f32_e32 v0, v24, v0
-  ; GCN-NEXT:    v_fma_f32 v5, s4, v14, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v9, v9
-  ; GCN-NEXT:    v_add_f32_e32 v0, v26, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v67, v0
-  ; GCN-NEXT:    v_fma_f32 v14, s4, v15, -v72
-  ; GCN-NEXT:    v_mul_f32_e32 v11, 0x3fb8aa3b, v11
-  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v12
-  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v5
-  ; GCN-NEXT:    v_add_f32_e32 v0, v6, v0
-  ; GCN-NEXT:    v_exp_f32_e32 v11, v11
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v8
-  ; GCN-NEXT:    v_exp_f32_e32 v12, v3
-  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v13
-  ; GCN-NEXT:    v_exp_f32_e32 v17, v1
-  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v14
-  ; GCN-NEXT:    v_add_f32_e32 v0, v7, v0
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v9
-  ; GCN-NEXT:    v_exp_f32_e32 v15, v3
-  ; GCN-NEXT:    v_exp_f32_e32 v18, v1
-  ; GCN-NEXT:    v_add_f32_e32 v6, v8, v0
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v91
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v74
+  ; GCN-NEXT:    v_fma_f32 v77, s4, v14, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v89, v0
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v92, v83
+  ; GCN-NEXT:    v_pack_b32_f16 v68, v68, v85
+  ; GCN-NEXT:    v_mul_f32_e32 v75, 0x3fb8aa3b, v75
+  ; GCN-NEXT:    v_mul_f32_e32 v70, 0x3fb8aa3b, v70
+  ; GCN-NEXT:    v_pack_b32_f16 v69, v69, v92
+  ; GCN-NEXT:    v_fma_f32 v65, s4, v15, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v75, v75
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[68:69], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v76, v96, v76
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v85, v84
+  ; GCN-NEXT:    v_exp_f32_e32 v92, v70
+  ; GCN-NEXT:    v_mul_f32_e32 v70, 0x3fb8aa3b, v86
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v86, v87
+  ; GCN-NEXT:    v_exp_f32_e32 v94, v70
+  ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v65
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[78:79], v[68:69], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v67, v67, v76
+  ; GCN-NEXT:    v_add_f32_e32 v67, v91, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v93, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v80, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v82, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v90, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v88, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v72, v67
+  ; GCN-NEXT:    v_mul_f32_e32 v68, 0x3fb8aa3b, v77
+  ; GCN-NEXT:    v_add_f32_e32 v67, v73, v67
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v76, v89
+  ; GCN-NEXT:    v_exp_f32_e32 v78, v68
+  ; GCN-NEXT:    v_add_f32_e32 v67, v71, v67
+  ; GCN-NEXT:    ds_read_b128 v[68:71], v103
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v10
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v11
-  ; GCN-NEXT:    v_add_f32_e32 v6, v9, v6
-  ; GCN-NEXT:    v_pack_b32_f16 v8, v4, v13
-  ; GCN-NEXT:    v_add_f32_e32 v6, v10, v6
-  ; GCN-NEXT:    v_pack_b32_f16 v9, v5, v14
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v18
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v15
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v12
-  ; GCN-NEXT:    v_add_f32_e32 v6, v11, v6
-  ; GCN-NEXT:    v_add_f32_e32 v6, v12, v6
-  ; GCN-NEXT:    v_add_f32_e32 v1, v15, v6
-  ; GCN-NEXT:    v_add_f32_e32 v11, v17, v1
-  ; GCN-NEXT:    v_pack_b32_f16 v1, v0, v7
-  ; GCN-NEXT:    v_pack_b32_f16 v0, v4, v10
-  ; GCN-NEXT:    ds_read_b128 v[4:7], v91 offset:576
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v77, v75
+  ; GCN-NEXT:    v_exp_f32_e32 v65, v65
+  ; GCN-NEXT:    v_add_f32_e32 v67, v81, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v74, v67
+  ; GCN-NEXT:    v_pack_b32_f16 v77, v76, v77
+  ; GCN-NEXT:    v_pack_b32_f16 v76, v85, v86
+  ; GCN-NEXT:    v_add_f32_e32 v67, v83, v67
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v72, v65
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v73, v94
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[68:69], v[76:77], v[48:63]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v68, v78
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v74, v92
+  ; GCN-NEXT:    v_add_f32_e32 v67, v84, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v87, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v89, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v75, v67
+  ; GCN-NEXT:    v_pack_b32_f16 v69, v68, v72
+  ; GCN-NEXT:    v_pack_b32_f16 v68, v74, v73
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v103 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v67, v92, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v94, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v78, v67
+  ; GCN-NEXT:    v_add_f32_e32 v65, v65, v67
+  ; GCN-NEXT:    ds_bpermute_b32 v67, v66, v65
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    v_add_f32_e32 v65, v65, v67
+  ; GCN-NEXT:    ds_bpermute_b32 v66, v66, v65
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_mov_b32_e32 v4, 0
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v2, v18, v11
-  ; GCN-NEXT:    ds_bpermute_b32 v3, v66, v2
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_add_f32_e32 v2, v2, v3
-  ; GCN-NEXT:    ds_bpermute_b32 v3, v66, v2
+  ; GCN-NEXT:    v_mov_b32_e32 v67, 0
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[0:1]
-  ; GCN-NEXT:    v_fmac_f32_e32 v2, v4, v16
+  ; GCN-NEXT:    v_cndmask_b32_e64 v65, v66, v65, s[0:1]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[70:71], v[68:69], v[48:63]
+  ; GCN-NEXT:    v_fmac_f32_e32 v65, v67, v64
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[68:69], v[32:47]
   ; GCN-NEXT:    s_endpgm
   attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index 5ab8706f28f5f..8f8a7ca1d24a6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -427,37 +427,37 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0
 ; GFX90A-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k:
 ; GFX90A-VGPR:       ; %bb.0: ; %bb
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, 0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, 1
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v5
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 2
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 1
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v9
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v8, 2
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    v_mfma_f32_4x4x4bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT:    v_mfma_f32_4x4x4bf16_1k v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX90A-VGPR-NEXT:    s_nop 4
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v5
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 2
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_bf16 v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 4
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -647,37 +647,37 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
 ; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k:
 ; GFX90A-VGPR:       ; %bb.0: ; %bb
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, 0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, 1
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v5
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 2
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 1
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v9
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v8, 2
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    v_mfma_f32_16x16x16bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT:    v_mfma_f32_16x16x16bf16_1k v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX90A-VGPR-NEXT:    s_nop 10
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v5
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 2
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_bf16 v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 6
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -1298,26 +1298,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v1, 64
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[16:17], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v1
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v1
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v1
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[18:19], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[12:13], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
-; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[8:15], v[16:17], v[18:19], v[8:15]
+; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[8:15], v[16:17], v[18:19], v[8:15] cbsz:1 abid:2 blgp:3
 ; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits:
@@ -1326,26 +1326,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 64
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[16:17], s[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v1
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], s[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[18:19], s[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
-; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[8:15], v[16:17], v[18:19], v[8:15]
+; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[8:15], v[16:17], v[18:19], v[8:15] cbsz:1 abid:2 neg:[1,1,0]
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (i64 274877906944 to double)), i32 0, i32 0, i32 0)
@@ -1627,26 +1627,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, 0x3ff00000
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v12, s2
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v13, s3
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v18, s2
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v19, s3
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v1, v0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v1, v0
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[16:17], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[12:13], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[8:15], v[18:19], v[16:17], v[8:15]
 ; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_imm:
@@ -1655,26 +1655,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, 0x3ff00000
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v12, s2
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v13, s3
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v18, s2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v19, s3
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], s[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[16:17], s[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[8:15], v[18:19], v[16:17], v[8:15]
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0)
@@ -1741,26 +1741,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v1, 0x405ec000
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v12, s2
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v13, s3
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v18, s2
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v19, s3
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v1
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v1
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v1
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[16:17], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[12:13], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[8:15], v[18:19], v[16:17], v[8:15]
 ; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
@@ -1769,26 +1769,26 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 0x405ec000
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v12, s2
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v13, s3
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v18, s2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v19, s3
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v1
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], s[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[16:17], s[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[8:15], v[18:19], v[16:17], v[8:15]
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 123.0, double 123.0, double 123.0, double 123.0>, i32 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
index dc4c929124fec..7628ca54ab865 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
@@ -2460,6 +2460,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
 ; GFX942-SDAG:       ; %bb.0: ; %bb
 ; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX942-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -2480,12 +2481,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT:    s_nop 9
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-SDAG-NEXT:    s_nop 10
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
 ; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_i8:
@@ -2525,6 +2525,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
 ; GFX950-SDAG:       ; %bb.0: ; %bb
 ; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX950-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -2545,12 +2546,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
 ; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT:    s_nop 10
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-SDAG-NEXT:    s_nop 11
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_i8:
@@ -3607,6 +3607,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
 ; GFX942-SDAG:       ; %bb.0: ; %bb
 ; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX942-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -3627,12 +3628,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT:    s_nop 9
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-SDAG-NEXT:    s_nop 10
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
 ; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
@@ -3672,6 +3672,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
 ; GFX950-SDAG:       ; %bb.0: ; %bb
 ; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX950-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -3692,12 +3693,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
 ; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT:    s_nop 10
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-SDAG-NEXT:    s_nop 11
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
@@ -3910,6 +3910,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
 ; GFX942-SDAG:       ; %bb.0: ; %bb
 ; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX942-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -3930,12 +3931,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT:    s_nop 9
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-SDAG-NEXT:    s_nop 10
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
 ; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
@@ -3975,6 +3975,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
 ; GFX950-SDAG:       ; %bb.0: ; %bb
 ; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX950-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -3995,12 +3996,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
 ; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT:    s_nop 10
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-SDAG-NEXT:    s_nop 11
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
@@ -4213,6 +4213,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
 ; GFX942-SDAG:       ; %bb.0: ; %bb
 ; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX942-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -4233,12 +4234,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT:    s_nop 9
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-SDAG-NEXT:    s_nop 10
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
 ; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
@@ -4278,6 +4278,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
 ; GFX950-SDAG:       ; %bb.0: ; %bb
 ; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX950-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -4298,12 +4299,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
 ; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT:    s_nop 10
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-SDAG-NEXT:    s_nop 11
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
@@ -4516,6 +4516,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
 ; GFX942-SDAG:       ; %bb.0: ; %bb
 ; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX942-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -4536,12 +4537,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX942-SDAG-NEXT:    s_nop 1
 ; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-SDAG-NEXT:    s_nop 9
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-SDAG-NEXT:    s_nop 10
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
 ; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
@@ -4581,6 +4581,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
 ; GFX950-SDAG:       ; %bb.0: ; %bb
 ; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
 ; GFX950-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, 0
 ; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s16
 ; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
@@ -4601,12 +4602,11 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
 ; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX950-SDAG-NEXT:    s_nop 1
 ; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-SDAG-NEXT:    s_nop 10
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-SDAG-NEXT:    s_nop 11
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
index 033a35f69a0bd..e11050ccce746 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
@@ -269,28 +269,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg
 ; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31]
 ; GCN-NEXT:    v_mov_b32_e32 v42, s22
 ; GCN-NEXT:    v_mov_b32_e32 v43, s23
+; GCN-NEXT:    v_mov_b32_e32 v32, s16
+; GCN-NEXT:    v_mov_b32_e32 v33, s17
+; GCN-NEXT:    v_mov_b32_e32 v34, s18
+; GCN-NEXT:    v_mov_b32_e32 v35, s19
 ; GCN-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_nop 2
-; GCN-NEXT:    v_mov_b32_e32 v16, s16
-; GCN-NEXT:    v_mov_b32_e32 v17, s17
-; GCN-NEXT:    v_mov_b32_e32 v18, s18
-; GCN-NEXT:    v_mov_b32_e32 v19, s19
-; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mov_b32_e32 v16, s12
-; GCN-NEXT:    v_mov_b32_e32 v17, s13
-; GCN-NEXT:    v_mov_b32_e32 v18, s14
-; GCN-NEXT:    v_mov_b32_e32 v19, s15
-; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT:    v_mov_b32_e32 v32, s12
+; GCN-NEXT:    v_mov_b32_e32 v33, s13
+; GCN-NEXT:    v_mov_b32_e32 v34, s14
+; GCN-NEXT:    v_mov_b32_e32 v35, s15
+; GCN-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mov_b32_e32 v16, s8
-; GCN-NEXT:    v_mov_b32_e32 v17, s9
-; GCN-NEXT:    v_mov_b32_e32 v18, s10
-; GCN-NEXT:    v_mov_b32_e32 v19, s11
-; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; GCN-NEXT:    v_mov_b32_e32 v32, s8
+; GCN-NEXT:    v_mov_b32_e32 v33, s9
+; GCN-NEXT:    v_mov_b32_e32 v34, s10
+; GCN-NEXT:    v_mov_b32_e32 v35, s11
+; GCN-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -332,28 +331,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa
 ; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
 ; GCN-NEXT:    v_mov_b32_e32 v42, s22
 ; GCN-NEXT:    v_mov_b32_e32 v43, s23
+; GCN-NEXT:    v_mov_b32_e32 v32, s16
+; GCN-NEXT:    v_mov_b32_e32 v33, s17
+; GCN-NEXT:    v_mov_b32_e32 v34, s18
+; GCN-NEXT:    v_mov_b32_e32 v35, s19
 ; GCN-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_nop 2
-; GCN-NEXT:    v_mov_b32_e32 v16, s16
-; GCN-NEXT:    v_mov_b32_e32 v17, s17
-; GCN-NEXT:    v_mov_b32_e32 v18, s18
-; GCN-NEXT:    v_mov_b32_e32 v19, s19
-; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mov_b32_e32 v16, s12
-; GCN-NEXT:    v_mov_b32_e32 v17, s13
-; GCN-NEXT:    v_mov_b32_e32 v18, s14
-; GCN-NEXT:    v_mov_b32_e32 v19, s15
-; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT:    v_mov_b32_e32 v32, s12
+; GCN-NEXT:    v_mov_b32_e32 v33, s13
+; GCN-NEXT:    v_mov_b32_e32 v34, s14
+; GCN-NEXT:    v_mov_b32_e32 v35, s15
+; GCN-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mov_b32_e32 v16, s8
-; GCN-NEXT:    v_mov_b32_e32 v17, s9
-; GCN-NEXT:    v_mov_b32_e32 v18, s10
-; GCN-NEXT:    v_mov_b32_e32 v19, s11
-; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; GCN-NEXT:    v_mov_b32_e32 v32, s8
+; GCN-NEXT:    v_mov_b32_e32 v33, s9
+; GCN-NEXT:    v_mov_b32_e32 v34, s10
+; GCN-NEXT:    v_mov_b32_e32 v35, s11
+; GCN-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 753206206180a..68bed43e6f410 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -1508,28 +1508,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
 ; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
 ; SDAG-NEXT:    v_mov_b32_e32 v42, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v43, s23
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
 ; SDAG-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    s_nop 2
-; SDAG-NEXT:    v_mov_b32_e32 v16, s16
-; SDAG-NEXT:    v_mov_b32_e32 v17, s17
-; SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s12
+; SDAG-NEXT:    v_mov_b32_e32 v33, s13
+; SDAG-NEXT:    v_mov_b32_e32 v34, s14
+; SDAG-NEXT:    v_mov_b32_e32 v35, s15
+; SDAG-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s8
+; SDAG-NEXT:    v_mov_b32_e32 v33, s9
+; SDAG-NEXT:    v_mov_b32_e32 v34, s10
+; SDAG-NEXT:    v_mov_b32_e32 v35, s11
+; SDAG-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -1611,28 +1610,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
 ; HEURRC-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
 ; HEURRC-NEXT:    v_mov_b32_e32 v42, s22
 ; HEURRC-NEXT:    v_mov_b32_e32 v43, s23
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s16
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s17
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s18
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s19
 ; HEURRC-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    s_nop 2
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s16
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s17
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s18
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s19
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s12
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s13
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s14
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s15
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s12
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s13
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s14
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s15
+; HEURRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s8
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s9
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s10
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s11
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s8
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s9
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s10
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s11
+; HEURRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
@@ -1668,28 +1666,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
 ; VGPRRC-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
 ; VGPRRC-NEXT:    v_mov_b32_e32 v42, s22
 ; VGPRRC-NEXT:    v_mov_b32_e32 v43, s23
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s16
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s17
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s18
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s19
 ; VGPRRC-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    s_nop 2
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s16
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s17
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s18
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s12
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s13
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s14
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s12
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s13
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s14
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s15
+; VGPRRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s8
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s9
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s10
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s8
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s9
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s10
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s11
+; VGPRRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
@@ -1850,28 +1847,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
 ; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
 ; SDAG-NEXT:    v_mov_b32_e32 v42, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v43, s23
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
 ; SDAG-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    s_nop 2
-; SDAG-NEXT:    v_mov_b32_e32 v16, s16
-; SDAG-NEXT:    v_mov_b32_e32 v17, s17
-; SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s12
+; SDAG-NEXT:    v_mov_b32_e32 v33, s13
+; SDAG-NEXT:    v_mov_b32_e32 v34, s14
+; SDAG-NEXT:    v_mov_b32_e32 v35, s15
+; SDAG-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s8
+; SDAG-NEXT:    v_mov_b32_e32 v33, s9
+; SDAG-NEXT:    v_mov_b32_e32 v34, s10
+; SDAG-NEXT:    v_mov_b32_e32 v35, s11
+; SDAG-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -1953,28 +1949,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
 ; HEURRC-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
 ; HEURRC-NEXT:    v_mov_b32_e32 v42, s22
 ; HEURRC-NEXT:    v_mov_b32_e32 v43, s23
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s16
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s17
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s18
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s19
 ; HEURRC-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    s_nop 2
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s16
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s17
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s18
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s19
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s12
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s13
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s14
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s15
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s12
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s13
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s14
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s15
+; HEURRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s8
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s9
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s10
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s11
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s8
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s9
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s10
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s11
+; HEURRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
@@ -2010,28 +2005,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
 ; VGPRRC-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
 ; VGPRRC-NEXT:    v_mov_b32_e32 v42, s22
 ; VGPRRC-NEXT:    v_mov_b32_e32 v43, s23
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s16
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s17
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s18
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s19
 ; VGPRRC-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    s_nop 2
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s16
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s17
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s18
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s12
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s13
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s14
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s12
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s13
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s14
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s15
+; VGPRRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s8
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s9
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s10
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s8
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s9
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s10
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s11
+; VGPRRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
@@ -3186,18 +3180,16 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31]
-; VGPRRC-NEXT:    s_nop 11
+; VGPRRC-NEXT:    v_mov_b64_e32 v[36:37], 16
+; VGPRRC-NEXT:    v_mov_b64_e32 v[38:39], 0
+; VGPRRC-NEXT:    s_nop 9
 ; VGPRRC-NEXT:    global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], 16
-; VGPRRC-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[4:7], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], 0
-; VGPRRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s16
@@ -3218,14 +3210,14 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s9
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s10
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s12
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s13
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s14
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_endpgm
 ; AGPR-LABEL: test_mfma_i32_32x32x32_i8:
@@ -3598,18 +3590,16 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31] cbsz:2 abid:3 blgp:1
-; VGPRRC-NEXT:    s_nop 11
+; VGPRRC-NEXT:    v_mov_b64_e32 v[36:37], 16
+; VGPRRC-NEXT:    v_mov_b64_e32 v[38:39], 0
+; VGPRRC-NEXT:    s_nop 9
 ; VGPRRC-NEXT:    global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], 16
-; VGPRRC-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[4:7], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], 0
-; VGPRRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s16
@@ -3630,14 +3620,14 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s9
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s10
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s12
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s13
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s14
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_endpgm
 ; AGPR-LABEL: test_mfma_i32_32x32x32_i8__flags:
@@ -4150,33 +4140,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
-; SDAG-NEXT:    s_nop 6
-; SDAG-NEXT:    v_mov_b32_e32 v16, s20
-; SDAG-NEXT:    v_mov_b32_e32 v17, s21
-; SDAG-NEXT:    v_mov_b32_e32 v18, s22
-; SDAG-NEXT:    v_mov_b32_e32 v19, s23
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s20
+; SDAG-NEXT:    v_mov_b32_e32 v33, s21
+; SDAG-NEXT:    v_mov_b32_e32 v34, s22
+; SDAG-NEXT:    v_mov_b32_e32 v35, s23
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s16
-; SDAG-NEXT:    v_mov_b32_e32 v17, s17
-; SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s12
+; SDAG-NEXT:    v_mov_b32_e32 v33, s13
+; SDAG-NEXT:    v_mov_b32_e32 v34, s14
+; SDAG-NEXT:    v_mov_b32_e32 v35, s15
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s8
+; SDAG-NEXT:    v_mov_b32_e32 v33, s9
+; SDAG-NEXT:    v_mov_b32_e32 v34, s10
+; SDAG-NEXT:    v_mov_b32_e32 v35, s11
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -4260,33 +4249,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
 ; HEURRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; HEURRC-NEXT:    s_nop 1
 ; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
-; HEURRC-NEXT:    s_nop 6
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s20
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s21
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s22
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s23
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s20
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s21
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s22
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s23
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s16
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s17
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s18
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s19
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s16
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s17
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s18
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s19
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s12
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s13
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s14
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s15
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s12
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s13
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s14
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s15
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s8
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s9
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s10
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s11
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s8
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s9
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s10
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s11
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
@@ -4324,33 +4312,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
-; VGPRRC-NEXT:    s_nop 6
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s20
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s21
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s22
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s23
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s20
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s21
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s22
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s23
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s16
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s17
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s18
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s16
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s17
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s18
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s19
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s12
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s13
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s14
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s12
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s13
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s14
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s15
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s8
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s9
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s10
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s8
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s9
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s10
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s11
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
@@ -4527,33 +4514,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; SDAG-NEXT:    s_nop 6
-; SDAG-NEXT:    v_mov_b32_e32 v16, s20
-; SDAG-NEXT:    v_mov_b32_e32 v17, s21
-; SDAG-NEXT:    v_mov_b32_e32 v18, s22
-; SDAG-NEXT:    v_mov_b32_e32 v19, s23
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s20
+; SDAG-NEXT:    v_mov_b32_e32 v33, s21
+; SDAG-NEXT:    v_mov_b32_e32 v34, s22
+; SDAG-NEXT:    v_mov_b32_e32 v35, s23
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s16
-; SDAG-NEXT:    v_mov_b32_e32 v17, s17
-; SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s12
+; SDAG-NEXT:    v_mov_b32_e32 v33, s13
+; SDAG-NEXT:    v_mov_b32_e32 v34, s14
+; SDAG-NEXT:    v_mov_b32_e32 v35, s15
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s8
+; SDAG-NEXT:    v_mov_b32_e32 v33, s9
+; SDAG-NEXT:    v_mov_b32_e32 v34, s10
+; SDAG-NEXT:    v_mov_b32_e32 v35, s11
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -4637,33 +4623,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
 ; HEURRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; HEURRC-NEXT:    s_nop 1
 ; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; HEURRC-NEXT:    s_nop 6
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s20
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s21
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s22
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s23
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s20
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s21
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s22
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s23
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s16
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s17
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s18
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s19
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s16
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s17
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s18
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s19
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s12
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s13
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s14
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s15
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s12
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s13
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s14
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s15
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s8
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s9
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s10
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s11
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s8
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s9
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s10
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s11
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
@@ -4701,33 +4686,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; VGPRRC-NEXT:    s_nop 6
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s20
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s21
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s22
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s23
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s20
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s21
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s22
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s23
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s16
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s17
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s18
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s16
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s17
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s18
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s19
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s12
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s13
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s14
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s12
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s13
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s14
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s15
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s8
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s9
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s10
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s8
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s9
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s10
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s11
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index 7e30af96bb8b9..aa670dce4e6f9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -799,17 +799,17 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 {
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 3
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -1155,17 +1155,17 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x4_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x4_f32 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 9
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -2005,21 +2005,21 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr add
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4f16:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, s4
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, s5
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, s5
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, s6
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, s7
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, s6
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, s7
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_f16 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 4
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -2395,21 +2395,21 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr
 ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16f16:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, s4
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, s5
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, s5
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, s6
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, s7
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, s6
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, s7
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_f16 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 6
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -3304,17 +3304,17 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 {
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_i32_4x4x4_16b_i8 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_i32_4x4x4_16b_i8 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 4
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x i32>, ptr addrspace(1) %arg
@@ -3494,19 +3494,19 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_k_src2_1(ptr addrspace(1)
 ;
 ; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
 ; GFX942-VGPR:       ; %bb.0:
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0x41
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_i32_4x4x4_16b_i8 v[0:3], v5, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_i32_4x4x4_16b_i8 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 3
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
   %in.1 = load <4 x i32>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> splat (i32 65), i32 1, i32 2, i32 3)
@@ -4309,7 +4309,7 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) %
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4318,9 +4318,9 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) %
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3]
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3]
 ; GFX942-VGPR-NEXT:    s_nop 3
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v6, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -5017,12 +5017,12 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(ptr addrspace(1) %arg) #0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[0:3]
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v0, v1, v[0:3]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 2
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 2.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
@@ -5542,6 +5542,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v30, v1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v31, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v1
@@ -5570,39 +5572,37 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v27, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v28, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v29, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v30, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v31, v1
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[32:33], v[30:31]
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v34, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[30:31], v[28:29]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[28:29], v[26:27]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[26:27], v[24:25]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[24:25], v[22:23]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[22:23], v[20:21]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[20:21], v[18:19]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[18:19], v[16:17]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[16:17], v[14:15]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], v[12:13]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], v[10:11]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], v[8:9]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[62:63], v[30:31]
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v64, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[60:61], v[28:29]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[58:59], v[26:27]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[56:57], v[24:25]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[54:55], v[22:23]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[52:53], v[20:21]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[50:51], v[18:19]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[48:49], v[16:17]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[46:47], v[14:15]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[44:45], v[12:13]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[42:43], v[10:11]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[40:41], v[8:9]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[38:39], v[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[36:37], v[4:5]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[34:35], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[32:33], v[0:1]
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[2:33], v0, v34, v[2:33]
+; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[32:63], v0, v64, v[32:63]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[30:33], s[0:1] offset:112
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[26:29], s[0:1] offset:96
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[22:25], s[0:1] offset:80
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[18:21], s[0:1] offset:64
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[14:17], s[0:1] offset:48
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[10:13], s[0:1] offset:32
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[60:63], s[0:1] offset:112
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[56:59], s[0:1] offset:96
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[52:55], s[0:1] offset:80
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[48:51], s[0:1] offset:64
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[44:47], s[0:1] offset:48
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[40:43], s[0:1] offset:32
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[36:39], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[32:35], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
@@ -5695,20 +5695,20 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(ptr addrspace(1) %ar
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 1.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-VGPR-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; GFX942-VGPR-NEXT:    v_lshlrev_b32_e32 v8, 4, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0x42f60000
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3]
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 2
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -5804,19 +5804,19 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(ptr addrspa
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 1.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0x42f60000
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3]
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 2
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index f0205a3a788ed..a8d2f64c3c4d9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -5093,43 +5093,42 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
-; SDAG-NEXT:    s_nop 14
-; SDAG-NEXT:    v_mov_b32_e32 v16, s20
-; SDAG-NEXT:    v_mov_b32_e32 v17, s21
-; SDAG-NEXT:    v_mov_b32_e32 v18, s22
-; SDAG-NEXT:    v_mov_b32_e32 v19, s23
-; SDAG-NEXT:    v_mov_b64_e32 v[20:21], 48
-; SDAG-NEXT:    global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s20
+; SDAG-NEXT:    v_mov_b32_e32 v33, s21
+; SDAG-NEXT:    v_mov_b32_e32 v34, s22
+; SDAG-NEXT:    v_mov_b32_e32 v35, s23
+; SDAG-NEXT:    v_mov_b64_e32 v[36:37], 48
+; SDAG-NEXT:    global_store_dwordx4 v[36:37], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[22:23], 32
-; SDAG-NEXT:    v_mov_b64_e32 v[24:25], 16
-; SDAG-NEXT:    v_mov_b32_e32 v16, s16
-; SDAG-NEXT:    v_mov_b32_e32 v17, s17
-; SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
+; SDAG-NEXT:    v_mov_b64_e32 v[38:39], 32
+; SDAG-NEXT:    v_mov_b64_e32 v[40:41], 16
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
+; SDAG-NEXT:    global_store_dwordx4 v[38:39], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[26:27], 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
+; SDAG-NEXT:    v_mov_b64_e32 v[42:43], 0
+; SDAG-NEXT:    v_mov_b32_e32 v32, s12
+; SDAG-NEXT:    v_mov_b32_e32 v33, s13
+; SDAG-NEXT:    v_mov_b32_e32 v34, s14
+; SDAG-NEXT:    v_mov_b32_e32 v35, s15
+; SDAG-NEXT:    global_store_dwordx4 v[40:41], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s8
+; SDAG-NEXT:    v_mov_b32_e32 v33, s9
+; SDAG-NEXT:    v_mov_b32_e32 v34, s10
+; SDAG-NEXT:    v_mov_b32_e32 v35, s11
+; SDAG-NEXT:    global_store_dwordx4 v[42:43], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[38:39], v[8:11], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[36:37], v[12:15], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[40:41], v[4:7], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -5137,6 +5136,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x0
 ; GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GISEL-NEXT:    v_mov_b64_e32 v[48:49], 0
+; GISEL-NEXT:    v_mov_b64_e32 v[50:51], 16
+; GISEL-NEXT:    v_mov_b64_e32 v[52:53], 32
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[36:37]
 ; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[38:39]
@@ -5154,28 +5156,33 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
 ; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
 ; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
 ; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
-; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mov_b64_e32 v[54:55], 48
+; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
-; GISEL-NEXT:    v_mov_b64_e32 v[32:33], 0
-; GISEL-NEXT:    v_mov_b64_e32 v[34:35], 16
-; GISEL-NEXT:    v_mov_b64_e32 v[36:37], 32
-; GISEL-NEXT:    v_mov_b64_e32 v[38:39], 48
-; GISEL-NEXT:    global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
+; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[10:11]
+; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[8:9]
+; GISEL-NEXT:    v_mov_b64_e32 v[38:39], s[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[42:43], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[46:47], s[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[40:41], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[44:45], s[20:21]
+; GISEL-NEXT:    global_store_dwordx4 v[48:49], v[32:35], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[50:51], v[36:39], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[52:53], v[40:43], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[54:55], v[44:47], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[50:51], v[4:7], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[52:53], v[8:11], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[54:55], v[12:15], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_endpgm
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0)
@@ -5190,71 +5197,71 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
 ; SDAG-NEXT:    s_load_dwordx16 s[12:27], s[4:5], 0x0
 ; SDAG-NEXT:    v_mov_b32_e32 v32, 42
 ; SDAG-NEXT:    v_mov_b32_e32 v33, 25
+; SDAG-NEXT:    v_mov_b64_e32 v[36:37], 48
+; SDAG-NEXT:    v_mov_b64_e32 v[38:39], 32
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    v_mov_b32_e32 v20, s16
-; SDAG-NEXT:    v_mov_b32_e32 v21, s17
-; SDAG-NEXT:    v_mov_b32_e32 v22, s18
-; SDAG-NEXT:    v_mov_b32_e32 v23, s19
-; SDAG-NEXT:    v_mov_b32_e32 v24, s20
-; SDAG-NEXT:    v_mov_b32_e32 v25, s21
-; SDAG-NEXT:    v_mov_b32_e32 v26, s22
-; SDAG-NEXT:    v_mov_b32_e32 v27, s23
+; SDAG-NEXT:    v_mov_b32_e32 v0, s12
+; SDAG-NEXT:    v_mov_b32_e32 v1, s13
+; SDAG-NEXT:    v_mov_b32_e32 v2, s14
+; SDAG-NEXT:    v_mov_b32_e32 v3, s15
+; SDAG-NEXT:    v_mov_b32_e32 v4, s16
+; SDAG-NEXT:    v_mov_b32_e32 v5, s17
+; SDAG-NEXT:    v_mov_b32_e32 v6, s18
+; SDAG-NEXT:    v_mov_b32_e32 v7, s19
+; SDAG-NEXT:    v_mov_b32_e32 v8, s20
+; SDAG-NEXT:    v_mov_b32_e32 v9, s21
+; SDAG-NEXT:    v_mov_b32_e32 v10, s22
+; SDAG-NEXT:    v_mov_b32_e32 v11, s23
 ; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
-; SDAG-NEXT:    v_mov_b32_e32 v28, s24
-; SDAG-NEXT:    v_mov_b32_e32 v29, s25
-; SDAG-NEXT:    v_mov_b32_e32 v30, s26
-; SDAG-NEXT:    v_mov_b32_e32 v31, s27
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
+; SDAG-NEXT:    v_mov_b32_e32 v13, s25
+; SDAG-NEXT:    v_mov_b32_e32 v14, s26
+; SDAG-NEXT:    v_mov_b32_e32 v15, s27
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[16:17]
-; SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[18:19]
-; SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[20:21]
-; SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[22:23]
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel_hi:[0,0,0] blgp:2
-; SDAG-NEXT:    v_mov_b32_e32 v16, s20
-; SDAG-NEXT:    v_mov_b32_e32 v17, s21
-; SDAG-NEXT:    v_mov_b32_e32 v18, s22
-; SDAG-NEXT:    v_mov_b32_e32 v19, s23
-; SDAG-NEXT:    v_mov_b64_e32 v[20:21], 48
-; SDAG-NEXT:    global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
+; SDAG-NEXT:    v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT:    v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT:    v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT:    v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
+; SDAG-NEXT:    v_mov_b32_e32 v34, s22
+; SDAG-NEXT:    v_mov_b32_e32 v35, s23
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT:    v_mov_b32_e32 v32, s20
+; SDAG-NEXT:    v_mov_b32_e32 v33, s21
+; SDAG-NEXT:    global_store_dwordx4 v[36:37], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[22:23], 32
-; SDAG-NEXT:    v_mov_b64_e32 v[24:25], 16
-; SDAG-NEXT:    v_mov_b32_e32 v16, s16
-; SDAG-NEXT:    v_mov_b32_e32 v17, s17
-; SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
+; SDAG-NEXT:    v_mov_b64_e32 v[40:41], 16
+; SDAG-NEXT:    v_mov_b64_e32 v[42:43], 0
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
+; SDAG-NEXT:    global_store_dwordx4 v[38:39], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[26:27], 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    v_mov_b32_e32 v32, s12
+; SDAG-NEXT:    v_mov_b32_e32 v33, s13
+; SDAG-NEXT:    v_mov_b32_e32 v34, s14
+; SDAG-NEXT:    v_mov_b32_e32 v35, s15
+; SDAG-NEXT:    global_store_dwordx4 v[40:41], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s8
+; SDAG-NEXT:    v_mov_b32_e32 v33, s9
+; SDAG-NEXT:    v_mov_b32_e32 v34, s10
+; SDAG-NEXT:    v_mov_b32_e32 v35, s11
+; SDAG-NEXT:    global_store_dwordx4 v[42:43], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[38:39], v[8:11], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[36:37], v[12:15], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[40:41], v[4:7], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -5264,52 +5271,52 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
 ; GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
 ; GISEL-NEXT:    v_mov_b32_e32 v32, 25
 ; GISEL-NEXT:    v_mov_b32_e32 v33, 42
-; GISEL-NEXT:    v_mov_b64_e32 v[34:35], 16
+; GISEL-NEXT:    v_mov_b64_e32 v[48:49], 0
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[36:37]
-; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[38:39]
-; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[40:41]
-; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[42:43]
-; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[44:45]
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[46:47]
-; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[48:49]
-; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[50:51]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[36:37], 32
-; GISEL-NEXT:    v_mov_b64_e32 v[38:39], 48
-; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel_hi:[0,0,0] blgp:2
-; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
-; GISEL-NEXT:    v_mov_b64_e32 v[32:33], 0
-; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
-; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[36:37]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[38:39]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[40:41]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[42:43]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[44:45]
 ; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[46:47]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[48:49]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[50:51]
 ; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
-; GISEL-NEXT:    global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
+; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT:    v_mov_b64_e32 v[38:39], s[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[42:43], s[18:19]
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[10:11]
+; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[8:9]
+; GISEL-NEXT:    v_mov_b64_e32 v[46:47], s[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[50:51], 16
+; GISEL-NEXT:    v_mov_b64_e32 v[52:53], 32
+; GISEL-NEXT:    v_mov_b64_e32 v[54:55], 48
+; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[40:41], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[44:45], s[20:21]
+; GISEL-NEXT:    global_store_dwordx4 v[48:49], v[32:35], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[50:51], v[36:39], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[52:53], v[40:43], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[54:55], v[44:47], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 2
-; GISEL-NEXT:    global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[50:51], v[4:7], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[52:53], v[8:11], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[54:55], v[12:15], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_endpgm
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
index 5475fa2ae5c6e..ef3bb0cb5f4f1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
@@ -71,9 +71,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32_vgprcd(ptr addrspace(1) %ar
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-SDAG-NEXT:    s_nop 1
-; GFX942-SDAG-NEXT:    v_mfma_f32_16x16x8_xf32 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT:    v_mfma_f32_16x16x8_xf32 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-SDAG-NEXT:    s_nop 6
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v8, v[0:3], s[6:7]
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x8xf32_vgprcd:
@@ -87,14 +87,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32_vgprcd(ptr addrspace(1) %ar
 ; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-GISEL-NEXT:    s_mov_b32 s5, 4.0
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-GISEL-NEXT:    s_nop 1
-; GFX942-GISEL-NEXT:    v_mfma_f32_16x16x8_xf32 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-GISEL-NEXT:    s_nop 5
-; GFX942-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX942-GISEL-NEXT:    v_mfma_f32_16x16x8_xf32 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT:    s_nop 6
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX942-GISEL-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 6eb9449069a52..1dbf0b5862fa3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -120,25 +120,25 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__sgpr(<8 x half> inreg %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v10, s0
-; SDAG-NEXT:    v_mov_b32_e32 v11, s1
-; SDAG-NEXT:    v_mov_b32_e32 v12, s2
-; SDAG-NEXT:    v_mov_b32_e32 v13, s3
-; SDAG-NEXT:    v_mov_b32_e32 v2, s16
-; SDAG-NEXT:    v_mov_b32_e32 v3, s17
-; SDAG-NEXT:    v_mov_b32_e32 v4, s18
-; SDAG-NEXT:    v_mov_b32_e32 v5, s19
-; SDAG-NEXT:    v_mov_b32_e32 v6, s20
-; SDAG-NEXT:    v_mov_b32_e32 v7, s21
-; SDAG-NEXT:    v_mov_b32_e32 v8, s22
-; SDAG-NEXT:    v_mov_b32_e32 v9, s23
+; SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; SDAG-NEXT:    v_mov_b32_e32 v3, s3
+; SDAG-NEXT:    v_mov_b32_e32 v6, s16
+; SDAG-NEXT:    v_mov_b32_e32 v7, s17
+; SDAG-NEXT:    v_mov_b32_e32 v8, s18
+; SDAG-NEXT:    v_mov_b32_e32 v9, s19
+; SDAG-NEXT:    v_mov_b32_e32 v10, s20
+; SDAG-NEXT:    v_mov_b32_e32 v11, s21
+; SDAG-NEXT:    v_mov_b32_e32 v12, s22
+; SDAG-NEXT:    v_mov_b32_e32 v13, s23
 ; SDAG-NEXT:    v_accvgpr_write_b32 a0, s24
 ; SDAG-NEXT:    v_accvgpr_write_b32 a1, s25
 ; SDAG-NEXT:    v_accvgpr_write_b32 a2, s26
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT:    v_mov_b32_e32 v0, s28
+; SDAG-NEXT:    v_mov_b32_e32 v4, s28
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x64_f16 a[0:3], v[10:13], v[2:9], v0
+; SDAG-NEXT:    v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[6:13], v4
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -250,24 +250,41 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half>
 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16:
@@ -312,24 +329,41 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16
 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
@@ -374,24 +408,41 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16
 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
@@ -436,53 +487,53 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0,
 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v36, s0
-; SDAG-NEXT:    v_mov_b32_e32 v37, s1
-; SDAG-NEXT:    v_mov_b32_e32 v38, s2
-; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v13, s25
-; SDAG-NEXT:    v_mov_b32_e32 v14, s26
-; SDAG-NEXT:    v_mov_b32_e32 v15, s27
-; SDAG-NEXT:    v_mov_b32_e32 v16, s28
-; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_mov_b32_e32 v28, s16
-; SDAG-NEXT:    v_mov_b32_e32 v29, s17
-; SDAG-NEXT:    v_mov_b32_e32 v30, s18
-; SDAG-NEXT:    v_mov_b32_e32 v31, s19
-; SDAG-NEXT:    v_mov_b32_e32 v32, s20
-; SDAG-NEXT:    v_mov_b32_e32 v33, s21
-; SDAG-NEXT:    v_mov_b32_e32 v34, s22
-; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v48, s0
+; SDAG-NEXT:    v_mov_b32_e32 v49, s1
+; SDAG-NEXT:    v_mov_b32_e32 v50, s2
+; SDAG-NEXT:    v_mov_b32_e32 v51, s3
+; SDAG-NEXT:    v_mov_b32_e32 v21, s29
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
+; SDAG-NEXT:    v_mov_b32_e32 v36, s20
+; SDAG-NEXT:    v_mov_b32_e32 v37, s21
+; SDAG-NEXT:    v_mov_b32_e32 v38, s22
+; SDAG-NEXT:    v_mov_b32_e32 v39, s23
+; SDAG-NEXT:    v_mov_b32_e32 v16, s24
+; SDAG-NEXT:    v_mov_b32_e32 v17, s25
+; SDAG-NEXT:    v_mov_b32_e32 v18, s26
+; SDAG-NEXT:    v_mov_b32_e32 v19, s27
+; SDAG-NEXT:    v_mov_b32_e32 v20, s28
+; SDAG-NEXT:    v_mov_b32_e32 v22, v0
+; SDAG-NEXT:    v_mov_b32_e32 v23, v1
+; SDAG-NEXT:    v_mov_b32_e32 v24, v2
+; SDAG-NEXT:    v_mov_b32_e32 v25, v3
+; SDAG-NEXT:    v_mov_b32_e32 v26, v4
+; SDAG-NEXT:    v_mov_b32_e32 v27, v5
+; SDAG-NEXT:    v_mov_b32_e32 v28, v6
+; SDAG-NEXT:    v_mov_b32_e32 v29, v7
+; SDAG-NEXT:    v_mov_b32_e32 v30, v8
+; SDAG-NEXT:    v_mov_b32_e32 v31, v9
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[16:31], v[48:51], v[32:39], v10
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_mov_b32_e32 v0, v16
+; SDAG-NEXT:    v_mov_b32_e32 v1, v17
+; SDAG-NEXT:    v_mov_b32_e32 v2, v18
+; SDAG-NEXT:    v_mov_b32_e32 v3, v19
+; SDAG-NEXT:    v_mov_b32_e32 v4, v20
+; SDAG-NEXT:    v_mov_b32_e32 v5, v21
+; SDAG-NEXT:    v_mov_b32_e32 v6, v22
+; SDAG-NEXT:    v_mov_b32_e32 v7, v23
+; SDAG-NEXT:    v_mov_b32_e32 v8, v24
+; SDAG-NEXT:    v_mov_b32_e32 v9, v25
+; SDAG-NEXT:    v_mov_b32_e32 v10, v26
+; SDAG-NEXT:    v_mov_b32_e32 v11, v27
+; SDAG-NEXT:    v_mov_b32_e32 v12, v28
+; SDAG-NEXT:    v_mov_b32_e32 v13, v29
+; SDAG-NEXT:    v_mov_b32_e32 v14, v30
+; SDAG-NEXT:    v_mov_b32_e32 v15, v31
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
@@ -618,25 +669,25 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__sgpr(<8 x bfloat> inreg %arg0
 ; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__sgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v10, s0
-; GCN-NEXT:    v_mov_b32_e32 v11, s1
-; GCN-NEXT:    v_mov_b32_e32 v12, s2
-; GCN-NEXT:    v_mov_b32_e32 v13, s3
-; GCN-NEXT:    v_mov_b32_e32 v2, s16
-; GCN-NEXT:    v_mov_b32_e32 v3, s17
-; GCN-NEXT:    v_mov_b32_e32 v4, s18
-; GCN-NEXT:    v_mov_b32_e32 v5, s19
-; GCN-NEXT:    v_mov_b32_e32 v6, s20
-; GCN-NEXT:    v_mov_b32_e32 v7, s21
-; GCN-NEXT:    v_mov_b32_e32 v8, s22
-; GCN-NEXT:    v_mov_b32_e32 v9, s23
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    v_mov_b32_e32 v3, s3
+; GCN-NEXT:    v_mov_b32_e32 v6, s16
+; GCN-NEXT:    v_mov_b32_e32 v7, s17
+; GCN-NEXT:    v_mov_b32_e32 v8, s18
+; GCN-NEXT:    v_mov_b32_e32 v9, s19
+; GCN-NEXT:    v_mov_b32_e32 v10, s20
+; GCN-NEXT:    v_mov_b32_e32 v11, s21
+; GCN-NEXT:    v_mov_b32_e32 v12, s22
+; GCN-NEXT:    v_mov_b32_e32 v13, s23
 ; GCN-NEXT:    v_accvgpr_write_b32 a0, s24
 ; GCN-NEXT:    v_accvgpr_write_b32 a1, s25
 ; GCN-NEXT:    v_accvgpr_write_b32 a2, s26
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, s27
-; GCN-NEXT:    v_mov_b32_e32 v0, s28
+; GCN-NEXT:    v_mov_b32_e32 v4, s28
 ; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 a[0:3], v[10:13], v[2:9], v0
+; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[6:13], v4
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -698,24 +749,41 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfl
 ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28
 ; GCN-NEXT:    s_nop 11
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    v_mov_b32_e32 v4, v16
-; GCN-NEXT:    v_mov_b32_e32 v5, v17
-; GCN-NEXT:    v_mov_b32_e32 v6, v18
-; GCN-NEXT:    v_mov_b32_e32 v7, v19
-; GCN-NEXT:    v_mov_b32_e32 v8, v20
-; GCN-NEXT:    v_mov_b32_e32 v9, v21
-; GCN-NEXT:    v_mov_b32_e32 v10, v22
-; GCN-NEXT:    v_mov_b32_e32 v11, v23
-; GCN-NEXT:    v_mov_b32_e32 v12, v24
-; GCN-NEXT:    v_mov_b32_e32 v13, v25
-; GCN-NEXT:    v_mov_b32_e32 v14, v26
-; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
+; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
+; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
+; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
+; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
+; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
+; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
+; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
+; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
+; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
+; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
+; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
@@ -725,24 +793,41 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, <
 ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; GCN-NEXT:    s_nop 11
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    v_mov_b32_e32 v4, v16
-; GCN-NEXT:    v_mov_b32_e32 v5, v17
-; GCN-NEXT:    v_mov_b32_e32 v6, v18
-; GCN-NEXT:    v_mov_b32_e32 v7, v19
-; GCN-NEXT:    v_mov_b32_e32 v8, v20
-; GCN-NEXT:    v_mov_b32_e32 v9, v21
-; GCN-NEXT:    v_mov_b32_e32 v10, v22
-; GCN-NEXT:    v_mov_b32_e32 v11, v23
-; GCN-NEXT:    v_mov_b32_e32 v12, v24
-; GCN-NEXT:    v_mov_b32_e32 v13, v25
-; GCN-NEXT:    v_mov_b32_e32 v14, v26
-; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
+; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
+; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
+; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
+; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
+; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
+; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
+; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
+; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
+; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
+; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
+; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <16 x float> %result
@@ -752,24 +837,41 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, <
 ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; GCN-NEXT:    s_nop 11
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    v_mov_b32_e32 v4, v16
-; GCN-NEXT:    v_mov_b32_e32 v5, v17
-; GCN-NEXT:    v_mov_b32_e32 v6, v18
-; GCN-NEXT:    v_mov_b32_e32 v7, v19
-; GCN-NEXT:    v_mov_b32_e32 v8, v20
-; GCN-NEXT:    v_mov_b32_e32 v9, v21
-; GCN-NEXT:    v_mov_b32_e32 v10, v22
-; GCN-NEXT:    v_mov_b32_e32 v11, v23
-; GCN-NEXT:    v_mov_b32_e32 v12, v24
-; GCN-NEXT:    v_mov_b32_e32 v13, v25
-; GCN-NEXT:    v_mov_b32_e32 v14, v26
-; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
+; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
+; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
+; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
+; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
+; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
+; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
+; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
+; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
+; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
+; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
+; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <16 x float> %result
@@ -779,53 +881,53 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg
 ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__sgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v36, s0
-; GCN-NEXT:    v_mov_b32_e32 v37, s1
-; GCN-NEXT:    v_mov_b32_e32 v38, s2
-; GCN-NEXT:    v_mov_b32_e32 v39, s3
-; GCN-NEXT:    v_mov_b32_e32 v13, s25
-; GCN-NEXT:    v_mov_b32_e32 v14, s26
-; GCN-NEXT:    v_mov_b32_e32 v15, s27
-; GCN-NEXT:    v_mov_b32_e32 v16, s28
-; GCN-NEXT:    v_mov_b32_e32 v17, s29
-; GCN-NEXT:    v_mov_b32_e32 v28, s16
-; GCN-NEXT:    v_mov_b32_e32 v29, s17
-; GCN-NEXT:    v_mov_b32_e32 v30, s18
-; GCN-NEXT:    v_mov_b32_e32 v31, s19
-; GCN-NEXT:    v_mov_b32_e32 v32, s20
-; GCN-NEXT:    v_mov_b32_e32 v33, s21
-; GCN-NEXT:    v_mov_b32_e32 v34, s22
-; GCN-NEXT:    v_mov_b32_e32 v35, s23
-; GCN-NEXT:    v_mov_b32_e32 v12, s24
-; GCN-NEXT:    v_mov_b32_e32 v18, v0
-; GCN-NEXT:    v_mov_b32_e32 v19, v1
-; GCN-NEXT:    v_mov_b32_e32 v20, v2
-; GCN-NEXT:    v_mov_b32_e32 v21, v3
-; GCN-NEXT:    v_mov_b32_e32 v22, v4
-; GCN-NEXT:    v_mov_b32_e32 v23, v5
-; GCN-NEXT:    v_mov_b32_e32 v24, v6
-; GCN-NEXT:    v_mov_b32_e32 v25, v7
-; GCN-NEXT:    v_mov_b32_e32 v26, v8
-; GCN-NEXT:    v_mov_b32_e32 v27, v9
+; GCN-NEXT:    v_mov_b32_e32 v48, s0
+; GCN-NEXT:    v_mov_b32_e32 v49, s1
+; GCN-NEXT:    v_mov_b32_e32 v50, s2
+; GCN-NEXT:    v_mov_b32_e32 v51, s3
+; GCN-NEXT:    v_mov_b32_e32 v21, s29
+; GCN-NEXT:    v_mov_b32_e32 v32, s16
+; GCN-NEXT:    v_mov_b32_e32 v33, s17
+; GCN-NEXT:    v_mov_b32_e32 v34, s18
+; GCN-NEXT:    v_mov_b32_e32 v35, s19
+; GCN-NEXT:    v_mov_b32_e32 v36, s20
+; GCN-NEXT:    v_mov_b32_e32 v37, s21
+; GCN-NEXT:    v_mov_b32_e32 v38, s22
+; GCN-NEXT:    v_mov_b32_e32 v39, s23
+; GCN-NEXT:    v_mov_b32_e32 v16, s24
+; GCN-NEXT:    v_mov_b32_e32 v17, s25
+; GCN-NEXT:    v_mov_b32_e32 v18, s26
+; GCN-NEXT:    v_mov_b32_e32 v19, s27
+; GCN-NEXT:    v_mov_b32_e32 v20, s28
+; GCN-NEXT:    v_mov_b32_e32 v22, v0
+; GCN-NEXT:    v_mov_b32_e32 v23, v1
+; GCN-NEXT:    v_mov_b32_e32 v24, v2
+; GCN-NEXT:    v_mov_b32_e32 v25, v3
+; GCN-NEXT:    v_mov_b32_e32 v26, v4
+; GCN-NEXT:    v_mov_b32_e32 v27, v5
+; GCN-NEXT:    v_mov_b32_e32 v28, v6
+; GCN-NEXT:    v_mov_b32_e32 v29, v7
+; GCN-NEXT:    v_mov_b32_e32 v30, v8
+; GCN-NEXT:    v_mov_b32_e32 v31, v9
 ; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[36:39], v[28:35], v10
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[16:31], v[48:51], v[32:39], v10
 ; GCN-NEXT:    s_nop 11
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    v_mov_b32_e32 v4, v16
-; GCN-NEXT:    v_mov_b32_e32 v5, v17
-; GCN-NEXT:    v_mov_b32_e32 v6, v18
-; GCN-NEXT:    v_mov_b32_e32 v7, v19
-; GCN-NEXT:    v_mov_b32_e32 v8, v20
-; GCN-NEXT:    v_mov_b32_e32 v9, v21
-; GCN-NEXT:    v_mov_b32_e32 v10, v22
-; GCN-NEXT:    v_mov_b32_e32 v11, v23
-; GCN-NEXT:    v_mov_b32_e32 v12, v24
-; GCN-NEXT:    v_mov_b32_e32 v13, v25
-; GCN-NEXT:    v_mov_b32_e32 v14, v26
-; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
+; GCN-NEXT:    v_mov_b32_e32 v4, v20
+; GCN-NEXT:    v_mov_b32_e32 v5, v21
+; GCN-NEXT:    v_mov_b32_e32 v6, v22
+; GCN-NEXT:    v_mov_b32_e32 v7, v23
+; GCN-NEXT:    v_mov_b32_e32 v8, v24
+; GCN-NEXT:    v_mov_b32_e32 v9, v25
+; GCN-NEXT:    v_mov_b32_e32 v10, v26
+; GCN-NEXT:    v_mov_b32_e32 v11, v27
+; GCN-NEXT:    v_mov_b32_e32 v12, v28
+; GCN-NEXT:    v_mov_b32_e32 v13, v29
+; GCN-NEXT:    v_mov_b32_e32 v14, v30
+; GCN-NEXT:    v_mov_b32_e32 v15, v31
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
@@ -953,25 +1055,25 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x
 ; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v10, s0
-; SDAG-NEXT:    v_mov_b32_e32 v11, s1
-; SDAG-NEXT:    v_mov_b32_e32 v12, s2
-; SDAG-NEXT:    v_mov_b32_e32 v13, s3
-; SDAG-NEXT:    v_mov_b32_e32 v2, s16
-; SDAG-NEXT:    v_mov_b32_e32 v3, s17
-; SDAG-NEXT:    v_mov_b32_e32 v4, s18
-; SDAG-NEXT:    v_mov_b32_e32 v5, s19
-; SDAG-NEXT:    v_mov_b32_e32 v6, s20
-; SDAG-NEXT:    v_mov_b32_e32 v7, s21
-; SDAG-NEXT:    v_mov_b32_e32 v8, s22
-; SDAG-NEXT:    v_mov_b32_e32 v9, s23
+; SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; SDAG-NEXT:    v_mov_b32_e32 v3, s3
+; SDAG-NEXT:    v_mov_b32_e32 v6, s16
+; SDAG-NEXT:    v_mov_b32_e32 v7, s17
+; SDAG-NEXT:    v_mov_b32_e32 v8, s18
+; SDAG-NEXT:    v_mov_b32_e32 v9, s19
+; SDAG-NEXT:    v_mov_b32_e32 v10, s20
+; SDAG-NEXT:    v_mov_b32_e32 v11, s21
+; SDAG-NEXT:    v_mov_b32_e32 v12, s22
+; SDAG-NEXT:    v_mov_b32_e32 v13, s23
 ; SDAG-NEXT:    v_accvgpr_write_b32 a0, s24
 ; SDAG-NEXT:    v_accvgpr_write_b32 a1, s25
 ; SDAG-NEXT:    v_accvgpr_write_b32 a2, s26
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT:    v_mov_b32_e32 v0, s28
+; SDAG-NEXT:    v_mov_b32_e32 v4, s28
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_i32_16x16x128_i8 a[0:3], v[10:13], v[2:9], v0
+; SDAG-NEXT:    v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[6:13], v4
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -1089,24 +1191,41 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1,
 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8:
@@ -1151,24 +1270,41 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32
 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
@@ -1213,24 +1349,41 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32
 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
@@ -1275,53 +1428,53 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x
 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v36, s0
-; SDAG-NEXT:    v_mov_b32_e32 v37, s1
-; SDAG-NEXT:    v_mov_b32_e32 v38, s2
-; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v13, s25
-; SDAG-NEXT:    v_mov_b32_e32 v14, s26
-; SDAG-NEXT:    v_mov_b32_e32 v15, s27
-; SDAG-NEXT:    v_mov_b32_e32 v16, s28
-; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_mov_b32_e32 v28, s16
-; SDAG-NEXT:    v_mov_b32_e32 v29, s17
-; SDAG-NEXT:    v_mov_b32_e32 v30, s18
-; SDAG-NEXT:    v_mov_b32_e32 v31, s19
-; SDAG-NEXT:    v_mov_b32_e32 v32, s20
-; SDAG-NEXT:    v_mov_b32_e32 v33, s21
-; SDAG-NEXT:    v_mov_b32_e32 v34, s22
-; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v48, s0
+; SDAG-NEXT:    v_mov_b32_e32 v49, s1
+; SDAG-NEXT:    v_mov_b32_e32 v50, s2
+; SDAG-NEXT:    v_mov_b32_e32 v51, s3
+; SDAG-NEXT:    v_mov_b32_e32 v21, s29
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
+; SDAG-NEXT:    v_mov_b32_e32 v36, s20
+; SDAG-NEXT:    v_mov_b32_e32 v37, s21
+; SDAG-NEXT:    v_mov_b32_e32 v38, s22
+; SDAG-NEXT:    v_mov_b32_e32 v39, s23
+; SDAG-NEXT:    v_mov_b32_e32 v16, s24
+; SDAG-NEXT:    v_mov_b32_e32 v17, s25
+; SDAG-NEXT:    v_mov_b32_e32 v18, s26
+; SDAG-NEXT:    v_mov_b32_e32 v19, s27
+; SDAG-NEXT:    v_mov_b32_e32 v20, s28
+; SDAG-NEXT:    v_mov_b32_e32 v22, v0
+; SDAG-NEXT:    v_mov_b32_e32 v23, v1
+; SDAG-NEXT:    v_mov_b32_e32 v24, v2
+; SDAG-NEXT:    v_mov_b32_e32 v25, v3
+; SDAG-NEXT:    v_mov_b32_e32 v26, v4
+; SDAG-NEXT:    v_mov_b32_e32 v27, v5
+; SDAG-NEXT:    v_mov_b32_e32 v28, v6
+; SDAG-NEXT:    v_mov_b32_e32 v29, v7
+; SDAG-NEXT:    v_mov_b32_e32 v30, v8
+; SDAG-NEXT:    v_mov_b32_e32 v31, v9
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[16:31], v[48:51], v[32:39], v10
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_mov_b32_e32 v0, v16
+; SDAG-NEXT:    v_mov_b32_e32 v1, v17
+; SDAG-NEXT:    v_mov_b32_e32 v2, v18
+; SDAG-NEXT:    v_mov_b32_e32 v3, v19
+; SDAG-NEXT:    v_mov_b32_e32 v4, v20
+; SDAG-NEXT:    v_mov_b32_e32 v5, v21
+; SDAG-NEXT:    v_mov_b32_e32 v6, v22
+; SDAG-NEXT:    v_mov_b32_e32 v7, v23
+; SDAG-NEXT:    v_mov_b32_e32 v8, v24
+; SDAG-NEXT:    v_mov_b32_e32 v9, v25
+; SDAG-NEXT:    v_mov_b32_e32 v10, v26
+; SDAG-NEXT:    v_mov_b32_e32 v11, v27
+; SDAG-NEXT:    v_mov_b32_e32 v12, v28
+; SDAG-NEXT:    v_mov_b32_e32 v13, v29
+; SDAG-NEXT:    v_mov_b32_e32 v14, v30
+; SDAG-NEXT:    v_mov_b32_e32 v15, v31
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
@@ -1489,25 +1642,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v10, s0
-; SDAG-NEXT:    v_mov_b32_e32 v11, s1
-; SDAG-NEXT:    v_mov_b32_e32 v12, s2
-; SDAG-NEXT:    v_mov_b32_e32 v13, s3
-; SDAG-NEXT:    v_mov_b32_e32 v2, s16
-; SDAG-NEXT:    v_mov_b32_e32 v3, s17
-; SDAG-NEXT:    v_mov_b32_e32 v4, s18
-; SDAG-NEXT:    v_mov_b32_e32 v5, s19
-; SDAG-NEXT:    v_mov_b32_e32 v6, s20
-; SDAG-NEXT:    v_mov_b32_e32 v7, s21
-; SDAG-NEXT:    v_mov_b32_e32 v8, s22
-; SDAG-NEXT:    v_mov_b32_e32 v9, s23
+; SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; SDAG-NEXT:    v_mov_b32_e32 v3, s3
+; SDAG-NEXT:    v_mov_b32_e32 v6, s16
+; SDAG-NEXT:    v_mov_b32_e32 v7, s17
+; SDAG-NEXT:    v_mov_b32_e32 v8, s18
+; SDAG-NEXT:    v_mov_b32_e32 v9, s19
+; SDAG-NEXT:    v_mov_b32_e32 v10, s20
+; SDAG-NEXT:    v_mov_b32_e32 v11, s21
+; SDAG-NEXT:    v_mov_b32_e32 v12, s22
+; SDAG-NEXT:    v_mov_b32_e32 v13, s23
 ; SDAG-NEXT:    v_accvgpr_write_b32 a0, s24
 ; SDAG-NEXT:    v_accvgpr_write_b32 a1, s25
 ; SDAG-NEXT:    v_accvgpr_write_b32 a2, s26
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT:    v_mov_b32_e32 v0, s28
+; SDAG-NEXT:    v_mov_b32_e32 v4, s28
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[10:13], v[2:9], v0
+; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[6:13], v4
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -1658,25 +1811,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v10, s0
-; SDAG-NEXT:    v_mov_b32_e32 v11, s1
-; SDAG-NEXT:    v_mov_b32_e32 v12, s2
-; SDAG-NEXT:    v_mov_b32_e32 v13, s3
-; SDAG-NEXT:    v_mov_b32_e32 v2, s16
-; SDAG-NEXT:    v_mov_b32_e32 v3, s17
-; SDAG-NEXT:    v_mov_b32_e32 v4, s18
-; SDAG-NEXT:    v_mov_b32_e32 v5, s19
-; SDAG-NEXT:    v_mov_b32_e32 v6, s20
-; SDAG-NEXT:    v_mov_b32_e32 v7, s21
-; SDAG-NEXT:    v_mov_b32_e32 v8, s22
-; SDAG-NEXT:    v_mov_b32_e32 v9, s23
+; SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; SDAG-NEXT:    v_mov_b32_e32 v3, s3
+; SDAG-NEXT:    v_mov_b32_e32 v6, s16
+; SDAG-NEXT:    v_mov_b32_e32 v7, s17
+; SDAG-NEXT:    v_mov_b32_e32 v8, s18
+; SDAG-NEXT:    v_mov_b32_e32 v9, s19
+; SDAG-NEXT:    v_mov_b32_e32 v10, s20
+; SDAG-NEXT:    v_mov_b32_e32 v11, s21
+; SDAG-NEXT:    v_mov_b32_e32 v12, s22
+; SDAG-NEXT:    v_mov_b32_e32 v13, s23
 ; SDAG-NEXT:    v_accvgpr_write_b32 a0, s24
 ; SDAG-NEXT:    v_accvgpr_write_b32 a1, s25
 ; SDAG-NEXT:    v_accvgpr_write_b32 a2, s26
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT:    v_mov_b32_e32 v0, s28
+; SDAG-NEXT:    v_mov_b32_e32 v4, s28
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[10:13], v[2:9], v0
+; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[6:13], v4
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -1827,25 +1980,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v10, s0
-; SDAG-NEXT:    v_mov_b32_e32 v11, s1
-; SDAG-NEXT:    v_mov_b32_e32 v12, s2
-; SDAG-NEXT:    v_mov_b32_e32 v13, s3
-; SDAG-NEXT:    v_mov_b32_e32 v2, s16
-; SDAG-NEXT:    v_mov_b32_e32 v3, s17
-; SDAG-NEXT:    v_mov_b32_e32 v4, s18
-; SDAG-NEXT:    v_mov_b32_e32 v5, s19
-; SDAG-NEXT:    v_mov_b32_e32 v6, s20
-; SDAG-NEXT:    v_mov_b32_e32 v7, s21
-; SDAG-NEXT:    v_mov_b32_e32 v8, s22
-; SDAG-NEXT:    v_mov_b32_e32 v9, s23
+; SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; SDAG-NEXT:    v_mov_b32_e32 v3, s3
+; SDAG-NEXT:    v_mov_b32_e32 v6, s16
+; SDAG-NEXT:    v_mov_b32_e32 v7, s17
+; SDAG-NEXT:    v_mov_b32_e32 v8, s18
+; SDAG-NEXT:    v_mov_b32_e32 v9, s19
+; SDAG-NEXT:    v_mov_b32_e32 v10, s20
+; SDAG-NEXT:    v_mov_b32_e32 v11, s21
+; SDAG-NEXT:    v_mov_b32_e32 v12, s22
+; SDAG-NEXT:    v_mov_b32_e32 v13, s23
 ; SDAG-NEXT:    v_accvgpr_write_b32 a0, s24
 ; SDAG-NEXT:    v_accvgpr_write_b32 a1, s25
 ; SDAG-NEXT:    v_accvgpr_write_b32 a2, s26
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT:    v_mov_b32_e32 v0, s28
+; SDAG-NEXT:    v_mov_b32_e32 v4, s28
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[10:13], v[2:9], v0
+; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[6:13], v4
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -1996,25 +2149,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v10, s0
-; SDAG-NEXT:    v_mov_b32_e32 v11, s1
-; SDAG-NEXT:    v_mov_b32_e32 v12, s2
-; SDAG-NEXT:    v_mov_b32_e32 v13, s3
-; SDAG-NEXT:    v_mov_b32_e32 v2, s16
-; SDAG-NEXT:    v_mov_b32_e32 v3, s17
-; SDAG-NEXT:    v_mov_b32_e32 v4, s18
-; SDAG-NEXT:    v_mov_b32_e32 v5, s19
-; SDAG-NEXT:    v_mov_b32_e32 v6, s20
-; SDAG-NEXT:    v_mov_b32_e32 v7, s21
-; SDAG-NEXT:    v_mov_b32_e32 v8, s22
-; SDAG-NEXT:    v_mov_b32_e32 v9, s23
+; SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; SDAG-NEXT:    v_mov_b32_e32 v3, s3
+; SDAG-NEXT:    v_mov_b32_e32 v6, s16
+; SDAG-NEXT:    v_mov_b32_e32 v7, s17
+; SDAG-NEXT:    v_mov_b32_e32 v8, s18
+; SDAG-NEXT:    v_mov_b32_e32 v9, s19
+; SDAG-NEXT:    v_mov_b32_e32 v10, s20
+; SDAG-NEXT:    v_mov_b32_e32 v11, s21
+; SDAG-NEXT:    v_mov_b32_e32 v12, s22
+; SDAG-NEXT:    v_mov_b32_e32 v13, s23
 ; SDAG-NEXT:    v_accvgpr_write_b32 a0, s24
 ; SDAG-NEXT:    v_accvgpr_write_b32 a1, s25
 ; SDAG-NEXT:    v_accvgpr_write_b32 a2, s26
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT:    v_mov_b32_e32 v0, s28
+; SDAG-NEXT:    v_mov_b32_e32 v4, s28
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[10:13], v[2:9], v0
+; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[6:13], v4
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -2132,24 +2285,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32>
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
@@ -2194,24 +2364,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
@@ -2256,24 +2443,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
@@ -2318,53 +2522,53 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v36, s0
-; SDAG-NEXT:    v_mov_b32_e32 v37, s1
-; SDAG-NEXT:    v_mov_b32_e32 v38, s2
-; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v13, s25
-; SDAG-NEXT:    v_mov_b32_e32 v14, s26
-; SDAG-NEXT:    v_mov_b32_e32 v15, s27
-; SDAG-NEXT:    v_mov_b32_e32 v16, s28
-; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_mov_b32_e32 v28, s16
-; SDAG-NEXT:    v_mov_b32_e32 v29, s17
-; SDAG-NEXT:    v_mov_b32_e32 v30, s18
-; SDAG-NEXT:    v_mov_b32_e32 v31, s19
-; SDAG-NEXT:    v_mov_b32_e32 v32, s20
-; SDAG-NEXT:    v_mov_b32_e32 v33, s21
-; SDAG-NEXT:    v_mov_b32_e32 v34, s22
-; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v48, s0
+; SDAG-NEXT:    v_mov_b32_e32 v49, s1
+; SDAG-NEXT:    v_mov_b32_e32 v50, s2
+; SDAG-NEXT:    v_mov_b32_e32 v51, s3
+; SDAG-NEXT:    v_mov_b32_e32 v21, s29
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
+; SDAG-NEXT:    v_mov_b32_e32 v36, s20
+; SDAG-NEXT:    v_mov_b32_e32 v37, s21
+; SDAG-NEXT:    v_mov_b32_e32 v38, s22
+; SDAG-NEXT:    v_mov_b32_e32 v39, s23
+; SDAG-NEXT:    v_mov_b32_e32 v16, s24
+; SDAG-NEXT:    v_mov_b32_e32 v17, s25
+; SDAG-NEXT:    v_mov_b32_e32 v18, s26
+; SDAG-NEXT:    v_mov_b32_e32 v19, s27
+; SDAG-NEXT:    v_mov_b32_e32 v20, s28
+; SDAG-NEXT:    v_mov_b32_e32 v22, v0
+; SDAG-NEXT:    v_mov_b32_e32 v23, v1
+; SDAG-NEXT:    v_mov_b32_e32 v24, v2
+; SDAG-NEXT:    v_mov_b32_e32 v25, v3
+; SDAG-NEXT:    v_mov_b32_e32 v26, v4
+; SDAG-NEXT:    v_mov_b32_e32 v27, v5
+; SDAG-NEXT:    v_mov_b32_e32 v28, v6
+; SDAG-NEXT:    v_mov_b32_e32 v29, v7
+; SDAG-NEXT:    v_mov_b32_e32 v30, v8
+; SDAG-NEXT:    v_mov_b32_e32 v31, v9
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[16:31], v[48:51], v[32:39], v10
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_mov_b32_e32 v0, v16
+; SDAG-NEXT:    v_mov_b32_e32 v1, v17
+; SDAG-NEXT:    v_mov_b32_e32 v2, v18
+; SDAG-NEXT:    v_mov_b32_e32 v3, v19
+; SDAG-NEXT:    v_mov_b32_e32 v4, v20
+; SDAG-NEXT:    v_mov_b32_e32 v5, v21
+; SDAG-NEXT:    v_mov_b32_e32 v6, v22
+; SDAG-NEXT:    v_mov_b32_e32 v7, v23
+; SDAG-NEXT:    v_mov_b32_e32 v8, v24
+; SDAG-NEXT:    v_mov_b32_e32 v9, v25
+; SDAG-NEXT:    v_mov_b32_e32 v10, v26
+; SDAG-NEXT:    v_mov_b32_e32 v11, v27
+; SDAG-NEXT:    v_mov_b32_e32 v12, v28
+; SDAG-NEXT:    v_mov_b32_e32 v13, v29
+; SDAG-NEXT:    v_mov_b32_e32 v14, v30
+; SDAG-NEXT:    v_mov_b32_e32 v15, v31
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
@@ -2499,24 +2703,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32>
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
@@ -2561,24 +2782,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
@@ -2623,24 +2861,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
@@ -2685,53 +2940,53 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v36, s0
-; SDAG-NEXT:    v_mov_b32_e32 v37, s1
-; SDAG-NEXT:    v_mov_b32_e32 v38, s2
-; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v13, s25
-; SDAG-NEXT:    v_mov_b32_e32 v14, s26
-; SDAG-NEXT:    v_mov_b32_e32 v15, s27
-; SDAG-NEXT:    v_mov_b32_e32 v16, s28
-; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_mov_b32_e32 v28, s16
-; SDAG-NEXT:    v_mov_b32_e32 v29, s17
-; SDAG-NEXT:    v_mov_b32_e32 v30, s18
-; SDAG-NEXT:    v_mov_b32_e32 v31, s19
-; SDAG-NEXT:    v_mov_b32_e32 v32, s20
-; SDAG-NEXT:    v_mov_b32_e32 v33, s21
-; SDAG-NEXT:    v_mov_b32_e32 v34, s22
-; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v48, s0
+; SDAG-NEXT:    v_mov_b32_e32 v49, s1
+; SDAG-NEXT:    v_mov_b32_e32 v50, s2
+; SDAG-NEXT:    v_mov_b32_e32 v51, s3
+; SDAG-NEXT:    v_mov_b32_e32 v21, s29
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
+; SDAG-NEXT:    v_mov_b32_e32 v36, s20
+; SDAG-NEXT:    v_mov_b32_e32 v37, s21
+; SDAG-NEXT:    v_mov_b32_e32 v38, s22
+; SDAG-NEXT:    v_mov_b32_e32 v39, s23
+; SDAG-NEXT:    v_mov_b32_e32 v16, s24
+; SDAG-NEXT:    v_mov_b32_e32 v17, s25
+; SDAG-NEXT:    v_mov_b32_e32 v18, s26
+; SDAG-NEXT:    v_mov_b32_e32 v19, s27
+; SDAG-NEXT:    v_mov_b32_e32 v20, s28
+; SDAG-NEXT:    v_mov_b32_e32 v22, v0
+; SDAG-NEXT:    v_mov_b32_e32 v23, v1
+; SDAG-NEXT:    v_mov_b32_e32 v24, v2
+; SDAG-NEXT:    v_mov_b32_e32 v25, v3
+; SDAG-NEXT:    v_mov_b32_e32 v26, v4
+; SDAG-NEXT:    v_mov_b32_e32 v27, v5
+; SDAG-NEXT:    v_mov_b32_e32 v28, v6
+; SDAG-NEXT:    v_mov_b32_e32 v29, v7
+; SDAG-NEXT:    v_mov_b32_e32 v30, v8
+; SDAG-NEXT:    v_mov_b32_e32 v31, v9
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[16:31], v[48:51], v[32:39], v10
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_mov_b32_e32 v0, v16
+; SDAG-NEXT:    v_mov_b32_e32 v1, v17
+; SDAG-NEXT:    v_mov_b32_e32 v2, v18
+; SDAG-NEXT:    v_mov_b32_e32 v3, v19
+; SDAG-NEXT:    v_mov_b32_e32 v4, v20
+; SDAG-NEXT:    v_mov_b32_e32 v5, v21
+; SDAG-NEXT:    v_mov_b32_e32 v6, v22
+; SDAG-NEXT:    v_mov_b32_e32 v7, v23
+; SDAG-NEXT:    v_mov_b32_e32 v8, v24
+; SDAG-NEXT:    v_mov_b32_e32 v9, v25
+; SDAG-NEXT:    v_mov_b32_e32 v10, v26
+; SDAG-NEXT:    v_mov_b32_e32 v11, v27
+; SDAG-NEXT:    v_mov_b32_e32 v12, v28
+; SDAG-NEXT:    v_mov_b32_e32 v13, v29
+; SDAG-NEXT:    v_mov_b32_e32 v14, v30
+; SDAG-NEXT:    v_mov_b32_e32 v15, v31
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
@@ -2866,24 +3121,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32>
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
@@ -2928,24 +3200,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
@@ -2990,24 +3279,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
@@ -3052,53 +3358,53 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v36, s0
-; SDAG-NEXT:    v_mov_b32_e32 v37, s1
-; SDAG-NEXT:    v_mov_b32_e32 v38, s2
-; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v13, s25
-; SDAG-NEXT:    v_mov_b32_e32 v14, s26
-; SDAG-NEXT:    v_mov_b32_e32 v15, s27
-; SDAG-NEXT:    v_mov_b32_e32 v16, s28
-; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_mov_b32_e32 v28, s16
-; SDAG-NEXT:    v_mov_b32_e32 v29, s17
-; SDAG-NEXT:    v_mov_b32_e32 v30, s18
-; SDAG-NEXT:    v_mov_b32_e32 v31, s19
-; SDAG-NEXT:    v_mov_b32_e32 v32, s20
-; SDAG-NEXT:    v_mov_b32_e32 v33, s21
-; SDAG-NEXT:    v_mov_b32_e32 v34, s22
-; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v48, s0
+; SDAG-NEXT:    v_mov_b32_e32 v49, s1
+; SDAG-NEXT:    v_mov_b32_e32 v50, s2
+; SDAG-NEXT:    v_mov_b32_e32 v51, s3
+; SDAG-NEXT:    v_mov_b32_e32 v21, s29
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
+; SDAG-NEXT:    v_mov_b32_e32 v36, s20
+; SDAG-NEXT:    v_mov_b32_e32 v37, s21
+; SDAG-NEXT:    v_mov_b32_e32 v38, s22
+; SDAG-NEXT:    v_mov_b32_e32 v39, s23
+; SDAG-NEXT:    v_mov_b32_e32 v16, s24
+; SDAG-NEXT:    v_mov_b32_e32 v17, s25
+; SDAG-NEXT:    v_mov_b32_e32 v18, s26
+; SDAG-NEXT:    v_mov_b32_e32 v19, s27
+; SDAG-NEXT:    v_mov_b32_e32 v20, s28
+; SDAG-NEXT:    v_mov_b32_e32 v22, v0
+; SDAG-NEXT:    v_mov_b32_e32 v23, v1
+; SDAG-NEXT:    v_mov_b32_e32 v24, v2
+; SDAG-NEXT:    v_mov_b32_e32 v25, v3
+; SDAG-NEXT:    v_mov_b32_e32 v26, v4
+; SDAG-NEXT:    v_mov_b32_e32 v27, v5
+; SDAG-NEXT:    v_mov_b32_e32 v28, v6
+; SDAG-NEXT:    v_mov_b32_e32 v29, v7
+; SDAG-NEXT:    v_mov_b32_e32 v30, v8
+; SDAG-NEXT:    v_mov_b32_e32 v31, v9
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[16:31], v[48:51], v[32:39], v10
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_mov_b32_e32 v0, v16
+; SDAG-NEXT:    v_mov_b32_e32 v1, v17
+; SDAG-NEXT:    v_mov_b32_e32 v2, v18
+; SDAG-NEXT:    v_mov_b32_e32 v3, v19
+; SDAG-NEXT:    v_mov_b32_e32 v4, v20
+; SDAG-NEXT:    v_mov_b32_e32 v5, v21
+; SDAG-NEXT:    v_mov_b32_e32 v6, v22
+; SDAG-NEXT:    v_mov_b32_e32 v7, v23
+; SDAG-NEXT:    v_mov_b32_e32 v8, v24
+; SDAG-NEXT:    v_mov_b32_e32 v9, v25
+; SDAG-NEXT:    v_mov_b32_e32 v10, v26
+; SDAG-NEXT:    v_mov_b32_e32 v11, v27
+; SDAG-NEXT:    v_mov_b32_e32 v12, v28
+; SDAG-NEXT:    v_mov_b32_e32 v13, v29
+; SDAG-NEXT:    v_mov_b32_e32 v14, v30
+; SDAG-NEXT:    v_mov_b32_e32 v15, v31
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
@@ -3233,24 +3539,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32>
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
@@ -3295,24 +3618,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
@@ -3357,24 +3697,41 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
@@ -3419,53 +3776,53 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v36, s0
-; SDAG-NEXT:    v_mov_b32_e32 v37, s1
-; SDAG-NEXT:    v_mov_b32_e32 v38, s2
-; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v13, s25
-; SDAG-NEXT:    v_mov_b32_e32 v14, s26
-; SDAG-NEXT:    v_mov_b32_e32 v15, s27
-; SDAG-NEXT:    v_mov_b32_e32 v16, s28
-; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_mov_b32_e32 v28, s16
-; SDAG-NEXT:    v_mov_b32_e32 v29, s17
-; SDAG-NEXT:    v_mov_b32_e32 v30, s18
-; SDAG-NEXT:    v_mov_b32_e32 v31, s19
-; SDAG-NEXT:    v_mov_b32_e32 v32, s20
-; SDAG-NEXT:    v_mov_b32_e32 v33, s21
-; SDAG-NEXT:    v_mov_b32_e32 v34, s22
-; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v48, s0
+; SDAG-NEXT:    v_mov_b32_e32 v49, s1
+; SDAG-NEXT:    v_mov_b32_e32 v50, s2
+; SDAG-NEXT:    v_mov_b32_e32 v51, s3
+; SDAG-NEXT:    v_mov_b32_e32 v21, s29
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
+; SDAG-NEXT:    v_mov_b32_e32 v36, s20
+; SDAG-NEXT:    v_mov_b32_e32 v37, s21
+; SDAG-NEXT:    v_mov_b32_e32 v38, s22
+; SDAG-NEXT:    v_mov_b32_e32 v39, s23
+; SDAG-NEXT:    v_mov_b32_e32 v16, s24
+; SDAG-NEXT:    v_mov_b32_e32 v17, s25
+; SDAG-NEXT:    v_mov_b32_e32 v18, s26
+; SDAG-NEXT:    v_mov_b32_e32 v19, s27
+; SDAG-NEXT:    v_mov_b32_e32 v20, s28
+; SDAG-NEXT:    v_mov_b32_e32 v22, v0
+; SDAG-NEXT:    v_mov_b32_e32 v23, v1
+; SDAG-NEXT:    v_mov_b32_e32 v24, v2
+; SDAG-NEXT:    v_mov_b32_e32 v25, v3
+; SDAG-NEXT:    v_mov_b32_e32 v26, v4
+; SDAG-NEXT:    v_mov_b32_e32 v27, v5
+; SDAG-NEXT:    v_mov_b32_e32 v28, v6
+; SDAG-NEXT:    v_mov_b32_e32 v29, v7
+; SDAG-NEXT:    v_mov_b32_e32 v30, v8
+; SDAG-NEXT:    v_mov_b32_e32 v31, v9
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[16:31], v[48:51], v[32:39], v10
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_mov_b32_e32 v0, v16
+; SDAG-NEXT:    v_mov_b32_e32 v1, v17
+; SDAG-NEXT:    v_mov_b32_e32 v2, v18
+; SDAG-NEXT:    v_mov_b32_e32 v3, v19
+; SDAG-NEXT:    v_mov_b32_e32 v4, v20
+; SDAG-NEXT:    v_mov_b32_e32 v5, v21
+; SDAG-NEXT:    v_mov_b32_e32 v6, v22
+; SDAG-NEXT:    v_mov_b32_e32 v7, v23
+; SDAG-NEXT:    v_mov_b32_e32 v8, v24
+; SDAG-NEXT:    v_mov_b32_e32 v9, v25
+; SDAG-NEXT:    v_mov_b32_e32 v10, v26
+; SDAG-NEXT:    v_mov_b32_e32 v11, v27
+; SDAG-NEXT:    v_mov_b32_e32 v12, v28
+; SDAG-NEXT:    v_mov_b32_e32 v13, v29
+; SDAG-NEXT:    v_mov_b32_e32 v14, v30
+; SDAG-NEXT:    v_mov_b32_e32 v15, v31
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
index b9e9893ede4e2..9fc3afb33a73a 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
@@ -373,7 +373,7 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 {
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mov_b32_e32 v22, 0x7fc00000
 ; CHECK-NEXT:    v_mov_b64_e32 v[6:7], s[2:3]
 ; CHECK-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
 ; CHECK-NEXT:    s_mov_b32 s0, 0x3c003c00
@@ -382,69 +382,65 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 {
 ; CHECK-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
 ; CHECK-NEXT:    s_mov_b32 s0, 0x7e007e00
 ; CHECK-NEXT:    s_mov_b32 s1, s0
-; CHECK-NEXT:    v_mov_b64_e32 v[10:11], s[0:1]
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[12:13], v[4:7]
-; CHECK-NEXT:    s_nop 1
+; CHECK-NEXT:    v_mov_b32_e32 v23, v22
+; CHECK-NEXT:    v_mov_b32_e32 v24, v22
+; CHECK-NEXT:    v_mov_b32_e32 v25, v22
 ; CHECK-NEXT:    v_accvgpr_write_b32 a0, v0
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[18:21], v[8:9], v[10:11], v[4:7]
+; CHECK-NEXT:    v_mov_b64_e32 v[10:11], s[0:1]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[22:25], v[8:9], v[8:9], v[22:25]
 ; CHECK-NEXT:    v_accvgpr_write_b32 a1, v1
 ; CHECK-NEXT:    v_accvgpr_write_b32 a2, v2
 ; CHECK-NEXT:    v_accvgpr_write_b32 a3, v3
-; CHECK-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; CHECK-NEXT:    v_mov_b32_e32 v5, v4
-; CHECK-NEXT:    v_mov_b32_e32 v6, v4
-; CHECK-NEXT:    v_mov_b32_e32 v7, v4
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[8:9], v[14:17]
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[22:25], v[8:9], v[8:9], v[4:7]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[18:21], v[8:9], v[10:11], v[4:7]
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def v[4:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[12:13], v[4:7]
+; CHECK-NEXT:    v_mov_b64_e32 v[30:31], 0
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[26:29], v[8:9], v[8:9], v[4:7]
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[8:9], v[0:3]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[12:13], v[4:7]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[8:9], v[14:17]
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[22:25], v[8:9], v[8:9], v[22:25]
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[4:7], v[8:9], v[8:9], v[26:29]
 ; CHECK-NEXT:    s_nop 5
 ; CHECK-NEXT:    v_cvt_f16_f32_e32 v23, v14
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[8:9], v[18:21]
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[12:13], v[8:9], v[0:3]
-; CHECK-NEXT:    s_nop 1
+; CHECK-NEXT:    global_store_short v[30:31], v23, off
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[8:9], v[0:3]
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    v_accvgpr_read_b32 v19, a3
 ; CHECK-NEXT:    v_accvgpr_read_b32 v18, a2
-; CHECK-NEXT:    v_mov_b64_e32 v[20:21], 0
-; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_wbl2 sc0 sc1
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    buffer_inv sc0 sc1
 ; CHECK-NEXT:    v_accvgpr_read_b32 v17, a1
 ; CHECK-NEXT:    v_accvgpr_read_b32 v16, a0
 ; CHECK-NEXT:    v_cvt_f16_f32_e32 v15, v22
 ; CHECK-NEXT:    v_cvt_f16_f32_e32 v14, v14
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[16:19], v[8:9], v[8:9], v[16:19]
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v12, v0
-; CHECK-NEXT:    global_store_short v[20:21], v23, off
+; CHECK-NEXT:    global_store_short v[30:31], v15, off
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[10:11], v[8:9], v[4:7]
-; CHECK-NEXT:    global_store_short v[20:21], v15, off
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[12:13], v[8:9], v[0:3]
+; CHECK-NEXT:    global_store_short v[30:31], v14, off
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    global_store_short v[20:21], v14, off
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[8:11], v[10:11], v[8:9], v[4:7]
 ; CHECK-NEXT:    v_cvt_f16_f32_e32 v14, v16
+; CHECK-NEXT:    global_store_short v[30:31], v14, off
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v12, v0
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    global_store_short v[20:21], v14, off
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CHECK-NEXT:    buffer_wbl2 sc0 sc1
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    global_store_short v[20:21], v12, off
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v13, v8
+; CHECK-NEXT:    global_store_short v[30:31], v12, off
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    global_store_short v[20:21], v0, off
+; CHECK-NEXT:    global_store_short v[30:31], v13, off
 ; CHECK-NEXT:    s_endpgm
 entry:
   %k0 = call <4 x float> asm sideeffect "; def $0", "=s"()
@@ -514,13 +510,13 @@ define void @test_rewrite_mfma_subreg_insert1(float %arg0, float %arg1, ptr addr
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[2:5]
+; CHECK-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[6:9], v0, v1, v[2:5]
 ; CHECK-NEXT:    s_nop 3
-; CHECK-NEXT:    v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; CHECK-NEXT:    v_pk_mov_b32 v[0:1], v[6:7], v[8:9] op_sel:[1,0]
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    v_accvgpr_write_b32 a0, v0
 ; CHECK-NEXT:    v_accvgpr_write_b32 a1, v1
-; CHECK-NEXT:    v_accvgpr_write_b32 a2, v3
+; CHECK-NEXT:    v_accvgpr_write_b32 a2, v9
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use a[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
@@ -642,46 +638,14 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add
 ; CHECK-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
 ; CHECK-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1]
 ; CHECK-NEXT:    s_nop 7
-; CHECK-NEXT:    v_accvgpr_read_b32 v0, a0
-; CHECK-NEXT:    v_accvgpr_read_b32 v24, a24
-; CHECK-NEXT:    v_accvgpr_read_b32 v25, a25
-; CHECK-NEXT:    v_accvgpr_read_b32 v26, a26
-; CHECK-NEXT:    v_accvgpr_read_b32 v27, a27
-; CHECK-NEXT:    v_accvgpr_read_b32 v1, a1
-; CHECK-NEXT:    v_accvgpr_read_b32 v2, a2
-; CHECK-NEXT:    v_accvgpr_read_b32 v3, a3
-; CHECK-NEXT:    v_accvgpr_read_b32 v4, a4
-; CHECK-NEXT:    v_accvgpr_read_b32 v5, a5
-; CHECK-NEXT:    v_accvgpr_read_b32 v6, a6
-; CHECK-NEXT:    v_accvgpr_read_b32 v7, a7
-; CHECK-NEXT:    v_accvgpr_read_b32 v8, a8
-; CHECK-NEXT:    v_accvgpr_read_b32 v9, a9
-; CHECK-NEXT:    v_accvgpr_read_b32 v10, a10
-; CHECK-NEXT:    v_accvgpr_read_b32 v11, a11
-; CHECK-NEXT:    v_accvgpr_read_b32 v12, a12
-; CHECK-NEXT:    v_accvgpr_read_b32 v13, a13
-; CHECK-NEXT:    v_accvgpr_read_b32 v14, a14
-; CHECK-NEXT:    v_accvgpr_read_b32 v15, a15
-; CHECK-NEXT:    v_accvgpr_read_b32 v16, a16
-; CHECK-NEXT:    v_accvgpr_read_b32 v17, a17
-; CHECK-NEXT:    v_accvgpr_read_b32 v18, a18
-; CHECK-NEXT:    v_accvgpr_read_b32 v19, a19
-; CHECK-NEXT:    v_accvgpr_read_b32 v20, a20
-; CHECK-NEXT:    v_accvgpr_read_b32 v21, a21
-; CHECK-NEXT:    v_accvgpr_read_b32 v22, a22
-; CHECK-NEXT:    v_accvgpr_read_b32 v23, a23
-; CHECK-NEXT:    v_accvgpr_read_b32 v28, a28
-; CHECK-NEXT:    v_accvgpr_read_b32 v29, a29
-; CHECK-NEXT:    v_accvgpr_read_b32 v30, a30
-; CHECK-NEXT:    v_accvgpr_read_b32 v31, a31
-; CHECK-NEXT:    global_store_dwordx4 v32, v[24:27], s[2:3] offset:96
-; CHECK-NEXT:    global_store_dwordx4 v32, v[28:31], s[2:3] offset:112
-; CHECK-NEXT:    global_store_dwordx4 v32, v[16:19], s[2:3] offset:64
-; CHECK-NEXT:    global_store_dwordx4 v32, v[20:23], s[2:3] offset:80
-; CHECK-NEXT:    global_store_dwordx4 v32, v[8:11], s[2:3] offset:32
-; CHECK-NEXT:    global_store_dwordx4 v32, v[12:15], s[2:3] offset:48
-; CHECK-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3]
-; CHECK-NEXT:    global_store_dwordx4 v32, v[4:7], s[2:3] offset:16
+; CHECK-NEXT:    global_store_dwordx4 v32, a[24:27], s[2:3] offset:96
+; CHECK-NEXT:    global_store_dwordx4 v32, a[28:31], s[2:3] offset:112
+; CHECK-NEXT:    global_store_dwordx4 v32, a[16:19], s[2:3] offset:64
+; CHECK-NEXT:    global_store_dwordx4 v32, a[20:23], s[2:3] offset:80
+; CHECK-NEXT:    global_store_dwordx4 v32, a[8:11], s[2:3] offset:32
+; CHECK-NEXT:    global_store_dwordx4 v32, a[12:15], s[2:3] offset:48
+; CHECK-NEXT:    global_store_dwordx4 v32, a[0:3], s[2:3]
+; CHECK-NEXT:    global_store_dwordx4 v32, a[4:7], s[2:3] offset:16
 ; CHECK-NEXT:    s_endpgm
   %src2 = call <32 x float> asm sideeffect "; def $0", "=a"()
   %mai0 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 2.0, float 4.0, <32 x float> %src2, i32 0, i32 0, i32 0)
@@ -763,15 +727,15 @@ define void @test_rewrite_mfma_copy_from_agpr_class_f64_4x4x4f64_chain(double %a
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_and_b32_e32 v12, 0x3ff, v31
 ; CHECK-NEXT:    v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], v[2:3], a[0:1]
-; CHECK-NEXT:    v_and_b32_e32 v2, 0x3ff, v31
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0
-; CHECK-NEXT:    v_lshl_add_u64 v[2:3], v[8:9], 0, v[2:3]
+; CHECK-NEXT:    s_nop 3
 ; CHECK-NEXT:    v_mfma_f64_4x4x4_4b_f64 a[0:1], v[4:5], v[6:7], a[0:1]
-; CHECK-NEXT:    s_nop 8
-; CHECK-NEXT:    global_store_dwordx2 v[2:3], a[0:1], off
+; CHECK-NEXT:    v_lshlrev_b32_e32 v4, 3, v12
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0
+; CHECK-NEXT:    v_lshl_add_u64 v[4:5], v[8:9], 0, v[4:5]
+; CHECK-NEXT:    s_nop 5
+; CHECK-NEXT:    global_store_dwordx2 v[4:5], a[0:1], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %src2 = call double asm sideeffect "; def $0", "=a"()
diff --git a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
index a81d9a458e23a..08f89b32edb20 100644
--- a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
@@ -101,8 +101,13 @@ define void @eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg,
 ; CHECK-NEXT:    v_accvgpr_read_b32 v2, a2
 ; CHECK-NEXT:    v_accvgpr_read_b32 v3, a3
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def v[6:9]
+; CHECK-NEXT:    ; def v[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:31]
@@ -112,37 +117,75 @@ define void @eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg,
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    global_store_dwordx4 v0, v[60:63], s[16:17] offset:112
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[56:59], s[16:17] offset:96
+; CHECK-NEXT:    v_accvgpr_read_b32 v0, a32
+; CHECK-NEXT:    v_mov_b32_e32 v60, 0
+; CHECK-NEXT:    v_accvgpr_read_b32 v24, a56
+; CHECK-NEXT:    v_accvgpr_read_b32 v25, a57
+; CHECK-NEXT:    v_accvgpr_read_b32 v26, a58
+; CHECK-NEXT:    v_accvgpr_read_b32 v27, a59
+; CHECK-NEXT:    global_store_dwordx4 v60, v[56:59], s[16:17] offset:96
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[52:55], s[16:17] offset:80
+; CHECK-NEXT:    global_store_dwordx4 v60, v[52:55], s[16:17] offset:80
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[48:51], s[16:17] offset:64
+; CHECK-NEXT:    global_store_dwordx4 v60, v[48:51], s[16:17] offset:64
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[44:47], s[16:17] offset:48
+; CHECK-NEXT:    global_store_dwordx4 v60, v[44:47], s[16:17] offset:48
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[40:43], s[16:17] offset:32
+; CHECK-NEXT:    global_store_dwordx4 v60, v[40:43], s[16:17] offset:32
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[36:39], s[16:17] offset:16
+; CHECK-NEXT:    global_store_dwordx4 v60, v[36:39], s[16:17] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[32:35], s[16:17]
+; CHECK-NEXT:    global_store_dwordx4 v60, v[32:35], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[56:59], s[16:17] offset:96
+; CHECK-NEXT:    v_accvgpr_read_b32 v1, a33
+; CHECK-NEXT:    v_accvgpr_read_b32 v2, a34
+; CHECK-NEXT:    v_accvgpr_read_b32 v3, a35
+; CHECK-NEXT:    v_accvgpr_read_b32 v4, a36
+; CHECK-NEXT:    v_accvgpr_read_b32 v5, a37
+; CHECK-NEXT:    v_accvgpr_read_b32 v6, a38
+; CHECK-NEXT:    v_accvgpr_read_b32 v7, a39
+; CHECK-NEXT:    v_accvgpr_read_b32 v8, a40
+; CHECK-NEXT:    v_accvgpr_read_b32 v9, a41
+; CHECK-NEXT:    v_accvgpr_read_b32 v10, a42
+; CHECK-NEXT:    v_accvgpr_read_b32 v11, a43
+; CHECK-NEXT:    v_accvgpr_read_b32 v12, a44
+; CHECK-NEXT:    v_accvgpr_read_b32 v13, a45
+; CHECK-NEXT:    v_accvgpr_read_b32 v14, a46
+; CHECK-NEXT:    v_accvgpr_read_b32 v15, a47
+; CHECK-NEXT:    v_accvgpr_read_b32 v16, a48
+; CHECK-NEXT:    v_accvgpr_read_b32 v17, a49
+; CHECK-NEXT:    v_accvgpr_read_b32 v18, a50
+; CHECK-NEXT:    v_accvgpr_read_b32 v19, a51
+; CHECK-NEXT:    v_accvgpr_read_b32 v20, a52
+; CHECK-NEXT:    v_accvgpr_read_b32 v21, a53
+; CHECK-NEXT:    v_accvgpr_read_b32 v22, a54
+; CHECK-NEXT:    v_accvgpr_read_b32 v23, a55
+; CHECK-NEXT:    v_accvgpr_read_b32 v28, a60
+; CHECK-NEXT:    v_accvgpr_read_b32 v29, a61
+; CHECK-NEXT:    v_accvgpr_read_b32 v30, a62
+; CHECK-NEXT:    v_accvgpr_read_b32 v31, a63
+; CHECK-NEXT:    global_store_dwordx4 v60, v[24:27], s[16:17] offset:96
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[60:63], s[16:17] offset:112
+; CHECK-NEXT:    global_store_dwordx4 v60, v[28:31], s[16:17] offset:112
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[48:51], s[16:17] offset:64
+; CHECK-NEXT:    global_store_dwordx4 v60, v[16:19], s[16:17] offset:64
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[52:55], s[16:17] offset:80
+; CHECK-NEXT:    global_store_dwordx4 v60, v[20:23], s[16:17] offset:80
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[40:43], s[16:17] offset:32
+; CHECK-NEXT:    global_store_dwordx4 v60, v[8:11], s[16:17] offset:32
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[44:47], s[16:17] offset:48
+; CHECK-NEXT:    global_store_dwordx4 v60, v[12:15], s[16:17] offset:48
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[32:35], s[16:17]
+; CHECK-NEXT:    global_store_dwordx4 v60, v[0:3], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[36:39], s[16:17] offset:16
+; CHECK-NEXT:    global_store_dwordx4 v60, v[4:7], s[16:17] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[6:9], s[16:17]
+; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v60, v[0:3], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_load_dword a63, off, s[0:3], s32 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -301,16 +344,26 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar
 ; CHECK-NEXT:    v_accvgpr_write_b32 a33, v1
 ; CHECK-NEXT:    v_accvgpr_write_b32 a32, v0
 ; CHECK-NEXT:    v_accvgpr_read_b32 v7, a3
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    v_accvgpr_read_b32 v6, a2
 ; CHECK-NEXT:    v_accvgpr_read_b32 v5, a1
 ; CHECK-NEXT:    v_accvgpr_read_b32 v4, a0
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def v[8:11]
+; CHECK-NEXT:    ; def v[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def v[12:15]
+; CHECK-NEXT:    ; def v[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; CHECK-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:31]
 ; CHECK-NEXT:    ;;#ASMEND
@@ -319,39 +372,82 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    global_store_dwordx4 v0, v[60:63], s[16:17] offset:112
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[56:59], s[16:17] offset:96
+; CHECK-NEXT:    v_accvgpr_read_b32 v0, a32
+; CHECK-NEXT:    v_mov_b32_e32 v60, 0
+; CHECK-NEXT:    v_accvgpr_read_b32 v24, a56
+; CHECK-NEXT:    v_accvgpr_read_b32 v25, a57
+; CHECK-NEXT:    v_accvgpr_read_b32 v26, a58
+; CHECK-NEXT:    v_accvgpr_read_b32 v27, a59
+; CHECK-NEXT:    global_store_dwordx4 v60, v[56:59], s[16:17] offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v60, v[52:55], s[16:17] offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v60, v[48:51], s[16:17] offset:64
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[52:55], s[16:17] offset:80
+; CHECK-NEXT:    global_store_dwordx4 v60, v[44:47], s[16:17] offset:48
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[48:51], s[16:17] offset:64
+; CHECK-NEXT:    global_store_dwordx4 v60, v[40:43], s[16:17] offset:32
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[44:47], s[16:17] offset:48
+; CHECK-NEXT:    global_store_dwordx4 v60, v[36:39], s[16:17] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[40:43], s[16:17] offset:32
+; CHECK-NEXT:    global_store_dwordx4 v60, v[32:35], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[36:39], s[16:17] offset:16
+; CHECK-NEXT:    v_accvgpr_read_b32 v1, a33
+; CHECK-NEXT:    v_accvgpr_read_b32 v2, a34
+; CHECK-NEXT:    v_accvgpr_read_b32 v3, a35
+; CHECK-NEXT:    v_accvgpr_read_b32 v4, a36
+; CHECK-NEXT:    v_accvgpr_read_b32 v5, a37
+; CHECK-NEXT:    v_accvgpr_read_b32 v6, a38
+; CHECK-NEXT:    v_accvgpr_read_b32 v7, a39
+; CHECK-NEXT:    v_accvgpr_read_b32 v8, a40
+; CHECK-NEXT:    v_accvgpr_read_b32 v9, a41
+; CHECK-NEXT:    v_accvgpr_read_b32 v10, a42
+; CHECK-NEXT:    v_accvgpr_read_b32 v11, a43
+; CHECK-NEXT:    v_accvgpr_read_b32 v12, a44
+; CHECK-NEXT:    v_accvgpr_read_b32 v13, a45
+; CHECK-NEXT:    v_accvgpr_read_b32 v14, a46
+; CHECK-NEXT:    v_accvgpr_read_b32 v15, a47
+; CHECK-NEXT:    v_accvgpr_read_b32 v16, a48
+; CHECK-NEXT:    v_accvgpr_read_b32 v17, a49
+; CHECK-NEXT:    v_accvgpr_read_b32 v18, a50
+; CHECK-NEXT:    v_accvgpr_read_b32 v19, a51
+; CHECK-NEXT:    v_accvgpr_read_b32 v20, a52
+; CHECK-NEXT:    v_accvgpr_read_b32 v21, a53
+; CHECK-NEXT:    v_accvgpr_read_b32 v22, a54
+; CHECK-NEXT:    v_accvgpr_read_b32 v23, a55
+; CHECK-NEXT:    v_accvgpr_read_b32 v28, a60
+; CHECK-NEXT:    v_accvgpr_read_b32 v29, a61
+; CHECK-NEXT:    v_accvgpr_read_b32 v30, a62
+; CHECK-NEXT:    v_accvgpr_read_b32 v31, a63
+; CHECK-NEXT:    global_store_dwordx4 v60, v[24:27], s[16:17] offset:96
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[32:35], s[16:17]
+; CHECK-NEXT:    global_store_dwordx4 v60, v[28:31], s[16:17] offset:112
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[56:59], s[16:17] offset:96
+; CHECK-NEXT:    global_store_dwordx4 v60, v[16:19], s[16:17] offset:64
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[60:63], s[16:17] offset:112
+; CHECK-NEXT:    global_store_dwordx4 v60, v[20:23], s[16:17] offset:80
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[48:51], s[16:17] offset:64
+; CHECK-NEXT:    global_store_dwordx4 v60, v[8:11], s[16:17] offset:32
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[52:55], s[16:17] offset:80
+; CHECK-NEXT:    global_store_dwordx4 v60, v[12:15], s[16:17] offset:48
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[40:43], s[16:17] offset:32
+; CHECK-NEXT:    global_store_dwordx4 v60, v[0:3], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[44:47], s[16:17] offset:48
+; CHECK-NEXT:    global_store_dwordx4 v60, v[4:7], s[16:17] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[32:35], s[16:17]
+; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[36:39], s[16:17] offset:16
+; CHECK-NEXT:    global_store_dwordx4 v60, v[0:3], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[8:11], s[16:17]
+; CHECK-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[12:15], s[16:17]
+; CHECK-NEXT:    global_store_dwordx4 v60, v[0:3], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_load_dword a63, off, s[0:3], s32 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload

>From 97dc027565af9e4ece826cb8e6856a22dcd8eb06 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Sun, 21 Sep 2025 02:25:04 -0400
Subject: [PATCH 09/18] Updated mir test

---
 ...amdgcn.mfma.hint.hazard.barrier.gfx942.mir | 1443 +++--------------
 1 file changed, 195 insertions(+), 1248 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir
index 271b36fad2bb4..97305f2c8a8f0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -stop-after=virtregrewriter,2 -amdgpu-avoid-hazard-hint-for-mfma=false %s -o - | FileCheck -check-prefix=GFX942_WITHOUT %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -stop-after=virtregrewriter,2 -amdgpu-avoid-hazard-hint-for-mfma=true %s -o - | FileCheck -check-prefix=GFX942_WITH %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter %s -o - | FileCheck -check-prefix=CHECK %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter -amdgpu-avoid-hazard-hint-for-mfma=false %s -o - | FileCheck -check-prefix=CHECK-NO-ANTIHINT %s
 
 --- |
   target triple = "amdgcn-amd-amdhsa"
@@ -17,855 +17,153 @@
 name:            test_software_pipelining
 body:             |
   bb.0:
-    ; GFX942_WITHOUT-LABEL: name: test_software_pipelining
-    ; GFX942_WITHOUT: renamable $vgpr115 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr109 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr110 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr76 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr108 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr52 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr100 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr111 = V_ADD_U32_e32 4096, $vgpr100, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr101 = V_ADD_U32_e32 $vgpr76, killed $vgpr52, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr112 = V_ADD_U32_e32 4096, $vgpr101, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr112, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr111, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr52_vgpr53, $vgpr80_vgpr81, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = DS_READ_B128_gfx9 renamable $vgpr108, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr54_vgpr55, $vgpr82_vgpr83, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr52_vgpr53, $vgpr92_vgpr93, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr54_vgpr55, $vgpr94_vgpr95, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr52_vgpr53, $vgpr80_vgpr81, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = DS_READ_B128_gfx9 renamable $vgpr108, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr54_vgpr55, $vgpr82_vgpr83, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr52_vgpr53, $vgpr92_vgpr93, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr54_vgpr55, $vgpr94_vgpr95, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr68_vgpr69, $vgpr80_vgpr81, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr0 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr76, killed $vgpr0, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr100, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr70_vgpr71, $vgpr82_vgpr83, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr68_vgpr69, $vgpr92_vgpr93, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr70_vgpr71, $vgpr94_vgpr95, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = DS_READ_B128_gfx9 renamable $vgpr108, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr72_vgpr73, $vgpr80_vgpr81, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr101, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr74_vgpr75, $vgpr82_vgpr83, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr72_vgpr73, $vgpr92_vgpr93, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr74_vgpr75, $vgpr94_vgpr95, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr108, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr80_vgpr81, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr72_vgpr73_vgpr74_vgpr75, 16384, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr66_vgpr67, $vgpr82_vgpr83, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr92_vgpr93, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr66_vgpr67, $vgpr94_vgpr95, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = DS_READ_B128_gfx9 renamable $vgpr108, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr80_vgpr81, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr56_vgpr57_vgpr58_vgpr59, 20480, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr82_vgpr83, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr92_vgpr93, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr94_vgpr95, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128_gfx9 renamable $vgpr108, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr48_vgpr49, $vgpr80_vgpr81, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr100, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr50_vgpr51, $vgpr82_vgpr83, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr48_vgpr49, $vgpr92_vgpr93, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr50_vgpr51, $vgpr94_vgpr95, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128_gfx9 renamable $vgpr110, 0, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr80_vgpr81, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr101, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr26_vgpr27, killed $vgpr82_vgpr83, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr92_vgpr93, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr26_vgpr27, killed $vgpr94_vgpr95, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr120 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = DS_READ_B128_gfx9 renamable $vgpr120, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr104_vgpr105_vgpr106_vgpr107 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr104_vgpr105, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr116_vgpr117_vgpr118_vgpr119 = DS_READ_B128_gfx9 renamable $vgpr120, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr22_vgpr23, $vgpr106_vgpr107, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr100_vgpr101, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr22_vgpr23, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr104_vgpr105, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128_gfx9 renamable $vgpr120, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr18_vgpr19, $vgpr106_vgpr107, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr100_vgpr101, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr18_vgpr19, $vgpr102_vgpr103, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr116_vgpr117, $vgpr104_vgpr105, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr16 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr114 = V_ADD_U32_e32 $vgpr115, killed $vgpr16, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr114, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr118_vgpr119, $vgpr106_vgpr107, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr116_vgpr117, $vgpr100_vgpr101, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr118_vgpr119, $vgpr102_vgpr103, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = DS_READ_B128_gfx9 renamable $vgpr120, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr104_vgpr105, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr20 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr113 = V_ADD_U32_e32 $vgpr115, killed $vgpr20, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr113, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr26_vgpr27, $vgpr106_vgpr107, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr100_vgpr101, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr26_vgpr27, $vgpr102_vgpr103, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr116_vgpr117_vgpr118_vgpr119 = DS_READ_B128_gfx9 renamable $vgpr120, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr76_vgpr77, $vgpr104_vgpr105, killed $vgpr96_vgpr97_vgpr98_vgpr99, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr84_vgpr85_vgpr86_vgpr87, 24576, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr78_vgpr79, $vgpr106_vgpr107, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr76_vgpr77, $vgpr100_vgpr101, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr78_vgpr79, $vgpr102_vgpr103, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr120, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr116_vgpr117, $vgpr104_vgpr105, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr84_vgpr85_vgpr86_vgpr87, 28672, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr118_vgpr119, $vgpr106_vgpr107, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr116_vgpr117, $vgpr100_vgpr101, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr118_vgpr119, $vgpr102_vgpr103, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr118_vgpr119_vgpr120_vgpr121 = DS_READ_B128_gfx9 killed renamable $vgpr120, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr104_vgpr105, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr56 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr116 = V_ADD_U32_e32 $vgpr115, killed $vgpr56, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr116, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr106_vgpr107, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr100_vgpr101, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr72 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr115 = V_ADD_U32_e32 killed $vgpr115, killed $vgpr72, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr102_vgpr103, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr115, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: S_WAITCNT 49279
-    ; GFX942_WITHOUT-NEXT: S_BARRIER
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr108, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr118_vgpr119, $vgpr104_vgpr105, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr120_vgpr121, killed $vgpr106_vgpr107, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr118_vgpr119, $vgpr100_vgpr101, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr120_vgpr121, killed $vgpr102_vgpr103, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr108, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_BARRIER 0
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr104_vgpr105_vgpr106_vgpr107 = DS_READ_B128_gfx9 renamable $vgpr108, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr108, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr104_vgpr105, $vgpr0_vgpr1, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr96_vgpr97_vgpr98_vgpr99 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr111, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr2_vgpr3, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr104_vgpr105, $vgpr4_vgpr5, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr106_vgpr107, $vgpr6_vgpr7, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr108, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr104_vgpr105_vgpr106_vgpr107 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr112, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr108, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = DS_READ_B128_gfx9 renamable $vgpr108, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr20_vgpr21_vgpr22_vgpr23, 4096, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = DS_READ_B128_gfx9 renamable $vgpr108, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr68_vgpr69, $vgpr0_vgpr1, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr96_vgpr97_vgpr98_vgpr99 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr114, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr70_vgpr71, $vgpr2_vgpr3, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr68_vgpr69, $vgpr4_vgpr5, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr70_vgpr71, $vgpr6_vgpr7, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = DS_READ_B128_gfx9 killed renamable $vgpr110, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr0_vgpr1, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr96_vgpr97_vgpr98_vgpr99 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr113, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr86_vgpr87, killed $vgpr2_vgpr3, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr4_vgpr5, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr86_vgpr87, killed $vgpr6_vgpr7, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr92 = IMPLICIT_DEF
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = DS_READ_B128_gfx9 renamable $vgpr92, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr72_vgpr73, $vgpr8_vgpr9, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = DS_READ_B128_gfx9 renamable $vgpr92, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr74_vgpr75, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr72_vgpr73, $vgpr12_vgpr13, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr74_vgpr75, $vgpr14_vgpr15, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr80_vgpr81, $vgpr8_vgpr9, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = DS_READ_B128_gfx9 renamable $vgpr92, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr82_vgpr83, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr80_vgpr81, $vgpr12_vgpr13, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr82_vgpr83, $vgpr14_vgpr15, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr8_vgpr9, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr44_vgpr45_vgpr46_vgpr47 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr116, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr86_vgpr87, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr12_vgpr13, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr86_vgpr87, $vgpr14_vgpr15, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128_gfx9 renamable $vgpr92, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr40_vgpr41, $vgpr8_vgpr9, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr115, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr42_vgpr43, $vgpr10_vgpr11, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr40_vgpr41, $vgpr12_vgpr13, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr42_vgpr43, $vgpr14_vgpr15, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128_gfx9 renamable $vgpr92, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr8_vgpr9, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr56_vgpr57_vgpr58_vgpr59, 8192, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr30_vgpr31, $vgpr10_vgpr11, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr12_vgpr13, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr30_vgpr31, $vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = DS_READ_B128_gfx9 renamable $vgpr92, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr8_vgpr9, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 killed renamable $vgpr109, killed renamable $vgpr60_vgpr61_vgpr62_vgpr63, 12288, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr26_vgpr27, $vgpr10_vgpr11, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr12_vgpr13, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr26_vgpr27, $vgpr14_vgpr15, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128_gfx9 killed renamable $vgpr92, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr8_vgpr9, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr18_vgpr19, $vgpr10_vgpr11, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr12_vgpr13, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr18_vgpr19, $vgpr14_vgpr15, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: S_WAITCNT 49279
-    ; GFX942_WITHOUT-NEXT: S_BARRIER
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = DS_READ_B128_gfx9 renamable $vgpr108, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr8_vgpr9, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr22_vgpr23, killed $vgpr10_vgpr11, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr12_vgpr13, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr22_vgpr23, killed $vgpr14_vgpr15, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = DS_READ_B128_gfx9 killed renamable $vgpr108, 0, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITHOUT-NEXT: SCHED_BARRIER 0
-    ; GFX942_WITHOUT-NEXT: S_ENDPGM 0
+    ; CHECK-LABEL: name: test_software_pipelining
+    ; CHECK: dead renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr72 = IMPLICIT_DEF
+    ; CHECK-NEXT: dead renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr68 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr73 = IMPLICIT_DEF
+    ; CHECK-NEXT: dead renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr52 = IMPLICIT_DEF
+    ; CHECK-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; CHECK-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; CHECK-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr74 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr56 = V_ADD_U32_e32 4096, $vgpr74, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr75 = V_ADD_U32_e32 $vgpr68, killed $vgpr52, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr52 = V_ADD_U32_e32 4096, $vgpr75, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr52, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; CHECK-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr56, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; CHECK-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr56_vgpr57, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = DS_READ_B128_gfx9 renamable $vgpr73, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT: dead renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr58_vgpr59, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr52_vgpr53, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: dead renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr54_vgpr55, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr44_vgpr45, $vgpr56_vgpr57, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr73, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT: dead renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr46_vgpr47, $vgpr58_vgpr59, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr44_vgpr45, $vgpr52_vgpr53, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr46_vgpr47, $vgpr54_vgpr55, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr56_vgpr57, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr69 = IMPLICIT_DEF
+    ; CHECK-NEXT: dead renamable $vgpr68 = V_ADD_U32_e32 killed $vgpr68, killed $vgpr69, implicit $exec
+    ; CHECK-NEXT: dead renamable $vgpr68_vgpr69_vgpr70_vgpr71 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr74, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; CHECK-NEXT: dead renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr66_vgpr67, $vgpr58_vgpr59, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr52_vgpr53, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr66_vgpr67, $vgpr54_vgpr55, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = DS_READ_B128_gfx9 renamable $vgpr73, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr56_vgpr57, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: dead renamable $vgpr68_vgpr69_vgpr70_vgpr71 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr75, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; CHECK-NEXT: dead renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr58_vgpr59, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr52_vgpr53, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: dead renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr54_vgpr55, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr73, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr56_vgpr57, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = IMPLICIT_DEF
+    ; CHECK-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr72, killed renamable $vgpr68_vgpr69_vgpr70_vgpr71, 16384, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; CHECK-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr66_vgpr67, $vgpr58_vgpr59, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr52_vgpr53, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: dead renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr66_vgpr67, $vgpr54_vgpr55, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = DS_READ_B128_gfx9 renamable $vgpr73, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr56_vgpr57, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = IMPLICIT_DEF
+    ; CHECK-NEXT: DS_WRITE_B128_gfx9 killed renamable $vgpr72, killed renamable $vgpr48_vgpr49_vgpr50_vgpr51, 20480, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; CHECK-NEXT: dead renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr58_vgpr59, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr52_vgpr53, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: dead renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, killed $vgpr54_vgpr55, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: dead renamable $vgpr44_vgpr45_vgpr46_vgpr47 = DS_READ_B128_gfx9 killed renamable $vgpr73, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr64_vgpr65, killed $vgpr56_vgpr57, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: dead renamable $vgpr40_vgpr41_vgpr42_vgpr43 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr74, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
     ;
-    ; GFX942_WITH-LABEL: name: test_software_pipelining
-    ; GFX942_WITH: renamable $vgpr96 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr121 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr122 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr52 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr120 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr0 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr97 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr123 = V_ADD_U32_e32 4096, $vgpr97, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr102 = V_ADD_U32_e32 $vgpr52, killed $vgpr0, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr124 = V_ADD_U32_e32 4096, $vgpr102, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr124, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr123, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr0_vgpr1, $vgpr80_vgpr81, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = DS_READ_B128_gfx9 renamable $vgpr120, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr2_vgpr3, $vgpr82_vgpr83, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr0_vgpr1, $vgpr92_vgpr93, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr2_vgpr3, $vgpr94_vgpr95, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr0_vgpr1, $vgpr80_vgpr81, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = DS_READ_B128_gfx9 renamable $vgpr120, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr2_vgpr3, $vgpr82_vgpr83, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr0_vgpr1, $vgpr92_vgpr93, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr2_vgpr3, $vgpr94_vgpr95, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr4_vgpr5, $vgpr80_vgpr81, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr0 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: dead renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr52, killed $vgpr0, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr97, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr6_vgpr7, $vgpr82_vgpr83, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr4_vgpr5, $vgpr92_vgpr93, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr6_vgpr7, $vgpr94_vgpr95, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr98_vgpr99_vgpr100_vgpr101 = DS_READ_B128_gfx9 renamable $vgpr120, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr80_vgpr81, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr102, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: renamable $vgpr104_vgpr105_vgpr106_vgpr107 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr90_vgpr91, $vgpr82_vgpr83, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr92_vgpr93, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr90_vgpr91, $vgpr94_vgpr95, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = DS_READ_B128_gfx9 renamable $vgpr120, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr80_vgpr81, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr108_vgpr109_vgpr110_vgpr111 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr108_vgpr109_vgpr110_vgpr111, 16384, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr116_vgpr117_vgpr118_vgpr119 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr82_vgpr83, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr92_vgpr93, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr100_vgpr101, $vgpr94_vgpr95, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr98_vgpr99_vgpr100_vgpr101 = DS_READ_B128_gfx9 renamable $vgpr120, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr80_vgpr81, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr108_vgpr109_vgpr110_vgpr111 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr108_vgpr109_vgpr110_vgpr111, 20480, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr108_vgpr109_vgpr110_vgpr111 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr90_vgpr91, $vgpr82_vgpr83, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr92_vgpr93, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr90_vgpr91, $vgpr94_vgpr95, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr112_vgpr113_vgpr114_vgpr115 = DS_READ_B128_gfx9 renamable $vgpr120, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr80_vgpr81, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr97, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr82_vgpr83, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr92_vgpr93, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr100_vgpr101, $vgpr94_vgpr95, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = DS_READ_B128_gfx9 renamable $vgpr122, 0, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr112_vgpr113, $vgpr80_vgpr81, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr102, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr114_vgpr115, killed $vgpr82_vgpr83, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr112_vgpr113, $vgpr92_vgpr93, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr114_vgpr115, killed $vgpr94_vgpr95, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr97 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128_gfx9 renamable $vgpr97, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr112_vgpr113_vgpr114_vgpr115 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr112_vgpr113, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128_gfx9 renamable $vgpr97, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr18_vgpr19, $vgpr114_vgpr115, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr100_vgpr101, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr18_vgpr19, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr112_vgpr113, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = DS_READ_B128_gfx9 renamable $vgpr97, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr22_vgpr23, $vgpr114_vgpr115, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr100_vgpr101, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr22_vgpr23, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr112_vgpr113, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr16 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr126 = V_ADD_U32_e32 $vgpr96, killed $vgpr16, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr126, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr30_vgpr31, $vgpr114_vgpr115, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr100_vgpr101, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr30_vgpr31, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr97, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr112_vgpr113, killed $vgpr104_vgpr105_vgpr106_vgpr107, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr20 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr125 = V_ADD_U32_e32 $vgpr96, killed $vgpr20, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr125, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr58_vgpr59, $vgpr114_vgpr115, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr100_vgpr101, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr58_vgpr59, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = DS_READ_B128_gfx9 renamable $vgpr97, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr112_vgpr113, killed $vgpr116_vgpr117_vgpr118_vgpr119, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr84_vgpr85_vgpr86_vgpr87, 24576, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr114_vgpr115, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr100_vgpr101, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr102_vgpr103, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr97, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr112_vgpr113, killed $vgpr108_vgpr109_vgpr110_vgpr111, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr104_vgpr105_vgpr106_vgpr107 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr104_vgpr105_vgpr106_vgpr107, 28672, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr58_vgpr59, $vgpr114_vgpr115, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr100_vgpr101, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr58_vgpr59, $vgpr102_vgpr103, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr106_vgpr107_vgpr108_vgpr109 = DS_READ_B128_gfx9 killed renamable $vgpr97, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr112_vgpr113, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr56 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr104 = V_ADD_U32_e32 $vgpr96, killed $vgpr56, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr104, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr114_vgpr115, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr100_vgpr101, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr60 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr127 = V_ADD_U32_e32 killed $vgpr96, killed $vgpr60, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr102_vgpr103, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr127, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: S_WAITCNT 49279
-    ; GFX942_WITH-NEXT: S_BARRIER
-    ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr120, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr112_vgpr113, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr108_vgpr109, killed $vgpr114_vgpr115, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr100_vgpr101, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr108_vgpr109, killed $vgpr102_vgpr103, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr120, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_BARRIER 0
-    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr112_vgpr113_vgpr114_vgpr115 = DS_READ_B128_gfx9 renamable $vgpr120, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr120, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr112_vgpr113, $vgpr0_vgpr1, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr96_vgpr97_vgpr98_vgpr99 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr123, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr114_vgpr115, $vgpr2_vgpr3, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr112_vgpr113, $vgpr4_vgpr5, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr114_vgpr115, $vgpr6_vgpr7, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr120, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr106_vgpr107_vgpr108_vgpr109 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr124, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr120, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr120, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr20_vgpr21_vgpr22_vgpr23, 4096, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr120, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr106_vgpr107_vgpr108_vgpr109 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr126, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 killed renamable $vgpr122, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr106_vgpr107_vgpr108_vgpr109 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr125, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, killed $vgpr2_vgpr3, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, killed $vgpr6_vgpr7, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr105 = IMPLICIT_DEF
-    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr105, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr8_vgpr9, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr106_vgpr107_vgpr108_vgpr109 = DS_READ_B128_gfx9 renamable $vgpr105, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr12_vgpr13, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr14_vgpr15, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr8_vgpr9, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr105, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr12_vgpr13, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr14_vgpr15, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr8_vgpr9, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr84_vgpr85_vgpr86_vgpr87 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr104, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr108_vgpr109, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr12_vgpr13, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr108_vgpr109, $vgpr14_vgpr15, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = DS_READ_B128_gfx9 renamable $vgpr105, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr8_vgpr9, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr88_vgpr89_vgpr90_vgpr91 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr127, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    ; GFX942_WITH-NEXT: dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr10_vgpr11, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr12_vgpr13, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr14_vgpr15, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = DS_READ_B128_gfx9 renamable $vgpr105, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr8_vgpr9, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr56_vgpr57_vgpr58_vgpr59, 8192, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr86_vgpr87, $vgpr10_vgpr11, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr12_vgpr13, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr86_vgpr87, $vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = DS_READ_B128_gfx9 renamable $vgpr105, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr8_vgpr9, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 killed renamable $vgpr121, killed renamable $vgpr60_vgpr61_vgpr62_vgpr63, 12288, 0, implicit $exec :: (store (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr90_vgpr91, $vgpr10_vgpr11, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr12_vgpr13, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr90_vgpr91, $vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = DS_READ_B128_gfx9 killed renamable $vgpr105, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr8_vgpr9, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr58_vgpr59, $vgpr10_vgpr11, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr12_vgpr13, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr58_vgpr59, $vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: S_WAITCNT 49279
-    ; GFX942_WITH-NEXT: S_BARRIER
-    ; GFX942_WITH-NEXT: dead renamable $vgpr44_vgpr45_vgpr46_vgpr47 = DS_READ_B128_gfx9 renamable $vgpr120, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr40_vgpr41, $vgpr8_vgpr9, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr42_vgpr43, killed $vgpr10_vgpr11, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr40_vgpr41, $vgpr12_vgpr13, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr42_vgpr43, killed $vgpr14_vgpr15, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX942_WITH-NEXT: dead renamable $vgpr8_vgpr9_vgpr10_vgpr11 = DS_READ_B128_gfx9 killed renamable $vgpr120, 0, 0, implicit $exec :: (load (s128), addrspace 3)
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
-    ; GFX942_WITH-NEXT: SCHED_BARRIER 0
-    ; GFX942_WITH-NEXT: S_ENDPGM 0
+    ; CHECK-NO-ANTIHINT-LABEL: name: test_software_pipelining
+    ; CHECK-NO-ANTIHINT: dead renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr68 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr0 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr69 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr70 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr52 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr71 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr56 = V_ADD_U32_e32 4096, $vgpr71, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr72 = V_ADD_U32_e32 $vgpr69, killed $vgpr52, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr52 = V_ADD_U32_e32 4096, $vgpr72, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr52, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr56, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr56_vgpr57, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = DS_READ_B128_gfx9 renamable $vgpr70, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr58_vgpr59, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr52_vgpr53, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr54_vgpr55, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr44_vgpr45, $vgpr56_vgpr57, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = DS_READ_B128_gfx9 renamable $vgpr70, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr46_vgpr47, $vgpr58_vgpr59, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr44_vgpr45, $vgpr52_vgpr53, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr46_vgpr47, $vgpr54_vgpr55, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr56_vgpr57, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr36 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr36 = V_ADD_U32_e32 killed $vgpr69, killed $vgpr36, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr71, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr66_vgpr67, $vgpr58_vgpr59, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr52_vgpr53, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr66_vgpr67, $vgpr54_vgpr55, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128_gfx9 renamable $vgpr70, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr48_vgpr49, $vgpr56_vgpr57, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr32_vgpr33_vgpr34_vgpr35 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr72, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr50_vgpr51, $vgpr58_vgpr59, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr48_vgpr49, $vgpr52_vgpr53, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr50_vgpr51, $vgpr54_vgpr55, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128_gfx9 renamable $vgpr70, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr56_vgpr57, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr68, killed renamable $vgpr24_vgpr25_vgpr26_vgpr27, 16384, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr30_vgpr31, $vgpr58_vgpr59, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr52_vgpr53, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr30_vgpr31, $vgpr54_vgpr55, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = DS_READ_B128_gfx9 renamable $vgpr70, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr56_vgpr57, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr14_vgpr15_vgpr16_vgpr17 = IMPLICIT_DEF
+    ; CHECK-NO-ANTIHINT-NEXT: DS_WRITE_B128_gfx9 killed renamable $vgpr68, killed renamable $vgpr14_vgpr15_vgpr16_vgpr17, 20480, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr22_vgpr23, $vgpr58_vgpr59, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr52_vgpr53, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr22_vgpr23, killed $vgpr54_vgpr55, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr4_vgpr5_vgpr6_vgpr7 = DS_READ_B128_gfx9 killed renamable $vgpr70, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr12_vgpr13, killed $vgpr56_vgpr57, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NO-ANTIHINT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr71, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
     %0:vgpr_32 = IMPLICIT_DEF
     %1:vgpr_32 = IMPLICIT_DEF
     %2:vgpr_32 = IMPLICIT_DEF
@@ -890,403 +188,52 @@ body:             |
     %21:vreg_128_align2 = IMPLICIT_DEF
     %22:vreg_128_align2 = IMPLICIT_DEF
     %23:vreg_128_align2 = IMPLICIT_DEF
-    %25:vgpr_32 = IMPLICIT_DEF
-    %24:vgpr_32 = V_ADD_U32_e32 4096, %25:vgpr_32, implicit $exec
-    %27:vgpr_32 = V_ADD_U32_e32 %3:vgpr_32, %7:vgpr_32, implicit $exec
-    %26:vgpr_32 = V_ADD_U32_e32 4096, %27:vgpr_32, implicit $exec
-    %28:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %26:vgpr_32, %6:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %29:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %24:vgpr_32, %6:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %31:vreg_128_align2 = IMPLICIT_DEF
-    %30:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %23:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %32:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
-    %33:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %30:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %34:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %22:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %35:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %34:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %37:vreg_128_align2 = IMPLICIT_DEF
-    %36:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %21:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %38:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
-    %39:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %36:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %40:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %20:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %41:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %40:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %42:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %19:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %24:vgpr_32 = IMPLICIT_DEF
+    %25:vgpr_32 = V_ADD_U32_e32 4096, %24, implicit $exec
+    %26:vgpr_32 = V_ADD_U32_e32 %3, %7, implicit $exec
+    %27:vgpr_32 = V_ADD_U32_e32 4096, %26, implicit $exec
+    %28:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %27, %6, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %29:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %25, %6, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %30:vreg_128_align2 = IMPLICIT_DEF
+    %31:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %30.sub0_sub1, %29.sub0_sub1, %23, 0, 0, 0, implicit $mode, implicit $exec
+    %32:vreg_128_align2 = DS_READ_B128_gfx9 %4, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    %33:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %30.sub2_sub3, %29.sub2_sub3, %31, 0, 0, 0, implicit $mode, implicit $exec
+    %34:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %30.sub0_sub1, %28.sub0_sub1, %22, 0, 0, 0, implicit $mode, implicit $exec
+    %35:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %30.sub2_sub3, %28.sub2_sub3, %34, 0, 0, 0, implicit $mode, implicit $exec
+    %36:vreg_128_align2 = IMPLICIT_DEF
+    %37:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %36.sub0_sub1, %29.sub0_sub1, %21, 0, 0, 0, implicit $mode, implicit $exec
+    %38:vreg_128_align2 = DS_READ_B128_gfx9 %4, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    %39:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %36.sub2_sub3, %29.sub2_sub3, %37, 0, 0, 0, implicit $mode, implicit $exec
+    %40:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %36.sub0_sub1, %28.sub0_sub1, %20, 0, 0, 0, implicit $mode, implicit $exec
+    %41:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %36.sub2_sub3, %28.sub2_sub3, %40, 0, 0, 0, implicit $mode, implicit $exec
+    %42:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub0_sub1, %29.sub0_sub1, %19, 0, 0, 0, implicit $mode, implicit $exec
     %43:vgpr_32 = IMPLICIT_DEF
-    %925:vgpr_32 = V_ADD_U32_e32 %3:vgpr_32, %43:vgpr_32, implicit $exec
-    %44:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %25:vgpr_32, %6:sgpr_128, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %45:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %42:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %46:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %18:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %47:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %46:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %48:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
-    %49:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %17:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %50:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %27:vgpr_32, %6:sgpr_128, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %51:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %49:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %52:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %16:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %53:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %52:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %54:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
-    %55:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %48.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %15:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %56:vreg_128_align2 = IMPLICIT_DEF
-    DS_WRITE_B128_gfx9 %1:vgpr_32, %56:vreg_128_align2, 16384, 0, implicit $exec :: (store (s128), addrspace 3)
-    %57:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %48.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %55:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %58:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %48.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %14:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %59:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %48.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %58:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %60:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
-    %61:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %54.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %13:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %62:vreg_128_align2 = IMPLICIT_DEF
-    DS_WRITE_B128_gfx9 %1:vgpr_32, %62:vreg_128_align2, 20480, 0, implicit $exec :: (store (s128), addrspace 3)
-    %63:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %54.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %61:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %64:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %54.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %12:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %65:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %54.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %64:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %66:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
-    %67:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %60.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %11:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %68:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %25:vgpr_32, %6:sgpr_128, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %69:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %60.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %67:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %70:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %60.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %10:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %71:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %60.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %70:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %72:vreg_128_align2 = DS_READ_B128_gfx9 %2:vgpr_32, 0, 0, implicit $exec :: (load (s128), addrspace 3)
-    %73:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %66.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %9:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %74:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %27:vgpr_32, %6:sgpr_128, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %75:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %66.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %73:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %76:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %66.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %8:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %77:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %66.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %76:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %79:vgpr_32 = IMPLICIT_DEF
-    %78:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
-    %81:vreg_128_align2 = IMPLICIT_DEF
-    %80:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %72.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %33:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %82:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
-    %83:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %72.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %80:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %85:vreg_128_align2 = IMPLICIT_DEF
-    %84:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %72.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %35:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %86:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %72.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %84:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %87:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %78.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %39:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %88:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
-    %89:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %78.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %87:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %90:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %78.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %41:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %91:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %78.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %90:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %92:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %82.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %45:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %94:vgpr_32 = IMPLICIT_DEF
-    %93:vgpr_32 = V_ADD_U32_e32 %0:vgpr_32, %94:vgpr_32, implicit $exec
-    %95:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %93:vgpr_32, %5:sgpr_128, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %96:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %82.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %92:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %97:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %82.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %47:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %98:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %82.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %97:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %99:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
-    %100:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %88.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %51:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %102:vgpr_32 = IMPLICIT_DEF
-    %101:vgpr_32 = V_ADD_U32_e32 %0:vgpr_32, %102:vgpr_32, implicit $exec
-    %103:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %101:vgpr_32, %5:sgpr_128, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %104:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %88.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %100:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %105:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %88.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %53:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %106:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %88.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %105:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %107:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
-    %108:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %99.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %57:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %109:vreg_128_align2 = IMPLICIT_DEF
-    DS_WRITE_B128_gfx9 %1:vgpr_32, %109:vreg_128_align2, 24576, 0, implicit $exec :: (store (s128), addrspace 3)
-    %110:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %99.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %108:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %111:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %99.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %59:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %112:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %99.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %111:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %113:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
-    %114:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %107.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %63:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %115:vreg_128_align2 = IMPLICIT_DEF
-    DS_WRITE_B128_gfx9 %1:vgpr_32, %115:vreg_128_align2, 28672, 0, implicit $exec :: (store (s128), addrspace 3)
-    %116:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %107.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %114:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %117:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %107.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %65:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %118:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %107.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %117:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %119:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
-    %120:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %113.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %69:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %122:vgpr_32 = IMPLICIT_DEF
-    %121:vgpr_32 = V_ADD_U32_e32 %0:vgpr_32, %122:vgpr_32, implicit $exec
-    %123:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %121:vgpr_32, %5:sgpr_128, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %124:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %113.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %120:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %125:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %113.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %71:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %127:vgpr_32 = IMPLICIT_DEF
-    %126:vgpr_32 = V_ADD_U32_e32 %0:vgpr_32, %127:vgpr_32, implicit $exec
-    %128:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %113.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %125:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %129:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %126:vgpr_32, %5:sgpr_128, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    S_WAITCNT 49279
-    S_BARRIER
-    %130:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
-    %131:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %119.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %75:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %132:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %119.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %131:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %133:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %119.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %77:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %134:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %119.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %133:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %135:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 512, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 512, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 512, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 512, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_BARRIER 0
-    %136:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %135.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %83:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %137:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
-    %138:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %135.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %136:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %139:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %135.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %86:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %140:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %135.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %139:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %141:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %130.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %89:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %142:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
-    %143:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %130.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %141:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %144:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %130.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %91:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %145:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %130.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %144:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %146:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %137.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %96:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %147:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %137.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %146:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %148:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %137.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %98:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %149:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %137.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %148:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %150:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
-    %151:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %142.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %104:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %152:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %142.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %151:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %153:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %142.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %106:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %154:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %142.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %153:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %155:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
-    %156:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %150.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %110:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    DS_WRITE_B128_gfx9 %1:vgpr_32, %95:vreg_128_align2, 0, 0, implicit $exec :: (store (s128), addrspace 3)
-    %157:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %150.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %156:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %158:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %150.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %112:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %159:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %150.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %158:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %160:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
-    %161:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %155.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %116:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    DS_WRITE_B128_gfx9 %1:vgpr_32, %103:vreg_128_align2, 4096, 0, implicit $exec :: (store (s128), addrspace 3)
-    %162:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %155.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %161:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %163:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %155.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %118:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %164:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %155.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %163:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %165:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
-    %166:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %160.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %124:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %981:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %24:vgpr_32, %6:sgpr_128, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %167:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %160.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %166:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %168:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %160.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %128:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %169:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %160.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %168:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %170:vreg_128_align2 = DS_READ_B128_gfx9 %2:vgpr_32, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
-    %171:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %165.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %132:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %985:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %26:vgpr_32, %6:sgpr_128, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %172:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %165.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %171:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %173:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %165.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %134:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %174:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %165.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %173:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %176:vgpr_32 = IMPLICIT_DEF
-    %175:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
-    %177:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %170.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %138:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %178:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
-    %179:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %170.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %177:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %180:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %170.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %140:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %962:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %170.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %180:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %182:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %175.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %143:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %183:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
-    %961:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %175.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %182:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %185:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %175.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %145:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %960:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %175.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %185:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %187:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %178.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %147:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %956:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %93:vgpr_32, %5:sgpr_128, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %959:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %178.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %187:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %189:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %178.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %149:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %958:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %178.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %189:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %191:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
-    %192:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %183.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %152:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %962:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %101:vgpr_32, %5:sgpr_128, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %957:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %183.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %192:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %194:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %183.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %154:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %956:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %183.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %194:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %196:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
-    %197:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %191.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %157:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    DS_WRITE_B128_gfx9 %1:vgpr_32, %123:vreg_128_align2, 8192, 0, implicit $exec :: (store (s128), addrspace 3)
-    %955:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %191.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %197:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %199:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %191.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %159:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %954:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %191.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %199:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %201:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
-    %202:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %196.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %162:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    DS_WRITE_B128_gfx9 %1:vgpr_32, %129:vreg_128_align2, 12288, 0, implicit $exec :: (store (s128), addrspace 3)
-    %953:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %196.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %202:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %204:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %196.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %164:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %952:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %196.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %204:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %206:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
-    %207:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %201.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %167:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %910:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %121:vgpr_32, %5:sgpr_128, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    %951:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %201.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %207:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %209:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %201.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %169:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %950:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %201.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %209:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %911:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %126:vgpr_32, %5:sgpr_128, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
-    S_WAITCNT 49279
-    S_BARRIER
-    %937:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
-    %211:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %206.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %172:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %949:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %206.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %211:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %213:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %206.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %174:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %948:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %206.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %213:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
-    %931:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 0, 0, implicit $exec :: (load (s128), addrspace 3)
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 512, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 512, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 512, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 512, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 32, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 8, 1, 0
-    SCHED_GROUP_BARRIER 256, 1, 0
-    SCHED_BARRIER 0
-    S_ENDPGM 0
+    %44:vgpr_32 = V_ADD_U32_e32 %3, %43, implicit $exec
+    %45:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %24, %6, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %46:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub2_sub3, %29.sub2_sub3, %42, 0, 0, 0, implicit $mode, implicit $exec
+    %47:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub0_sub1, %28.sub0_sub1, %18, 0, 0, 0, implicit $mode, implicit $exec
+    %48:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub2_sub3, %28.sub2_sub3, %47, 0, 0, 0, implicit $mode, implicit $exec
+    %49:vreg_128_align2 = DS_READ_B128_gfx9 %4, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
+    %50:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub0_sub1, %29.sub0_sub1, %17, 0, 0, 0, implicit $mode, implicit $exec
+    %51:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %26, %6, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %52:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub2_sub3, %29.sub2_sub3, %50, 0, 0, 0, implicit $mode, implicit $exec
+    %53:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub0_sub1, %28.sub0_sub1, %16, 0, 0, 0, implicit $mode, implicit $exec
+    %54:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub2_sub3, %28.sub2_sub3, %53, 0, 0, 0, implicit $mode, implicit $exec
+    %55:vreg_128_align2 = DS_READ_B128_gfx9 %4, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
+    %56:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %49.sub0_sub1, %29.sub0_sub1, %15, 0, 0, 0, implicit $mode, implicit $exec
+    %57:vreg_128_align2 = IMPLICIT_DEF
+    DS_WRITE_B128_gfx9 %1, %57, 16384, 0, implicit $exec :: (store (s128), addrspace 3)
+    %58:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %49.sub2_sub3, %29.sub2_sub3, %56, 0, 0, 0, implicit $mode, implicit $exec
+    %59:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %49.sub0_sub1, %28.sub0_sub1, %14, 0, 0, 0, implicit $mode, implicit $exec
+    %60:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %49.sub2_sub3, %28.sub2_sub3, %59, 0, 0, 0, implicit $mode, implicit $exec
+    %61:vreg_128_align2 = DS_READ_B128_gfx9 %4, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
+    %62:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %55.sub0_sub1, %29.sub0_sub1, %13, 0, 0, 0, implicit $mode, implicit $exec
+    %63:vreg_128_align2 = IMPLICIT_DEF
+    DS_WRITE_B128_gfx9 %1, %63, 20480, 0, implicit $exec :: (store (s128), addrspace 3)
+    %64:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %55.sub2_sub3, %29.sub2_sub3, %62, 0, 0, 0, implicit $mode, implicit $exec
+    %65:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %55.sub0_sub1, %28.sub0_sub1, %12, 0, 0, 0, implicit $mode, implicit $exec
+    %66:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %55.sub2_sub3, %28.sub2_sub3, %65, 0, 0, 0, implicit $mode, implicit $exec
+    %67:vreg_128_align2 = DS_READ_B128_gfx9 %4, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
+    %68:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %61.sub0_sub1, %29.sub0_sub1, %11, 0, 0, 0, implicit $mode, implicit $exec
+    %69:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %24, %6, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
 ...

>From faafc4f98291cf2e24b6aefa64e7b0dc07f0e166 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Sun, 21 Sep 2025 02:33:00 -0400
Subject: [PATCH 10/18] Renamed test file

---
 ....barrier.gfx942.mir => llvm.amdgcn.mfma.anti-hints.gfx942.mir} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename llvm/test/CodeGen/AMDGPU/{llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir => llvm.amdgcn.mfma.anti-hints.gfx942.mir} (100%)

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir
similarity index 100%
rename from llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir
rename to llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir

>From 2c999936d7f461a4a1523e3b0c95d75e4c7bfda5 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Mon, 22 Sep 2025 14:42:41 -0400
Subject: [PATCH 11/18] Added print and parse tests

---
 ...vm.amdgcn.mfma.anti-hints-parse.gfx942.mir | 195 ++++++++++++++++++
 ...vm.amdgcn.mfma.anti-hints-print.gfx942.mir | 126 +++++++++++
 .../llvm.amdgcn.mfma.anti-hints.gfx942.mir    |   4 +-
 3 files changed, 323 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir
new file mode 100644
index 0000000000000..905fff8b642cc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir
@@ -0,0 +1,195 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -debug -run-pass=greedy,machineverifier,virtregrewriter %s -o - | FileCheck -check-prefix=CHECK %s
+--- |
+  ; ModuleID = '/work/mdssefat/FullTimeWork/MLSCHED/composable_kernel/noopexample/llvm.amdgcn.mfma.hint.haard.barrier.gfx942_short.mir'
+  source_filename = "/work/mdssefat/FullTimeWork/MLSCHED/composable_kernel/noopexample/llvm.amdgcn.mfma.hint.haard.barrier.gfx942_short.mir"
+  target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+  target triple = "amdgcn-amd-amdhsa"
+
+  ; Function Attrs: nounwind
+  define amdgpu_kernel void @test_software_pipelining() #0 {
+  bb.0:
+    ret void
+  }
+
+  attributes #0 = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-waves-per-eu"="2" "frame-pointer"="none" "target-cpu"="gfx942" }
+...
+---
+name:            test_software_pipelining
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
+                                                                                '%27',
+                                                                                '%4',
+                                                                                '%26',
+                                                                                '%25',
+                                                                                '%5',
+                                                                                '%24',
+                                                                                '%22',
+                                                                                '%6',
+                                                                                '%20',
+                                                                                '%19',
+                                                                                '%7',
+                                                                                '%18',
+                                                                                '%16',
+                                                                                '%8' ] }
+  - { id: 1, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
+                                                                                '%16',
+                                                                                '%8',
+                                                                                '%22',
+                                                                                '%6',
+                                                                                '%20',
+                                                                                '%19',
+                                                                                '%7',
+                                                                                '%18' ] }
+  - { id: 2, class: sgpr_128, preferred-register: '', flags: [  ] }
+  - { id: 3, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 4, class: vreg_128_align2, preferred-register: '', flags: [  ],
+      anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] }
+  - { id: 5, class: vreg_128_align2, preferred-register: '', flags: [  ],
+      anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] }
+  - { id: 6, class: vreg_128_align2, preferred-register: '', flags: [  ],
+      anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] }
+  - { id: 7, class: vreg_128_align2, preferred-register: '', flags: [  ],
+      anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] }
+  - { id: 8, class: vreg_128_align2, preferred-register: '', flags: [  ],
+      anti-hints: [ '%17', '%1', '%23', '%29', '%0', '%28', '%30', '%9' ] }
+  - { id: 9, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
+                                                                                '%27',
+                                                                                '%4',
+                                                                                '%26',
+                                                                                '%25',
+                                                                                '%5',
+                                                                                '%24',
+                                                                                '%22',
+                                                                                '%6',
+                                                                                '%20',
+                                                                                '%19',
+                                                                                '%7',
+                                                                                '%18',
+                                                                                '%16',
+                                                                                '%8' ] }
+  - { id: 10, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 11, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 12, class: vgpr_32, preferred-register: '', flags: [  ] }
+  - { id: 13, class: vreg_128_align2, preferred-register: '', flags: [  ] }
+  - { id: 14, class: vreg_128_align2, preferred-register: '', flags: [  ] }
+  - { id: 15, class: vreg_128_align2, preferred-register: '', flags: [  ] }
+  - { id: 16, class: vreg_128_align2, preferred-register: '', flags: [  ],
+      anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] }
+  - { id: 17, class: vreg_128_align2, preferred-register: '', flags: [  ],
+      anti-hints: [ '%16', '%8' ] }
+  - { id: 18, class: vreg_128_align2, preferred-register: '', flags: [  ],
+      anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] }
+  - { id: 19, class: vreg_128_align2, preferred-register: '', flags: [  ],
+      anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] }
+  - { id: 20, class: vreg_128_align2, preferred-register: '', flags: [  ] }
+  - { id: 21, class: vreg_128_align2, preferred-register: '', flags: [  ] }
+  - { id: 22, class: vreg_128_align2, preferred-register: '', flags: [  ],
+      anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] }
+  - { id: 23, class: vreg_128_align2, preferred-register: '', flags: [  ],
+      anti-hints: [ '%22', '%6', '%20', '%19', '%7', '%18', '%16', '%8' ] }
+  - { id: 24, class: vreg_128_align2, preferred-register: '', flags: [  ] }
+  - { id: 25, class: vreg_128_align2, preferred-register: '', flags: [  ],
+      anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] }
+  - { id: 26, class: vreg_128_align2, preferred-register: '', flags: [  ] }
+  - { id: 27, class: vreg_128_align2, preferred-register: '', flags: [  ] }
+  - { id: 28, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
+                                                                                 '%27',
+                                                                                 '%4',
+                                                                                 '%26',
+                                                                                 '%25',
+                                                                                 '%5',
+                                                                                 '%24',
+                                                                                 '%22',
+                                                                                 '%6',
+                                                                                 '%20',
+                                                                                 '%19',
+                                                                                 '%7',
+                                                                                 '%18',
+                                                                                 '%16',
+                                                                                 '%8' ] }
+  - { id: 29, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
+                                                                                 '%27',
+                                                                                 '%4',
+                                                                                 '%26',
+                                                                                 '%25',
+                                                                                 '%5',
+                                                                                 '%24',
+                                                                                 '%22',
+                                                                                 '%6',
+                                                                                 '%20',
+                                                                                 '%19',
+                                                                                 '%7',
+                                                                                 '%18',
+                                                                                 '%16',
+                                                                                 '%8' ] }
+  - { id: 30, class: vreg_128_align2, preferred-register: '', flags: [  ],
+      anti-hints: [ '%27', '%4', '%26', '%25', '%5', '%24', '%22', '%6',
+                    '%20', '%19', '%7', '%18', '%16', '%8' ] }
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: test_software_pipelining
+    ; CHECK: renamable $vgpr36 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr37 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr20 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr38 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr24 = V_ADD_U32_e32 4096, $vgpr38, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr20 = V_ADD_U32_e32 $vgpr36, killed $vgpr20, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr20 = V_ADD_U32_e32 4096, killed $vgpr20, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr20, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; CHECK-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr24, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; CHECK-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr24_vgpr25, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = DS_READ_B128_gfx9 renamable $vgpr37, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr30_vgpr31, $vgpr26_vgpr27, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr20_vgpr21, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr30_vgpr31, $vgpr22_vgpr23, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF
+    ; CHECK-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr24_vgpr25, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128_gfx9 killed renamable $vgpr37, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr18_vgpr19, $vgpr26_vgpr27, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr20_vgpr21, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr18_vgpr19, killed $vgpr22_vgpr23, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr32_vgpr33, killed $vgpr24_vgpr25, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr20 = IMPLICIT_DEF
+    ; CHECK-NEXT: dead renamable $vgpr20 = V_ADD_U32_e32 killed $vgpr36, killed $vgpr20, implicit $exec
+    ; CHECK-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr38, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; CHECK-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr28_vgpr29_vgpr30_vgpr31, implicit killed renamable $vgpr8_vgpr9_vgpr10_vgpr11, implicit killed renamable $vgpr12_vgpr13_vgpr14_vgpr15, implicit killed renamable $vgpr4_vgpr5_vgpr6_vgpr7, implicit killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit killed renamable $vgpr20_vgpr21_vgpr22_vgpr23
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:sgpr_128 = IMPLICIT_DEF
+    %3:vgpr_32 = IMPLICIT_DEF
+    %4:vreg_128_align2 = IMPLICIT_DEF
+    %5:vreg_128_align2 = IMPLICIT_DEF
+    %6:vreg_128_align2 = IMPLICIT_DEF
+    %7:vreg_128_align2 = IMPLICIT_DEF
+    %8:vreg_128_align2 = IMPLICIT_DEF
+    %9:vgpr_32 = IMPLICIT_DEF
+    %10:vgpr_32 = V_ADD_U32_e32 4096, %9, implicit $exec
+    %11:vgpr_32 = V_ADD_U32_e32 %0, %3, implicit $exec
+    %12:vgpr_32 = V_ADD_U32_e32 4096, %11, implicit $exec
+    %13:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %12, %2, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %14:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %10, %2, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %15:vreg_128_align2 = IMPLICIT_DEF
+    %16:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %15.sub0_sub1, %14.sub0_sub1, %8, 0, 0, 0, implicit $mode, implicit $exec
+    %17:vreg_128_align2 = DS_READ_B128_gfx9 %1, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    dead %18:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %15.sub2_sub3, %14.sub2_sub3, %16, 0, 0, 0, implicit $mode, implicit $exec
+    %19:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %15.sub0_sub1, %13.sub0_sub1, %7, 0, 0, 0, implicit $mode, implicit $exec
+    %20:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %15.sub2_sub3, %13.sub2_sub3, %19, 0, 0, 0, implicit $mode, implicit $exec
+    %21:vreg_128_align2 = IMPLICIT_DEF
+    %22:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %21.sub0_sub1, %14.sub0_sub1, %6, 0, 0, 0, implicit $mode, implicit $exec
+    %23:vreg_128_align2 = DS_READ_B128_gfx9 %1, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    %24:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %21.sub2_sub3, %14.sub2_sub3, %22, 0, 0, 0, implicit $mode, implicit $exec
+    %25:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %21.sub0_sub1, %13.sub0_sub1, %5, 0, 0, 0, implicit $mode, implicit $exec
+    %26:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %21.sub2_sub3, %13.sub2_sub3, %25, 0, 0, 0, implicit $mode, implicit $exec
+    %27:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %17.sub0_sub1, %14.sub0_sub1, %4, 0, 0, 0, implicit $mode, implicit $exec
+    %28:vgpr_32 = IMPLICIT_DEF
+    dead %29:vgpr_32 = V_ADD_U32_e32 %0, %28, implicit $exec
+    %30:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %9, %2, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    S_ENDPGM 0, implicit %23, implicit %24, implicit %20, implicit %26, implicit %27, implicit %30
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir
new file mode 100644
index 0000000000000..d55dbb4ea0e5f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir
@@ -0,0 +1,126 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations %s -o - | FileCheck -check-prefix=CHECK %s
+
+--- |
+  target triple = "amdgcn-amd-amdhsa"
+
+  define amdgpu_kernel void @test_software_pipelining() #0 {
+    bb.0:
+      ret void
+  }
+
+  attributes #0 = {nounwind "amdgpu-waves-per-eu"="2"  "amdgpu-agpr-alloc"="0" "frame-pointer"="none"}
+
+...
+---
+name:            test_software_pipelining
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: test_software_pipelining
+    ; CHECK: registers:
+    ; CHECK:  - { id: 0, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
+    ; CHECK-NEXT:{{\s*}}'%27',
+    ; CHECK:{{\s*}}'%8' ] }
+    ; CHECK:  - { id: 1, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
+    ; CHECK-NEXT:{{\s*}}'%16',
+    ; CHECK:{{\s*}}'%18' ] }
+    ; CHECK:  - { id: 4, class: vreg_128_align2, preferred-register: '', flags: [  ],
+    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] }
+    ; CHECK:  - { id: 5, class: vreg_128_align2, preferred-register:  '', flags: [  ],
+    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] }
+    ; CHECK:  - { id: 6, class: vreg_128_align2, preferred-register:  '', flags: [  ],
+    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] }
+    ; CHECK:  - { id: 7, class: vreg_128_align2, preferred-register:  '', flags: [  ],
+    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] }
+    ; CHECK:  - { id: 8, class: vreg_128_align2, preferred-register:  '', flags: [  ],
+    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%17'{{.*}}'%9' ] }
+    ; CHECK:  - { id: 9, class: vgpr_32, preferred-register:  '', flags: [  ], anti-hints: [
+    ; CHECK-NEXT:{{\s*}}'%27',
+    ; CHECK:{{\s*}}'%8' ] }
+    ; CHECK:  - { id: 16, class: vreg_128_align2, preferred-register:  '', flags: [  ],
+    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] }
+    ; CHECK:  - { id: 17, class: vreg_128_align2, preferred-register:  '', flags: [  ],
+    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%16'{{.*}}'%8' ] }
+    ; CHECK:  - { id: 18, class: vreg_128_align2, preferred-register:  '', flags: [  ],
+    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] }
+    ; CHECK:  - { id: 19, class: vreg_128_align2, preferred-register:  '', flags: [  ],
+    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] }
+    ; CHECK:  - { id: 22, class: vreg_128_align2, preferred-register:  '', flags: [  ],
+    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] }
+    ; CHECK:  - { id: 23, class: vreg_128_align2, preferred-register:  '', flags: [  ],
+    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%22'{{.*}}'%8' ] }
+    ; CHECK:  - { id: 25, class: vreg_128_align2, preferred-register:  '', flags: [  ],
+    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] }
+    ; CHECK:  - { id: 28, class: vgpr_32, preferred-register:  '', flags: [  ], anti-hints: [
+    ; CHECK-NEXT:{{\s*}}'%27',
+    ; CHECK:{{\s*}}'%8' ] }
+    ; CHECK:  - { id: 29, class: vgpr_32, preferred-register:  '', flags: [ ], anti-hints: [
+    ; CHECK-NEXT:{{\s*}}'%27',
+    ; CHECK:{{\s*}}'%8' ] }
+    ; CHECK:  - { id: 30, class: vreg_128_align2
+    ; CHECK-NEXT: {{.*}}anti-hints: [ '%27'
+    ; CHECK: {{.*}}'%8' ] }
+    ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF4:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF5:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF6:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF7:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF8:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DEF9:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 4096, [[DEF9]], implicit $exec
+    ; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF]], [[DEF3]], implicit $exec
+    ; CHECK-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 4096, [[V_ADD_U32_e32_1]], implicit $exec
+    ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_U32_e32_2]], [[DEF2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[V_ADD_U32_e32_]], [[DEF2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; CHECK-NEXT: [[DEF10:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF10]].sub0_sub1, [[BUFFER_LOAD_DWORDX4_OFFEN1]].sub0_sub1, [[DEF8]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF1]], 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT: dead [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_1:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF10]].sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_2:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF10]].sub0_sub1, [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0_sub1, [[DEF7]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_3:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF10]].sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_2]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[DEF11:%[0-9]+]]:vreg_128_align2 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_4:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF11]].sub0_sub1, [[BUFFER_LOAD_DWORDX4_OFFEN1]].sub0_sub1, [[DEF6]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[DS_READ_B128_gfx9_1:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF1]], 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_5:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF11]].sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN1]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_4]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_6:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF11]].sub0_sub1, [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0_sub1, [[DEF5]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_7:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DEF11]].sub2_sub3, [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2_sub3, [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_6]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_8:%[0-9]+]]:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 [[DS_READ_B128_gfx9_]].sub0_sub1, [[BUFFER_LOAD_DWORDX4_OFFEN1]].sub0_sub1, [[DEF4]], 0, 0, 0, implicit $mode, implicit $exec
+    ; CHECK-NEXT: [[DEF12:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: dead [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF]], [[DEF12]], implicit $exec
+    ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN [[DEF9]], [[DEF2]], 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; CHECK-NEXT: S_ENDPGM 0, implicit [[DS_READ_B128_gfx9_1]], implicit [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_5]], implicit [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_3]], implicit [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_7]], implicit [[V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64_8]], implicit [[BUFFER_LOAD_DWORDX4_OFFEN2]]
+    %3:vgpr_32 = IMPLICIT_DEF
+    %4:vgpr_32 = IMPLICIT_DEF
+    %6:sgpr_128 = IMPLICIT_DEF
+    %7:vgpr_32 = IMPLICIT_DEF
+    %19:vreg_128_align2 = IMPLICIT_DEF
+    %20:vreg_128_align2 = IMPLICIT_DEF
+    %21:vreg_128_align2 = IMPLICIT_DEF
+    %22:vreg_128_align2 = IMPLICIT_DEF
+    %23:vreg_128_align2 = IMPLICIT_DEF
+    %25:vgpr_32 = IMPLICIT_DEF
+    %24:vgpr_32 = V_ADD_U32_e32 4096, %25:vgpr_32, implicit $exec
+    %27:vgpr_32 = V_ADD_U32_e32 %3:vgpr_32, %7:vgpr_32, implicit $exec
+    %26:vgpr_32 = V_ADD_U32_e32 4096, %27:vgpr_32, implicit $exec
+    %28:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %26:vgpr_32, %6:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %29:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %24:vgpr_32, %6:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %31:vreg_128_align2 = IMPLICIT_DEF
+    %30:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %23:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %32:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    %33:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %30:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %34:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %22:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %35:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %34:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %37:vreg_128_align2 = IMPLICIT_DEF
+    %36:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %21:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %38:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    %39:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %36:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %40:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %20:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %41:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %40:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %42:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %19:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %43:vgpr_32 = IMPLICIT_DEF
+    %925:vgpr_32 = V_ADD_U32_e32 %3:vgpr_32, %43:vgpr_32, implicit $exec
+    %44:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %25:vgpr_32, %6:sgpr_128, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    S_ENDPGM 0, implicit %38, implicit %39, implicit %35, implicit %41, implicit %42, implicit %44
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir
index 97305f2c8a8f0..d360eccaeb773 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter %s -o - | FileCheck -check-prefix=CHECK %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter -amdgpu-avoid-hazard-hint-for-mfma=false - %s -o - | FileCheck -check-prefix=CHECK-NO-ANTIHINT %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter %s -o - | FileCheck %s --check-prefix=CHECK
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter -amdgpu-avoid-hazard-hint-for-mfma=false %s -o - | FileCheck %s --check-prefix=CHECK-NO-ANTIHINT
 
 --- |
   target triple = "amdgcn-amd-amdhsa"

>From 29d01010280e0d2400cecf0b1966d46fde6941bb Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Mon, 22 Sep 2025 15:34:06 -0400
Subject: [PATCH 12/18] Fixed typo

---
 llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index f63eea716d68b..e0eecb06e2d32 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -323,7 +323,7 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
               // Check if MFMA register is dead at current instruction
               const LiveInterval &MFMAInterval = LIS->getInterval(MFMAReg);
               if (!MFMAInterval.liveAt(CurrentSlot)) {
-                // Add bidirectional antihints
+                // Add bi-directional anti-hints
                 MRI->addRegAllocationAntiHints(CandidateReg, MFMARegs);
                 MRI->addRegAllocationAntiHints(MFMAReg, CandidateReg);
               }

>From b41264ea86d0da0b917adac0c156264fe510d7e8 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Mon, 22 Sep 2025 18:02:20 -0400
Subject: [PATCH 13/18] Fixed typo

---
 llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index e0eecb06e2d32..098ca1120c85c 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -324,7 +324,7 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
               const LiveInterval &MFMAInterval = LIS->getInterval(MFMAReg);
               if (!MFMAInterval.liveAt(CurrentSlot)) {
                 // Add bi-directional anti-hints
-                MRI->addRegAllocationAntiHints(CandidateReg, MFMARegs);
+                MRI->addRegAllocationAntiHints(CandidateReg, MFMAReg);
                 MRI->addRegAllocationAntiHints(MFMAReg, CandidateReg);
               }
             }

>From 032800bdd6aef2a729ab3376006dcd69c328403b Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Mon, 22 Sep 2025 18:28:30 -0400
Subject: [PATCH 14/18] Fixed test!

---
 ...lvm.amdgcn.mfma.anti-hints-print.gfx942.mir | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir
index d55dbb4ea0e5f..c6de026d447fd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-print.gfx942.mir
@@ -18,11 +18,11 @@ body:             |
     ; CHECK-LABEL: name: test_software_pipelining
     ; CHECK: registers:
     ; CHECK:  - { id: 0, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
-    ; CHECK-NEXT:{{\s*}}'%27',
+    ; CHECK-NEXT:{{\s*}}'%4',
     ; CHECK:{{\s*}}'%8' ] }
     ; CHECK:  - { id: 1, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
-    ; CHECK-NEXT:{{\s*}}'%16',
-    ; CHECK:{{\s*}}'%18' ] }
+    ; CHECK-NEXT:{{\s*}}'%8',
+    ; CHECK:{{\s*}}'%16' ] }
     ; CHECK:  - { id: 4, class: vreg_128_align2, preferred-register: '', flags: [  ],
     ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] }
     ; CHECK:  - { id: 5, class: vreg_128_align2, preferred-register:  '', flags: [  ],
@@ -34,12 +34,12 @@ body:             |
     ; CHECK:  - { id: 8, class: vreg_128_align2, preferred-register:  '', flags: [  ],
     ; CHECK-NEXT:{{\s*}}anti-hints: [ '%17'{{.*}}'%9' ] }
     ; CHECK:  - { id: 9, class: vgpr_32, preferred-register:  '', flags: [  ], anti-hints: [
-    ; CHECK-NEXT:{{\s*}}'%27',
+    ; CHECK-NEXT:{{\s*}}'%4',
     ; CHECK:{{\s*}}'%8' ] }
     ; CHECK:  - { id: 16, class: vreg_128_align2, preferred-register:  '', flags: [  ],
     ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] }
     ; CHECK:  - { id: 17, class: vreg_128_align2, preferred-register:  '', flags: [  ],
-    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%16'{{.*}}'%8' ] }
+    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%8' ] }
     ; CHECK:  - { id: 18, class: vreg_128_align2, preferred-register:  '', flags: [  ],
     ; CHECK-NEXT:{{\s*}}anti-hints: [ '%23'{{.*}}'%9' ] }
     ; CHECK:  - { id: 19, class: vreg_128_align2, preferred-register:  '', flags: [  ],
@@ -47,17 +47,17 @@ body:             |
     ; CHECK:  - { id: 22, class: vreg_128_align2, preferred-register:  '', flags: [  ],
     ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] }
     ; CHECK:  - { id: 23, class: vreg_128_align2, preferred-register:  '', flags: [  ],
-    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%22'{{.*}}'%8' ] }
+    ; CHECK-NEXT:{{\s*}}anti-hints: [ '%6'{{.*}}'%8' ] }
     ; CHECK:  - { id: 25, class: vreg_128_align2, preferred-register:  '', flags: [  ],
     ; CHECK-NEXT:{{\s*}}anti-hints: [ '%29'{{.*}}'%9' ] }
     ; CHECK:  - { id: 28, class: vgpr_32, preferred-register:  '', flags: [  ], anti-hints: [
-    ; CHECK-NEXT:{{\s*}}'%27',
+    ; CHECK-NEXT:{{\s*}}'%4',
     ; CHECK:{{\s*}}'%8' ] }
     ; CHECK:  - { id: 29, class: vgpr_32, preferred-register:  '', flags: [ ], anti-hints: [
-    ; CHECK-NEXT:{{\s*}}'%27',
+    ; CHECK-NEXT:{{\s*}}'%4',
     ; CHECK:{{\s*}}'%8' ] }
     ; CHECK:  - { id: 30, class: vreg_128_align2
-    ; CHECK-NEXT: {{.*}}anti-hints: [ '%27'
+    ; CHECK-NEXT: {{.*}}anti-hints: [ '%4'
     ; CHECK: {{.*}}'%8' ] }
     ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF

>From ee47ea7938324c40f7919c8d2cfc0e340aa83440 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Mon, 22 Sep 2025 18:33:12 -0400
Subject: [PATCH 15/18] Fixed test

---
 ...vm.amdgcn.mfma.anti-hints-parse.gfx942.mir | 146 ++++++++----------
 1 file changed, 64 insertions(+), 82 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir
index 905fff8b642cc..89ac0978a0f72 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints-parse.gfx942.mir
@@ -16,56 +16,46 @@
 ---
 name:            test_software_pipelining
 registers:
-  - { id: 0, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
-                                                                                '%27',
-                                                                                '%4',
-                                                                                '%26',
-                                                                                '%25',
-                                                                                '%5',
-                                                                                '%24',
-                                                                                '%22',
-                                                                                '%6',
-                                                                                '%20',
-                                                                                '%19',
-                                                                                '%7',
-                                                                                '%18',
-                                                                                '%16',
+  - { id: 0, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [ 
+                                                                                '%4', 
+                                                                                '%25', 
+                                                                                '%5', 
+                                                                                '%22', 
+                                                                                '%6', 
+                                                                                '%19', 
+                                                                                '%7', 
+                                                                                '%18', 
+                                                                                '%16', 
                                                                                 '%8' ] }
-  - { id: 1, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
-                                                                                '%16',
-                                                                                '%8',
-                                                                                '%22',
-                                                                                '%6',
-                                                                                '%20',
-                                                                                '%19',
-                                                                                '%7',
-                                                                                '%18' ] }
+  - { id: 1, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [ 
+                                                                                '%8', 
+                                                                                '%6', 
+                                                                                '%19', 
+                                                                                '%7', 
+                                                                                '%18', 
+                                                                                '%16' ] }
   - { id: 2, class: sgpr_128, preferred-register: '', flags: [  ] }
   - { id: 3, class: vgpr_32, preferred-register: '', flags: [  ] }
-  - { id: 4, class: vreg_128_align2, preferred-register: '', flags: [  ],
+  - { id: 4, class: vreg_128_align2, preferred-register: '', flags: [  ], 
       anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] }
-  - { id: 5, class: vreg_128_align2, preferred-register: '', flags: [  ],
+  - { id: 5, class: vreg_128_align2, preferred-register: '', flags: [  ], 
       anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] }
-  - { id: 6, class: vreg_128_align2, preferred-register: '', flags: [  ],
+  - { id: 6, class: vreg_128_align2, preferred-register: '', flags: [  ], 
       anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] }
-  - { id: 7, class: vreg_128_align2, preferred-register: '', flags: [  ],
+  - { id: 7, class: vreg_128_align2, preferred-register: '', flags: [  ], 
       anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] }
-  - { id: 8, class: vreg_128_align2, preferred-register: '', flags: [  ],
+  - { id: 8, class: vreg_128_align2, preferred-register: '', flags: [  ], 
       anti-hints: [ '%17', '%1', '%23', '%29', '%0', '%28', '%30', '%9' ] }
-  - { id: 9, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
-                                                                                '%27',
-                                                                                '%4',
-                                                                                '%26',
-                                                                                '%25',
-                                                                                '%5',
-                                                                                '%24',
-                                                                                '%22',
-                                                                                '%6',
-                                                                                '%20',
-                                                                                '%19',
-                                                                                '%7',
-                                                                                '%18',
-                                                                                '%16',
+  - { id: 9, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [ 
+                                                                                '%4', 
+                                                                                '%25', 
+                                                                                '%5', 
+                                                                                '%22', 
+                                                                                '%6', 
+                                                                                '%19', 
+                                                                                '%7', 
+                                                                                '%18', 
+                                                                                '%16', 
                                                                                 '%8' ] }
   - { id: 10, class: vgpr_32, preferred-register: '', flags: [  ] }
   - { id: 11, class: vgpr_32, preferred-register: '', flags: [  ] }
@@ -73,58 +63,50 @@ registers:
   - { id: 13, class: vreg_128_align2, preferred-register: '', flags: [  ] }
   - { id: 14, class: vreg_128_align2, preferred-register: '', flags: [  ] }
   - { id: 15, class: vreg_128_align2, preferred-register: '', flags: [  ] }
-  - { id: 16, class: vreg_128_align2, preferred-register: '', flags: [  ],
+  - { id: 16, class: vreg_128_align2, preferred-register: '', flags: [  ], 
       anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] }
-  - { id: 17, class: vreg_128_align2, preferred-register: '', flags: [  ],
-      anti-hints: [ '%16', '%8' ] }
-  - { id: 18, class: vreg_128_align2, preferred-register: '', flags: [  ],
+  - { id: 17, class: vreg_128_align2, preferred-register: '', flags: [  ], 
+      anti-hints: [ '%8' ] }
+  - { id: 18, class: vreg_128_align2, preferred-register: '', flags: [  ], 
       anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] }
-  - { id: 19, class: vreg_128_align2, preferred-register: '', flags: [  ],
+  - { id: 19, class: vreg_128_align2, preferred-register: '', flags: [  ], 
       anti-hints: [ '%23', '%1', '%29', '%0', '%28', '%30', '%9' ] }
   - { id: 20, class: vreg_128_align2, preferred-register: '', flags: [  ] }
   - { id: 21, class: vreg_128_align2, preferred-register: '', flags: [  ] }
-  - { id: 22, class: vreg_128_align2, preferred-register: '', flags: [  ],
+  - { id: 22, class: vreg_128_align2, preferred-register: '', flags: [  ], 
       anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] }
-  - { id: 23, class: vreg_128_align2, preferred-register: '', flags: [  ],
-      anti-hints: [ '%22', '%6', '%20', '%19', '%7', '%18', '%16', '%8' ] }
+  - { id: 23, class: vreg_128_align2, preferred-register: '', flags: [  ], 
+      anti-hints: [ '%6', '%19', '%7', '%18', '%16', '%8' ] }
   - { id: 24, class: vreg_128_align2, preferred-register: '', flags: [  ] }
-  - { id: 25, class: vreg_128_align2, preferred-register: '', flags: [  ],
+  - { id: 25, class: vreg_128_align2, preferred-register: '', flags: [  ], 
       anti-hints: [ '%29', '%0', '%28', '%30', '%9' ] }
   - { id: 26, class: vreg_128_align2, preferred-register: '', flags: [  ] }
   - { id: 27, class: vreg_128_align2, preferred-register: '', flags: [  ] }
-  - { id: 28, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
-                                                                                 '%27',
-                                                                                 '%4',
-                                                                                 '%26',
-                                                                                 '%25',
-                                                                                 '%5',
-                                                                                 '%24',
-                                                                                 '%22',
-                                                                                 '%6',
-                                                                                 '%20',
-                                                                                 '%19',
-                                                                                 '%7',
-                                                                                 '%18',
-                                                                                 '%16',
+  - { id: 28, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [ 
+                                                                                 '%4', 
+                                                                                 '%25', 
+                                                                                 '%5', 
+                                                                                 '%22', 
+                                                                                 '%6', 
+                                                                                 '%19', 
+                                                                                 '%7', 
+                                                                                 '%18', 
+                                                                                 '%16', 
                                                                                  '%8' ] }
-  - { id: 29, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [
-                                                                                 '%27',
-                                                                                 '%4',
-                                                                                 '%26',
-                                                                                 '%25',
-                                                                                 '%5',
-                                                                                 '%24',
-                                                                                 '%22',
-                                                                                 '%6',
-                                                                                 '%20',
-                                                                                 '%19',
-                                                                                 '%7',
-                                                                                 '%18',
-                                                                                 '%16',
+  - { id: 29, class: vgpr_32, preferred-register: '', flags: [  ], anti-hints: [ 
+                                                                                 '%4', 
+                                                                                 '%25', 
+                                                                                 '%5', 
+                                                                                 '%22', 
+                                                                                 '%6', 
+                                                                                 '%19', 
+                                                                                 '%7', 
+                                                                                 '%18', 
+                                                                                 '%16', 
                                                                                  '%8' ] }
-  - { id: 30, class: vreg_128_align2, preferred-register: '', flags: [  ],
-      anti-hints: [ '%27', '%4', '%26', '%25', '%5', '%24', '%22', '%6',
-                    '%20', '%19', '%7', '%18', '%16', '%8' ] }
+  - { id: 30, class: vreg_128_align2, preferred-register: '', flags: [  ], 
+      anti-hints: [ '%4', '%25', '%5', '%22', '%6', '%19', '%7', '%18', 
+                    '%16', '%8' ] }
 body:             |
   bb.0:
     ; CHECK-LABEL: name: test_software_pipelining

>From 0c5d1d9f1a8d18077d88e4a18ea769638769e15f Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Mon, 22 Sep 2025 19:10:29 -0400
Subject: [PATCH 16/18] Fixed typo

---
 llvm/lib/CodeGen/AllocationOrder.cpp | 4 ++--
 llvm/lib/CodeGen/AllocationOrder.h   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/AllocationOrder.cpp b/llvm/lib/CodeGen/AllocationOrder.cpp
index f57df79128c64..f420c96e212d0 100644
--- a/llvm/lib/CodeGen/AllocationOrder.cpp
+++ b/llvm/lib/CodeGen/AllocationOrder.cpp
@@ -62,7 +62,7 @@ AllocationOrder AllocationOrder::create(Register VirtReg, const VirtRegMap &VRM,
   // Create allocation order object
   AllocationOrder AO(std::move(Hints), Order, HardHints);
   
-  // Apply anti-hint filtering if needed
+  // Apply anti-hints filtering if needed
   if (!AntiHintedPhysRegs.empty()) {
     AO.applyAntiHints(AntiHintedPhysRegs, TRI);
     
@@ -103,7 +103,7 @@ void AllocationOrder::applyAntiHints(ArrayRef<MCPhysReg> AntiHintedPhysRegs,
     }
   }
   
-  // Update Order to point to our filtered storage
+  // Update Order
   Order = FilteredOrderStorage;
   
   LLVM_DEBUG({
diff --git a/llvm/lib/CodeGen/AllocationOrder.h b/llvm/lib/CodeGen/AllocationOrder.h
index 842f83d957a6d..029d9c83baf35 100644
--- a/llvm/lib/CodeGen/AllocationOrder.h
+++ b/llvm/lib/CodeGen/AllocationOrder.h
@@ -120,7 +120,7 @@ class LLVM_LIBRARY_VISIBILITY AllocationOrder {
     return Reg.isPhysical() && is_contained(Hints, Reg.id());
   }
   
-  /// Apply antihint to the allocation order.
+  /// Apply anti-hints to the allocation order.
   void applyAntiHints(ArrayRef<MCPhysReg> AntiHintedPhysRegs, 
                       const TargetRegisterInfo *TRI);
 

>From 512216771d41f77ce88a1245384c0363a575fa5e Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Mon, 22 Sep 2025 19:31:15 -0400
Subject: [PATCH 17/18] [AMDGPU] Anti-hints in register allocation

---
 .../include/llvm/CodeGen/MIRParser/MIParser.h |  2 +-
 llvm/include/llvm/CodeGen/MIRYamlMapping.h    |  5 +++--
 .../llvm/CodeGen/MachineRegisterInfo.h        | 19 +++++++++--------
 llvm/lib/CodeGen/AllocationOrder.cpp          | 21 +++++++++----------
 llvm/lib/CodeGen/AllocationOrder.h            |  5 ++---
 llvm/lib/CodeGen/MIRParser/MIRParser.cpp      |  7 +++----
 llvm/lib/CodeGen/MachineRegisterInfo.cpp      | 12 +++++------
 7 files changed, 35 insertions(+), 36 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h
index 1d0a745d5f983..cf7a56587397d 100644
--- a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h
+++ b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h
@@ -45,7 +45,7 @@ struct VRegInfo {
   } D;
   Register VReg;
   Register PreferredReg;
-  SmallVector<Register, 4> AntiHints;  // Anti-hints
+  SmallVector<Register, 4> AntiHints; // Anti-hints
   uint8_t Flags = 0;
 };
 
diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
index 3e4b57da91479..0698ff1248df0 100644
--- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h
+++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h
@@ -210,9 +210,10 @@ template <> struct MappingTraits<VirtualRegisterDefinition> {
                        StringValue()); // Don't print out when it's empty.
     YamlIO.mapOptional("flags", Reg.RegisterFlags,
                        std::vector<FlowStringValue>());
-    if(!YamlIO.outputting() || !Reg.AntiHints.empty()) {  // Only map when parsing or anti-hints present
+    if (!YamlIO.outputting() ||
+        !Reg.AntiHints.empty()) { // Only map when parsing or anti-hints present
       YamlIO.mapOptional("anti-hints", Reg.AntiHints,
-                       std::vector<FlowStringValue>());  // for anti-hints
+                         std::vector<FlowStringValue>()); // for anti-hints
     }
   }
 
diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index bcee5d6b30439..5f00aeebb46fe 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -880,20 +880,21 @@ class MachineRegisterInfo {
       AntiHints.push_back(AntiHintVReg);
   }
 
-  /// addRegAllocationAntiHint - Add multiple anti-hints at once
-  void addRegAllocationAntiHints(Register VReg, ArrayRef<Register> AntiHintVRegs) {
+  /// addRegAllocationAntiHints - Add multiple anti-hints at once.
+  void addRegAllocationAntiHints(Register VReg,
+                                 ArrayRef<Register> AntiHintVRegs) {
     for (Register AntiHint : AntiHintVRegs)
       setRegAllocationAntiHint(VReg, AntiHint);
   }
 
-  /// clearRegAllocationAntiHints - Clear all anti-hints for a register
+  /// clearRegAllocationAntiHints - Clear all anti-hints for a register.
   void clearRegAllocationAntiHints(Register VReg) {
     assert(VReg.isVirtual());
     if (AntiHintRegs.inBounds(VReg))
       AntiHintRegs[VReg].clear();
   }
 
-  /// getRegAllocationAntiHints - Return the vector of anti-hints for VReg
+  /// getRegAllocationAntiHints - Return the vector of anti-hints for VReg.
   ArrayRef<Register> getRegAllocationAntiHints(Register VReg) const {
     assert(VReg.isVirtual());
     if (!AntiHintRegs.inBounds(VReg))
@@ -901,7 +902,7 @@ class MachineRegisterInfo {
     return AntiHintRegs[VReg];
   }
 
-  /// hasRegAllocationAntiHint - Check if VReg has AntiHintVReg as an anti-hint
+  /// hasRegAllocationAntiHint - Check if VReg has AntiHintVReg as an anti-hint.
   bool hasRegAllocationAntiHint(Register VReg, Register AntiHintVReg) const {
     assert(VReg.isVirtual() && AntiHintVReg.isVirtual());
     if (!AntiHintRegs.inBounds(VReg))
@@ -910,11 +911,11 @@ class MachineRegisterInfo {
     return llvm::find(AntiHints, AntiHintVReg) != AntiHints.end();
   }
 
-  /// getPhysRegAntiHints - Get the set of physical registers to avoid based on
-  /// anti-hints and current allocations. This is called during allocation.
+  /// getPhysRegAntiHints - Get the set of physical registers to avoid.
   /// VRM is the current virtual register map showing allocations made so far.
-  void getPhysRegAntiHints(Register VReg, SmallVectorImpl<MCPhysReg> &PhysAntiHints,
-                          const VirtRegMap *VRM) const;
+  void getPhysRegAntiHints(Register VReg,
+                           SmallVectorImpl<MCPhysReg> &PhysAntiHints,
+                           const VirtRegMap *VRM) const;
 
   /// markUsesInDebugValueAsUndef - Mark every DBG_VALUE referencing the
   /// specified register as undefined which causes the DBG_VALUE to be
diff --git a/llvm/lib/CodeGen/AllocationOrder.cpp b/llvm/lib/CodeGen/AllocationOrder.cpp
index f420c96e212d0..8550759f97e8a 100644
--- a/llvm/lib/CodeGen/AllocationOrder.cpp
+++ b/llvm/lib/CodeGen/AllocationOrder.cpp
@@ -49,7 +49,7 @@ AllocationOrder AllocationOrder::create(Register VirtReg, const VirtRegMap &VRM,
   // Get anti-hints
   SmallVector<MCPhysReg, 16> AntiHintedPhysRegs;
   MRI.getPhysRegAntiHints(VirtReg, AntiHintedPhysRegs, &VRM);
-  
+
   LLVM_DEBUG({
     if (!AntiHintedPhysRegs.empty()) {
       dbgs() << "anti-hints:";
@@ -58,14 +58,14 @@ AllocationOrder AllocationOrder::create(Register VirtReg, const VirtRegMap &VRM,
       dbgs() << '\n';
     }
   });
-  
+
   // Create allocation order object
   AllocationOrder AO(std::move(Hints), Order, HardHints);
-  
+
   // Apply anti-hints filtering if needed
   if (!AntiHintedPhysRegs.empty()) {
     AO.applyAntiHints(AntiHintedPhysRegs, TRI);
-    
+
     LLVM_DEBUG({
       if (!AO.Hints.empty()) {
         dbgs() << "filtered hints:";
@@ -76,38 +76,37 @@ AllocationOrder AllocationOrder::create(Register VirtReg, const VirtRegMap &VRM,
     });
   }
 
-
   assert(all_of(AO.Hints,
                 [&](MCPhysReg Hint) { return is_contained(AO.Order, Hint); }) &&
          "Target hint is outside allocation order.");
   return AO;
 }
 
-void AllocationOrder::applyAntiHints(ArrayRef<MCPhysReg> AntiHintedPhysRegs, 
+void AllocationOrder::applyAntiHints(ArrayRef<MCPhysReg> AntiHintedPhysRegs,
                                      const TargetRegisterInfo *TRI) {
   // Create filtered order
   FilteredOrderStorage.clear();
   FilteredOrderStorage.reserve(Order.size());
-  
+
   // Add non-anti-hinted registers first
   for (MCPhysReg PhysReg : Order) {
     if (!is_contained(AntiHintedPhysRegs, PhysReg)) {
       FilteredOrderStorage.push_back(PhysReg);
     }
   }
-  
+
   // Add anti-hinted registers at the end as last resort
   for (MCPhysReg PhysReg : Order) {
     if (is_contained(AntiHintedPhysRegs, PhysReg)) {
       FilteredOrderStorage.push_back(PhysReg);
     }
   }
-  
+
   // Update Order
   Order = FilteredOrderStorage;
-  
+
   LLVM_DEBUG({
-    dbgs() << "moved " << AntiHintedPhysRegs.size() 
+    dbgs() << "moved " << AntiHintedPhysRegs.size()
            << " anti-hinted registers to end of allocation order\n";
   });
 }
diff --git a/llvm/lib/CodeGen/AllocationOrder.h b/llvm/lib/CodeGen/AllocationOrder.h
index 029d9c83baf35..cda5fd08e0af6 100644
--- a/llvm/lib/CodeGen/AllocationOrder.h
+++ b/llvm/lib/CodeGen/AllocationOrder.h
@@ -119,11 +119,10 @@ class LLVM_LIBRARY_VISIBILITY AllocationOrder {
                static_cast<uint32_t>(std::numeric_limits<MCPhysReg>::max()));
     return Reg.isPhysical() && is_contained(Hints, Reg.id());
   }
-  
+
   /// Apply anti-hints to the allocation order.
-  void applyAntiHints(ArrayRef<MCPhysReg> AntiHintedPhysRegs, 
+  void applyAntiHints(ArrayRef<MCPhysReg> AntiHintedPhysRegs,
                       const TargetRegisterInfo *TRI);
-
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
index 1110823a2ca5a..8af521d6ab2c7 100644
--- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -739,13 +739,12 @@ bool MIRParserImpl::parseRegisterInfo(PerFunctionMIParsingState &PFS,
     for (const auto &AntiHintValue : VReg.AntiHints) {
       if (Info.Kind != VRegInfo::NORMAL)
         return error(VReg.Class.SourceRange.Start,
-              Twine("anti-hints can only be set for normal vregs"));
+                     Twine("anti-hints can only be set for normal vregs"));
 
       Register AntiHintReg;
-      if (parseRegisterReference(PFS, AntiHintReg,
-                                 AntiHintValue.Value, Error))
+      if (parseRegisterReference(PFS, AntiHintReg, AntiHintValue.Value, Error))
         return error(Error, AntiHintValue.SourceRange);
-      
+
       Info.AntiHints.push_back(AntiHintReg);
     }
 
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index c169315c555d5..9bb7014403ad9 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -678,16 +678,16 @@ bool MachineRegisterInfo::isReservedRegUnit(unsigned Unit) const {
   return false;
 }
 
-void MachineRegisterInfo::getPhysRegAntiHints(Register VReg, 
-                                             SmallVectorImpl<MCPhysReg> &PhysAntiHints,
-                                             const VirtRegMap *VRM) const {
+void MachineRegisterInfo::getPhysRegAntiHints(
+    Register VReg, SmallVectorImpl<MCPhysReg> &PhysAntiHints,
+    const VirtRegMap *VRM) const {
   assert(VReg.isVirtual());
   if (!AntiHintRegs.inBounds(VReg) || !VRM)
     return;
-  
+
   const auto &AntiHints = AntiHintRegs[VReg];
   const TargetRegisterInfo *TRI = getTargetRegisterInfo();
-  
+
   for (Register AntiHintVReg : AntiHints) {
     // Check if the anti-hinted register has been allocated
     if (VRM->hasPhys(AntiHintVReg)) {
@@ -698,7 +698,7 @@ void MachineRegisterInfo::getPhysRegAntiHints(Register VReg,
       }
     }
   }
-  
+
   // Remove duplicates
   llvm::sort(PhysAntiHints);
   PhysAntiHints.erase(llvm::unique(PhysAntiHints), PhysAntiHints.end());

>From fc6b1f66c7c8ebbb0e50bb5c62ebe80276a43b6f Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Tue, 23 Sep 2025 11:48:50 -0400
Subject: [PATCH 18/18] Modified flag name to reflect anti-hints

---
 llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp   | 14 ++++++--------
 .../AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir  |  2 +-
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 098ca1120c85c..0a08cbbdbf2dc 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -34,7 +34,6 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -44,11 +43,11 @@ using namespace llvm;
 
 #define DEBUG_TYPE "amdgpu-pre-ra-optimizations"
 
-static cl::opt<bool> EnableRegisterAvoidListForMFMARegs(
-    "amdgpu-avoid-hazard-hint-for-mfma", cl::Hidden,
-    cl::desc("Enable Register Avoidance for "
-             "MFMA in GCNPreRAOptimizations stage."),
-    cl::init(true));
+static cl::opt<bool>
+    EnableAntiHintsForMFMARegs("amdgpu-anti-hints-for-mfma", cl::Hidden,
+                               cl::desc("Enable Anti-Hints for "
+                                        "MFMA in GCNPreRAOptimizations stage."),
+                               cl::init(true));
 
 namespace {
 
@@ -256,10 +255,9 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
   bool Changed = false;
 
   // Single pass implementation
-  if (EnableRegisterAvoidListForMFMARegs && ST.hasMAIInsts()) {
+  if (EnableAntiHintsForMFMARegs && ST.hasMAIInsts()) {
     // Max lookback window for RAW or WAW hazard
     constexpr unsigned MaxLookbackWindow = 19;
-    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     for (const MachineBasicBlock &MBB : MF) {
 
       SmallVector<std::pair<SlotIndex, SmallVector<Register, 4>>, 16>
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir
index d360eccaeb773..ba89b09539113 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.anti-hints.gfx942.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter %s -o - | FileCheck %s --check-prefix=CHECK
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter -amdgpu-avoid-hazard-hint-for-mfma=false %s -o - | FileCheck %s --check-prefix=CHECK-NO-ANTIHINT
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=amdgpu-pre-ra-optimizations,greedy,machineverifier,virtregrewriter -amdgpu-anti-hints-for-mfma=false %s -o - | FileCheck %s --check-prefix=CHECK-NO-ANTIHINT
 
 --- |
   target triple = "amdgcn-amd-amdhsa"



More information about the llvm-commits mailing list