[llvm] [AMDGPU] Improve register allocation to reduce MFMA hazard NOPs (PR #156943)

via llvm-commits llvm-commits at lists.llvm.org
Thu Sep 4 11:15:20 PDT 2025


https://github.com/mssefat created https://github.com/llvm/llvm-project/pull/156943

[AMDGPU] Improve register allocation to reduce MFMA hazard NOPs

Reduce unnecessary s_nop insertion for MFMA hazards by creating hints for register allocation.

When subsequent instructions such as ds_read, buffer_load, or other memory/VALU instructions 
follow MFMA instructions, the register allocator often reuses the same VGPRs that MFMA instructions 
used as destinations or C matrix operands. This reuse creates hazards, forcing the hazard
recognizer to insert s_nop instructions.

Example:
  v_mfma_f32_16x16x32_fp8_fp8 v[26:29], v[102:103], v[6:7], v[134:137]
  ...
  s_nop 5 ; <-- inserted by the hazard recognizer because v[26:29] is reused below
  ds_read_b128 v[26:29], v125 offset:10240

This patch introduces a two-phase register hint mechanism to reduce MFMA hazards:

1. In GCNPreRAOptimizations, track MFMA destination and accumulator (matrix C) registers, identify
   registers used by subsequent instructions that may trigger hazards, and record them in a register avoid list in SIMachineFunctionInfo.
2. The register allocator uses this list to create hints, prioritizing physical registers that don't conflict with the registers on the avoid list.

From ee1ade05012acf9c14bbe3d8852277deea0d9e58 Mon Sep 17 00:00:00 2001
From: mssefat <syadus.sefat at gmail.com>
Date: Thu, 4 Sep 2025 12:31:49 -0400
Subject: [PATCH] [AMDGPU] Improve register allocation to reduce MFMA hazard
 NOPs

---
 .../Target/AMDGPU/GCNPreRAOptimizations.cpp   |   94 +
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h |   14 +
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp     |   32 +
 llvm/lib/Target/AMDGPU/SIRegisterInfo.h       |    4 +-
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir |  515 +--
 .../AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir |  542 +--
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll |  204 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll | 3662 ++++++++---------
 .../AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll    |   62 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll |  464 +--
 ...amdgcn.mfma.hint.hazard.barrier.gfx942.mir | 1292 ++++++
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll  |  150 +-
 ...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll |  276 +-
 .../AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll    |   12 +-
 .../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll       |  469 +--
 .../AMDGPU/rewrite-vgpr-mfma-to-agpr.ll       |  126 +-
 .../unspill-vgpr-after-rewrite-vgpr-mfma.ll   |   56 +-
 17 files changed, 4650 insertions(+), 3324 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir

diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 4deb2a9485e4d..6d2b10bdb5804 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -34,6 +34,7 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -43,6 +44,12 @@ using namespace llvm;
 
 #define DEBUG_TYPE "amdgpu-pre-ra-optimizations"
 
+static cl::opt<bool> EnableRegisterAvoidListForMFMARegs(
+    "amdgpu-avoid-hazard-hint-for-mfma", cl::Hidden,
+    cl::desc("Enable Register Avoidance for "
+             "MFMA in GCNPreRAOptimizations stage."),
+    cl::init(true));
+
 namespace {
 
 class GCNPreRAOptimizationsImpl {
@@ -248,6 +255,93 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
 
   bool Changed = false;
 
+  // Single pass implementation
+  if (EnableRegisterAvoidListForMFMARegs && ST.hasMAIInsts()) {
+    // Max lookback window for RAW or WAW hazard
+    constexpr unsigned MaxLookbackWindow = 19;
+    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+    for (const MachineBasicBlock &MBB : MF) {
+
+      SmallVector<std::pair<SlotIndex, SmallVector<Register, 4>>, 16>
+          RecentMFMAs;
+      for (const MachineInstr &MI : MBB) {
+        if (MI.isDebugInstr())
+          continue;
+        const SlotIndex CurrentSlot = LIS->getInstructionIndex(MI).getRegSlot();
+        // Handle MFMA instructions
+        if (SIInstrInfo::isMFMA(MI)) {
+          SmallVector<Register, 4> MFMARegisters;
+          auto collectMFMARegister = [&](unsigned OpIdx) {
+            if (OpIdx >= MI.getNumOperands())
+              return;
+
+            const MachineOperand &MO = MI.getOperand(OpIdx);
+            if (MO.isReg() && MO.getReg().isVirtual())
+              MFMARegisters.push_back(MO.getReg());
+          };
+          // Only collect Matrix C (operand 3) and destination (operand 0)
+          // registers
+          collectMFMARegister(0);
+          collectMFMARegister(3);
+
+          if (!MFMARegisters.empty()) {
+            RecentMFMAs.emplace_back(CurrentSlot, std::move(MFMARegisters));
+            // Maintain window
+            if (RecentMFMAs.size() > MaxLookbackWindow)
+              RecentMFMAs.erase(RecentMFMAs.begin());
+          }
+          continue;
+        }
+        bool ShouldCheckReuse = MI.mayLoad() || MI.mayStore() || MI.isCopy() ||
+                                SIInstrInfo::isVALU(MI);
+        // Skip non-relevant instructions, or skip until at least one MFMA is
+        // encountered
+        if (!ShouldCheckReuse || RecentMFMAs.empty())
+          continue;
+
+        // Process operands that might reuse MFMA registers
+        for (const MachineOperand &MO : MI.operands()) {
+          if (!MO.isReg() || !MO.getReg().isVirtual())
+            continue;
+
+          const Register CandidateReg = MO.getReg();
+          const TargetRegisterClass *CandidateRC =
+              MRI->getRegClass(CandidateReg);
+
+          // Only process VGPR registers
+          if (!TRI->isVGPRClass(CandidateRC))
+            continue;
+
+          for (auto It = RecentMFMAs.rbegin(); It != RecentMFMAs.rend(); ++It) {
+            const SmallVector<Register, 4> &MFMARegs = It->second;
+            for (Register MFMAReg : MFMARegs) {
+              // Verify register class compatibility
+              const TargetRegisterClass *MFMARC = MRI->getRegClass(MFMAReg);
+              if (!TRI->hasVGPRs(MFMARC))
+                continue;
+
+              // Check if MFMA register is dead at current instruction
+              const LiveInterval &MFMAInterval = LIS->getInterval(MFMAReg);
+              if (!MFMAInterval.liveAt(CurrentSlot)) {
+
+                // Add bidirectional avoidance hint
+                MFI->addRegisterToAvoid(CandidateReg, MFMAReg);
+                MFI->addRegisterToAvoid(MFMAReg, CandidateReg);
+
+                // Set hint if we found registers to avoid
+                MRI->setRegAllocationHint(
+                    MFMAReg, AMDGPURI::HasRegisterAvoidanceList, Register());
+                MRI->setRegAllocationHint(CandidateReg,
+                                          AMDGPURI::HasRegisterAvoidanceList,
+                                          Register());
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
   for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
     Register Reg = Register::index2VirtReg(I);
     if (!LIS->hasInterval(Reg))
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index ca8f8033a2d54..17fb1f2a2db04 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -1207,6 +1207,20 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   unsigned getMaxNumWorkGroupsX() const { return MaxNumWorkGroups[0]; }
   unsigned getMaxNumWorkGroupsY() const { return MaxNumWorkGroups[1]; }
   unsigned getMaxNumWorkGroupsZ() const { return MaxNumWorkGroups[2]; }
+
+  // Map of registers to avoid for a given register
+  DenseMap<Register, SmallVector<Register, 8>> RegisterAvoidanceMap;
+
+  void addRegisterToAvoid(Register VirtReg, Register AvoidReg) {
+    RegisterAvoidanceMap[VirtReg].push_back(AvoidReg);
+  }
+
+  ArrayRef<Register> getRegistersToAvoid(Register VirtReg) const {
+    auto It = RegisterAvoidanceMap.find(VirtReg);
+    if (It != RegisterAvoidanceMap.end())
+      return It->second;
+    return ArrayRef<Register>();
+  }
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index a1fcf26eab27b..61c4f19c7111a 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3838,6 +3838,38 @@ bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
     }
     return false;
   }
+  case AMDGPURI::HasRegisterAvoidanceList: {
+    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+    ArrayRef<Register> AvoidRegs = MFI->getRegistersToAvoid(VirtReg);
+
+    if (AvoidRegs.empty())
+      return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
+                                                       MF, VRM);
+    // Collect physical registers to avoid
+    SmallSet<MCPhysReg, 32> AvoidPhysRegs;
+    for (Register AvoidReg : AvoidRegs) {
+      if (VRM && VRM->hasPhys(AvoidReg)) {
+        // Virtual register already mapped - try to avoid its physical register
+        MCPhysReg AvoidPhys = VRM->getPhys(AvoidReg);
+        for (MCRegAliasIterator AI(AvoidPhys, this, true); AI.isValid(); ++AI)
+          AvoidPhysRegs.insert(*AI);
+      }
+    }
+
+    if (AvoidPhysRegs.empty()) {
+      // No physical registers added yet - use default order
+      return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints,
+                                                       MF, VRM);
+    }
+
+    // Prioritize registers that don't conflict with avoided registers
+    for (MCPhysReg PhysReg : Order) {
+      if (!AvoidPhysRegs.count(PhysReg) && !MRI.isReserved(PhysReg))
+        Hints.push_back(PhysReg);
+    }
+
+    return false;
+  }
   default:
     return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
                                                      VRM);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index eeefef1116aa3..8f1ca4f18afb9 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -31,9 +31,11 @@ class RegisterBank;
 struct SGPRSpillBuilder;
 
 /// Register allocation hint types. Helps eliminate unneeded COPY with True16
+/// HasRegisterAvoidanceList helps with minimizing usage of conflicting physical
+/// registers
 namespace AMDGPURI {
 
-enum { Size16 = 1, Size32 = 2 };
+enum { Size16 = 1, Size32 = 2, HasRegisterAvoidanceList = 3 };
 
 } // end namespace AMDGPURI
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
index aad6e031aa9ed..3996a94e0347e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
@@ -15,9 +15,12 @@
   ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
   ; GCN-NEXT:    ; implicit-def: $vgpr106
   ; GCN-NEXT:    ; implicit-def: $vgpr132
+  ; GCN-NEXT:    ; implicit-def: $vgpr112
+  ; GCN-NEXT:    ; implicit-def: $vgpr113
+  ; GCN-NEXT:    ; implicit-def: $vgpr114
+  ; GCN-NEXT:    ; implicit-def: $vgpr115
   ; GCN-NEXT:    ; implicit-def: $vgpr133
   ; GCN-NEXT:    ; implicit-def: $vgpr139
-  ; GCN-NEXT:    ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127
   ; GCN-NEXT:    ; iglp_opt mask(0x00000002)
   ; GCN-NEXT:    ; implicit-def: $sgpr0
   ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -167,46 +170,45 @@
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b128 v95, v[68:71] offset:1024
-  ; GCN-NEXT:    ; implicit-def: $vgpr64
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
-  ; GCN-NEXT:    v_add_u32_e32 v72, 0xc0, v93
-  ; GCN-NEXT:    ; implicit-def: $vgpr73
-  ; GCN-NEXT:    v_add_u32_e32 v76, v132, v64
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_load_dwordx4 v[64:67], v92, s[8:11], 0 offen offset:192 sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[72:73], v[76:77], v[0:15]
+  ; GCN-NEXT:    v_add_u32_e32 v72, 0xc0, v93
+  ; GCN-NEXT:    v_add_u32_e32 v73, v132, v112
   ; GCN-NEXT:    buffer_load_dwordx4 v[68:71], v72, s[8:11], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ; kill: killed $vgpr72
-  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v73
-  ; GCN-NEXT:    buffer_load_dwordx2 v[98:99], v76, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v113
+  ; GCN-NEXT:    buffer_load_dwordx2 v[98:99], v73, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    buffer_load_dwordx2 v[102:103], v72, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
-  ; GCN-NEXT:    ; implicit-def: $vgpr74
-  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v74
-  ; GCN-NEXT:    ; implicit-def: $vgpr75
+  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v114
   ; GCN-NEXT:    buffer_load_dwordx2 v[100:101], v72, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v75
+  ; GCN-NEXT:    v_add_u32_e32 v72, v132, v115
   ; GCN-NEXT:    buffer_load_dwordx2 v[104:105], v72, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[74:75], v[78:79], v[0:15]
+  ; GCN-NEXT:    ; kill: killed $vgpr73
   ; GCN-NEXT:    ds_read_b128 v[72:75], v94
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ; kill: killed $vgpr76
   ; GCN-NEXT:    ; implicit-def: $vgpr76_vgpr77_vgpr78_vgpr79
   ; GCN-NEXT:    ; implicit-def: $sgpr8
+  ; GCN-NEXT:    ; implicit-def: $vgpr112
+  ; GCN-NEXT:    ; implicit-def: $vgpr113
+  ; GCN-NEXT:    ; implicit-def: $vgpr114
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[72:73], v[76:77], v[48:63]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[74:75], v[78:79], v[48:63]
   ; GCN-NEXT:    ds_read_b128 v[72:75], v94 offset:512
@@ -411,8 +413,6 @@
   ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v66
   ; GCN-NEXT:    ; implicit-def: $vgpr65
   ; GCN-NEXT:    ; implicit-def: $vgpr66
-  ; GCN-NEXT:    ; implicit-def: $vgpr68
-  ; GCN-NEXT:    ; implicit-def: $vgpr67
   ; GCN-NEXT:    v_add_u32_e32 v65, s7, v65
   ; GCN-NEXT:    v_and_b32_e32 v65, 0x1fffffff, v65
   ; GCN-NEXT:    v_mul_lo_u32 v65, v65, s6
@@ -440,40 +440,36 @@
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b64 v138, v[96:97]
-  ; GCN-NEXT:    v_add_u32_e32 v68, v132, v68
+  ; GCN-NEXT:    ; implicit-def: $vgpr96
   ; GCN-NEXT:    v_cndmask_b32_e64 v64, v65, v64, s[6:7]
   ; GCN-NEXT:    v_max_f32_e32 v64, v64, v64
   ; GCN-NEXT:    ; implicit-def: $vgpr65
   ; GCN-NEXT:    v_max_f32_e32 v66, v65, v65
   ; GCN-NEXT:    v_max_f32_e32 v134, v66, v64
-  ; GCN-NEXT:    ; implicit-def: $vgpr64
+  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v96
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_load_dwordx2 v[156:157], v68, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[160:161], v64, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v64
-  ; GCN-NEXT:    buffer_load_dwordx2 v[158:159], v64, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v112
+  ; GCN-NEXT:    buffer_load_dwordx2 v[162:163], v64, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ; implicit-def: $vgpr66
-  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v66
+  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v113
   ; GCN-NEXT:    buffer_load_dwordx2 v[128:129], v64, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v67
+  ; GCN-NEXT:    v_add_u32_e32 v64, v132, v114
   ; GCN-NEXT:    buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_fma_f32 v57, s4, v57, -v134
   ; GCN-NEXT:    v_fma_f32 v48, s4, v48, -v134
-  ; GCN-NEXT:    v_fma_f32 v96, s4, v58, -v134
-  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v57
+  ; GCN-NEXT:    v_fma_f32 v57, s4, v57, -v134
   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v48
   ; GCN-NEXT:    v_fma_f32 v64, s4, v49, -v134
-  ; GCN-NEXT:    v_exp_f32_e32 v163, v57
-  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v96
+  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v57
   ; GCN-NEXT:    v_fma_f32 v66, s4, v50, -v134
-  ; GCN-NEXT:    v_exp_f32_e32 v164, v57
+  ; GCN-NEXT:    v_exp_f32_e32 v165, v57
   ; GCN-NEXT:    v_exp_f32_e32 v49, v48
   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v64
   ; GCN-NEXT:    v_fma_f32 v67, s4, v51, -v134
@@ -499,31 +495,30 @@
   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v70
   ; GCN-NEXT:    v_exp_f32_e32 v55, v48
   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v71
-  ; GCN-NEXT:    ds_read_b128 v[144:147], v139 offset:576
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_fma_f32 v66, s4, v56, -v134
   ; GCN-NEXT:    v_exp_f32_e32 v56, v48
   ; GCN-NEXT:    v_sub_f32_e32 v48, v65, v134
+  ; GCN-NEXT:    ds_read_b128 v[144:147], v139 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v64, v49
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v67, v50
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v68, v51
+  ; GCN-NEXT:    v_fma_f32 v96, s4, v58, -v134
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v58, v52
   ; GCN-NEXT:    v_mul_f32_e32 v48, 0x3fb8aa3b, v48
   ; GCN-NEXT:    ds_read_b128 v[148:151], v139 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_exp_f32_e32 v48, v48
-  ; GCN-NEXT:    v_pack_b32_f16 v161, v68, v58
-  ; GCN-NEXT:    v_pack_b32_f16 v160, v64, v67
-  ; GCN-NEXT:    v_mul_f32_e32 v58, 0x3fb8aa3b, v66
+  ; GCN-NEXT:    v_fma_f32 v156, s4, v59, -v134
+  ; GCN-NEXT:    v_pack_b32_f16 v59, v68, v58
+  ; GCN-NEXT:    v_pack_b32_f16 v58, v64, v67
+  ; GCN-NEXT:    v_mul_f32_e32 v80, 0x3fb8aa3b, v66
   ; GCN-NEXT:    ; implicit-def: $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79
   ; GCN-NEXT:    ds_read_b128 v[152:155], v139 offset:1728
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_fma_f32 v162, s4, v61, -v134
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v61, v55
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v57, v56
   ; GCN-NEXT:    v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0]
@@ -532,10 +527,14 @@
   ; GCN-NEXT:    v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v96
+  ; GCN-NEXT:    ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111
+  ; GCN-NEXT:    v_fma_f32 v157, s4, v60, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[58:59], v[64:79]
+  ; GCN-NEXT:    v_exp_f32_e32 v141, v80
   ; GCN-NEXT:    ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
-  ; GCN-NEXT:    v_fma_f32 v59, s4, v59, -v134
+  ; GCN-NEXT:    v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79]
   ; GCN-NEXT:    v_pk_mul_f32 v[82:83], v[82:83], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[84:85], v[84:85], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[86:87], v[86:87], v[48:49] op_sel_hi:[1,0]
@@ -543,10 +542,6 @@
   ; GCN-NEXT:    v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0]
-  ; GCN-NEXT:    ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111
-  ; GCN-NEXT:    v_exp_f32_e32 v58, v58
-  ; GCN-NEXT:    v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[160:161], v[80:95]
   ; GCN-NEXT:    v_pk_mul_f32 v[98:99], v[98:99], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[100:101], v[100:101], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[102:103], v[102:103], v[48:49] op_sel_hi:[1,0]
@@ -554,258 +549,263 @@
   ; GCN-NEXT:    v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pack_b32_f16 v145, v61, v57
-  ; GCN-NEXT:    v_mul_f32_e32 v57, 0x3fb8aa3b, v59
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v140, v53
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v141, v54
-  ; GCN-NEXT:    v_exp_f32_e32 v59, v57
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111]
-  ; GCN-NEXT:    v_fma_f32 v60, s4, v60, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[58:59], v[80:95]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v144, v54
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v145, v55
+  ; GCN-NEXT:    v_exp_f32_e32 v167, v57
+  ; GCN-NEXT:    ; implicit-def: $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127
+  ; GCN-NEXT:    v_mul_f32_e32 v168, 0x3fb8aa3b, v157
   ; GCN-NEXT:    v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[114:115], v[114:115], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[58:59], v[96:111]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v148, v56
   ; GCN-NEXT:    v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0]
   ; GCN-NEXT:    v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_fma_f32 v148, s4, v62, -v134
-  ; GCN-NEXT:    v_pack_b32_f16 v144, v140, v141
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[160:161], v[112:127]
+  ; GCN-NEXT:    v_pack_b32_f16 v149, v145, v148
+  ; GCN-NEXT:    v_pack_b32_f16 v148, v140, v144
+  ; GCN-NEXT:    v_mul_f32_e32 v140, 0x3fb8aa3b, v156
+  ; GCN-NEXT:    v_exp_f32_e32 v168, v168
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[58:59], v[112:127]
+  ; GCN-NEXT:    v_exp_f32_e32 v153, v140
+  ; GCN-NEXT:    ; implicit-def: $vgpr140
+  ; GCN-NEXT:    v_fma_f32 v164, s4, v61, -v134
+  ; GCN-NEXT:    v_fma_f32 v166, s4, v62, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v169, v141
   ; GCN-NEXT:    v_fma_f32 v152, s4, v63, -v134
-  ; GCN-NEXT:    v_mul_f32_e32 v149, 0x3fb8aa3b, v60
-  ; GCN-NEXT:    ; implicit-def: $vgpr57
-  ; GCN-NEXT:    ds_read_b128 v[60:63], v57
+  ; GCN-NEXT:    v_fma_f32 v32, s4, v32, -v134
+  ; GCN-NEXT:    v_fma_f32 v57, s4, v35, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[148:149], v[64:79]
+  ; GCN-NEXT:    ds_read_b128 v[142:145], v140
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_exp_f32_e32 v160, v149
-  ; GCN-NEXT:    v_fma_f32 v161, s4, v33, -v134
-  ; GCN-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v148
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v153, v58
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79]
-  ; GCN-NEXT:    v_fma_f32 v32, s4, v32, -v134
-  ; GCN-NEXT:    ds_read_b128 v[140:143], v57 offset:576
+  ; GCN-NEXT:    ds_read_b128 v[156:159], v140 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_fma_f32 v40, s4, v40, -v134
   ; GCN-NEXT:    v_fma_f32 v44, s4, v44, -v134
   ; GCN-NEXT:    v_fma_f32 v16, s4, v16, -v134
-  ; GCN-NEXT:    v_fma_f32 v166, s4, v20, -v134
   ; GCN-NEXT:    v_fma_f32 v24, s4, v24, -v134
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95]
-  ; GCN-NEXT:    v_mul_f32_e32 v146, 0x3fb8aa3b, v162
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v147, v163
-  ; GCN-NEXT:    v_exp_f32_e32 v162, v146
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v146, v164
   ; GCN-NEXT:    v_fma_f32 v28, s4, v28, -v134
-  ; GCN-NEXT:    v_pack_b32_f16 v148, v153, v147
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[148:149], v[80:95]
+  ; GCN-NEXT:    v_mul_f32_e32 v146, 0x3fb8aa3b, v164
+  ; GCN-NEXT:    v_fma_f32 v164, s4, v33, -v134
+  ; GCN-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v166
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v147, v165
+  ; GCN-NEXT:    v_exp_f32_e32 v170, v146
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v146, v167
   ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v134
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[144:145], v[96:111]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[148:149], v[96:111]
   ; GCN-NEXT:    v_exp_f32_e32 v151, v33
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v59
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v153
+  ; GCN-NEXT:    v_pack_b32_f16 v62, v169, v147
   ; GCN-NEXT:    v_fma_f32 v150, s4, v34, -v134
-  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v134
-  ; GCN-NEXT:    v_fma_f32 v12, s4, v12, -v134
-  ; GCN-NEXT:    v_pack_b32_f16 v149, v146, v33
+  ; GCN-NEXT:    v_perm_b32 v147, v131, v129, s8
+  ; GCN-NEXT:    v_pack_b32_f16 v63, v146, v33
   ; GCN-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v152
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[144:145], v[112:127]
-  ; GCN-NEXT:    v_fma_f32 v152, s4, v35, -v134
-  ; GCN-NEXT:    v_exp_f32_e32 v153, v33
-  ; GCN-NEXT:    v_fma_f32 v155, s4, v36, -v134
-  ; GCN-NEXT:    v_perm_b32 v36, v158, v156, s5
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v154, v160
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[60:61], v[148:149], v[64:79]
-  ; GCN-NEXT:    v_mul_f32_e32 v60, 0x3fb8aa3b, v32
-  ; GCN-NEXT:    ds_read_b128 v[32:35], v57 offset:1152
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[144:147], v57 offset:1728
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mul_f32_e32 v61, 0x3fb8aa3b, v161
-  ; GCN-NEXT:    v_exp_f32_e32 v165, v60
-  ; GCN-NEXT:    v_perm_b32 v60, v158, v156, s8
-  ; GCN-NEXT:    v_fma_f32 v158, s4, v37, -v134
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[140:141], v[148:149], v[80:95]
-  ; GCN-NEXT:    v_exp_f32_e32 v161, v61
-  ; GCN-NEXT:    v_perm_b32 v140, v159, v157, s8
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[148:149], v[112:127]
+  ; GCN-NEXT:    v_exp_f32_e32 v148, v33
+  ; GCN-NEXT:    v_fma_f32 v152, s4, v36, -v134
+  ; GCN-NEXT:    v_perm_b32 v36, v162, v160, s5
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v149, v168
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v155, v170
+  ; GCN-NEXT:    v_perm_b32 v146, v163, v161, s8
+  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[62:63], v[64:79]
+  ; GCN-NEXT:    v_mul_f32_e32 v142, 0x3fb8aa3b, v32
+  ; GCN-NEXT:    ds_read_b128 v[32:35], v140 offset:1152
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    ds_read_b128 v[58:61], v140 offset:1728
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mul_f32_e32 v143, 0x3fb8aa3b, v164
+  ; GCN-NEXT:    v_exp_f32_e32 v154, v142
+  ; GCN-NEXT:    v_perm_b32 v142, v162, v160, s8
+  ; GCN-NEXT:    v_fma_f32 v160, s4, v38, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[156:157], v[62:63], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v157, v143
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v38, v148
+  ; GCN-NEXT:    v_fma_f32 v156, s4, v37, -v134
   ; GCN-NEXT:    v_perm_b32 v37, v130, v128, s5
-  ; GCN-NEXT:    v_perm_b32 v61, v130, v128, s8
-  ; GCN-NEXT:    v_perm_b32 v141, v131, v129, s8
+  ; GCN-NEXT:    v_perm_b32 v143, v130, v128, s8
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    ds_write_b64 v135, v[36:37]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[148:149], v[96:111]
-  ; GCN-NEXT:    v_perm_b32 v32, v159, v157, s5
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[62:63], v[96:111]
   ; GCN-NEXT:    v_mul_f32_e32 v33, 0x3fb8aa3b, v150
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v150, v151
-  ; GCN-NEXT:    v_fma_f32 v157, s4, v38, -v134
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v38, v153
-  ; GCN-NEXT:    v_exp_f32_e32 v159, v33
+  ; GCN-NEXT:    v_perm_b32 v32, v163, v161, s5
+  ; GCN-NEXT:    v_exp_f32_e32 v161, v33
   ; GCN-NEXT:    v_perm_b32 v33, v131, v129, s5
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[144:145], v[148:149], v[112:127]
-  ; GCN-NEXT:    v_pack_b32_f16 v129, v150, v38
-  ; GCN-NEXT:    v_mul_f32_e32 v38, 0x3fb8aa3b, v152
-  ; GCN-NEXT:    v_exp_f32_e32 v152, v38
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b64 v136, v[60:61]
+  ; GCN-NEXT:    ds_write_b64 v136, v[142:143]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b64 v137, v[32:33]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[58:59], v[62:63], v[112:127]
+  ; GCN-NEXT:    v_pack_b32_f16 v59, v150, v38
+  ; GCN-NEXT:    v_mul_f32_e32 v38, 0x3fb8aa3b, v57
+  ; GCN-NEXT:    v_pack_b32_f16 v58, v149, v155
+  ; GCN-NEXT:    v_exp_f32_e32 v149, v38
   ; GCN-NEXT:    ; implicit-def: $vgpr33
   ; GCN-NEXT:    ; implicit-def: $vgpr38
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b64 v138, v[140:141]
+  ; GCN-NEXT:    ds_write_b64 v138, v[146:147]
   ; GCN-NEXT:    v_add_u32_e32 v38, v132, v38
   ; GCN-NEXT:    v_add_u32_e32 v33, v132, v33
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_load_dwordx2 v[130:131], v38, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[62:63], v38, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx2 v[140:141], v33, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[142:143], v33, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ; implicit-def: $vgpr36
   ; GCN-NEXT:    v_add_u32_e32 v33, v132, v36
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[144:145], v[58:59], v[64:79]
   ; GCN-NEXT:    ; implicit-def: $vgpr37
   ; GCN-NEXT:    buffer_load_dwordx2 v[144:145], v33, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_add_u32_e32 v33, v132, v37
-  ; GCN-NEXT:    buffer_load_dwordx2 v[148:149], v33, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[146:147], v33, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v156, v162
-  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v155
+  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v152
+  ; GCN-NEXT:    v_exp_f32_e32 v150, v32
+  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v156
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v165
-  ; GCN-NEXT:    v_pack_b32_f16 v128, v154, v156
-  ; GCN-NEXT:    v_fma_f32 v150, s4, v39, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[158:159], v[58:59], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v156, v32
+  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v160
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v154
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v152, v157
+  ; GCN-NEXT:    v_fma_f32 v57, s4, v39, -v134
   ; GCN-NEXT:    ds_read_b128 v[36:39], v139
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[62:63], v[128:129], v[64:79]
-  ; GCN-NEXT:    v_exp_f32_e32 v154, v32
-  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v158
-  ; GCN-NEXT:    ds_read_b128 v[60:63], v139 offset:576
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_fma_f32 v156, s4, v42, -v134
-  ; GCN-NEXT:    v_perm_b32 v20, v140, v130, s5
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[142:143], v[128:129], v[80:95]
-  ; GCN-NEXT:    v_exp_f32_e32 v155, v32
-  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v157
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v142, v161
-  ; GCN-NEXT:    v_fma_f32 v143, s4, v41, -v134
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[128:129], v[96:111]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v34, v159
-  ; GCN-NEXT:    v_exp_f32_e32 v157, v32
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v32, v152
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[146:147], v[128:129], v[112:127]
-  ; GCN-NEXT:    v_pack_b32_f16 v129, v34, v32
-  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v150
-  ; GCN-NEXT:    v_pack_b32_f16 v128, v33, v142
-  ; GCN-NEXT:    v_exp_f32_e32 v146, v32
+  ; GCN-NEXT:    ds_read_b128 v[128:131], v139 offset:576
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    buffer_inv sc0 sc1
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[58:59], v[96:111]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v34, v161
+  ; GCN-NEXT:    v_exp_f32_e32 v159, v32
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v32, v149
+  ; GCN-NEXT:    v_fma_f32 v155, s4, v41, -v134
+  ; GCN-NEXT:    v_fma_f32 v158, s4, v42, -v134
+  ; GCN-NEXT:    v_fma_f32 v162, s4, v20, -v134
+  ; GCN-NEXT:    v_fma_f32 v12, s4, v12, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[60:61], v[58:59], v[112:127]
+  ; GCN-NEXT:    v_pack_b32_f16 v59, v34, v32
+  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v57
+  ; GCN-NEXT:    v_pack_b32_f16 v58, v33, v152
+  ; GCN-NEXT:    v_exp_f32_e32 v60, v32
   ; GCN-NEXT:    ds_read_b128 v[32:35], v139 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_fma_f32 v142, s4, v43, -v134
-  ; GCN-NEXT:    v_fma_f32 v150, s4, v46, -v134
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[128:129], v[64:79]
+  ; GCN-NEXT:    v_fma_f32 v57, s4, v43, -v134
+  ; GCN-NEXT:    v_perm_b32 v20, v142, v62, s5
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[58:59], v[64:79]
   ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v40
   ; GCN-NEXT:    ds_read_b128 v[40:43], v139 offset:1728
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_exp_f32_e32 v147, v36
-  ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v143
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v37, v154
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[60:61], v[128:129], v[80:95]
-  ; GCN-NEXT:    v_exp_f32_e32 v143, v36
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v60, v155
-  ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v142
-  ; GCN-NEXT:    v_fma_f32 v61, s4, v45, -v134
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[128:129], v[96:111]
-  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v156
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v157
-  ; GCN-NEXT:    v_exp_f32_e32 v156, v32
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v32, v146
+  ; GCN-NEXT:    v_exp_f32_e32 v61, v36
+  ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v155
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v37, v150
+  ; GCN-NEXT:    v_fma_f32 v155, s4, v46, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[128:129], v[58:59], v[80:95]
+  ; GCN-NEXT:    v_exp_f32_e32 v152, v36
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v128, v156
+  ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v57
+  ; GCN-NEXT:    v_fma_f32 v129, s4, v45, -v134
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[32:33], v[58:59], v[96:111]
+  ; GCN-NEXT:    v_mul_f32_e32 v32, 0x3fb8aa3b, v158
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v33, v159
+  ; GCN-NEXT:    v_exp_f32_e32 v158, v32
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v32, v60
   ; GCN-NEXT:    v_pack_b32_f16 v33, v33, v32
-  ; GCN-NEXT:    v_pack_b32_f16 v32, v37, v60
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[128:129], v[112:127]
-  ; GCN-NEXT:    v_exp_f32_e32 v129, v36
+  ; GCN-NEXT:    v_pack_b32_f16 v32, v37, v128
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[40:41], v[58:59], v[112:127]
+  ; GCN-NEXT:    v_exp_f32_e32 v57, v36
   ; GCN-NEXT:    v_mul_f32_e32 v40, 0x3fb8aa3b, v44
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v60, v147
-  ; GCN-NEXT:    v_fma_f32 v128, s4, v47, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v59, v61
+  ; GCN-NEXT:    v_fma_f32 v58, s4, v47, -v134
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79]
-  ; GCN-NEXT:    ds_read_b128 v[36:39], v57
+  ; GCN-NEXT:    ds_read_b128 v[36:39], v140
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_exp_f32_e32 v142, v40
-  ; GCN-NEXT:    v_mul_f32_e32 v40, 0x3fb8aa3b, v61
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v61, v143
-  ; GCN-NEXT:    ds_read_b128 v[44:47], v57 offset:576
+  ; GCN-NEXT:    v_exp_f32_e32 v128, v40
+  ; GCN-NEXT:    v_mul_f32_e32 v40, 0x3fb8aa3b, v129
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v129, v152
+  ; GCN-NEXT:    ds_read_b128 v[44:47], v140 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[62:63], v[32:33], v[80:95]
-  ; GCN-NEXT:    v_fma_f32 v62, s4, v17, -v134
-  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v150
-  ; GCN-NEXT:    v_exp_f32_e32 v63, v40
-  ; GCN-NEXT:    v_pack_b32_f16 v40, v60, v61
-  ; GCN-NEXT:    v_fma_f32 v150, s4, v18, -v134
-  ; GCN-NEXT:    v_fma_f32 v60, s4, v19, -v134
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v61, v142
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[130:131], v[32:33], v[80:95]
+  ; GCN-NEXT:    v_fma_f32 v130, s4, v17, -v134
+  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v155
+  ; GCN-NEXT:    v_exp_f32_e32 v131, v40
+  ; GCN-NEXT:    v_pack_b32_f16 v40, v59, v129
+  ; GCN-NEXT:    v_fma_f32 v155, s4, v18, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v59, v128
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[34:35], v[32:33], v[96:111]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v34, v156
-  ; GCN-NEXT:    v_exp_f32_e32 v158, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v129
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v34, v158
+  ; GCN-NEXT:    v_exp_f32_e32 v160, v17
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v57
   ; GCN-NEXT:    v_pack_b32_f16 v41, v34, v17
-  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v128
+  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v58
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[42:43], v[32:33], v[112:127]
-  ; GCN-NEXT:    v_exp_f32_e32 v128, v17
-  ; GCN-NEXT:    v_perm_b32 v42, v141, v131, s8
-  ; GCN-NEXT:    v_perm_b32 v43, v149, v145, s8
+  ; GCN-NEXT:    v_fma_f32 v58, s4, v19, -v134
+  ; GCN-NEXT:    v_exp_f32_e32 v129, v17
+  ; GCN-NEXT:    v_perm_b32 v42, v143, v63, s8
+  ; GCN-NEXT:    v_perm_b32 v43, v147, v145, s8
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[36:37], v[40:41], v[64:79]
   ; GCN-NEXT:    v_mul_f32_e32 v36, 0x3fb8aa3b, v16
-  ; GCN-NEXT:    ds_read_b128 v[16:19], v57 offset:1152
+  ; GCN-NEXT:    ds_read_b128 v[16:19], v140 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[32:35], v57 offset:1728
+  ; GCN-NEXT:    ds_read_b128 v[32:35], v140 offset:1728
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mul_f32_e32 v37, 0x3fb8aa3b, v62
-  ; GCN-NEXT:    v_exp_f32_e32 v167, v36
-  ; GCN-NEXT:    v_perm_b32 v36, v140, v130, s8
+  ; GCN-NEXT:    v_mul_f32_e32 v37, 0x3fb8aa3b, v130
+  ; GCN-NEXT:    v_exp_f32_e32 v163, v36
+  ; GCN-NEXT:    v_perm_b32 v36, v142, v62, s8
   ; GCN-NEXT:    v_fma_f32 v62, s4, v21, -v134
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[44:45], v[40:41], v[80:95]
   ; GCN-NEXT:    v_exp_f32_e32 v130, v37
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v45, v158
-  ; GCN-NEXT:    v_perm_b32 v21, v148, v144, s5
-  ; GCN-NEXT:    v_perm_b32 v37, v148, v144, s8
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v44, v63
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v45, v160
+  ; GCN-NEXT:    v_perm_b32 v21, v146, v144, s5
+  ; GCN-NEXT:    v_perm_b32 v37, v146, v144, s8
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v44, v131
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    ds_write_b64 v135, v[20:21]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[40:41], v[96:111]
-  ; GCN-NEXT:    v_perm_b32 v16, v141, v131, s5
-  ; GCN-NEXT:    v_fma_f32 v131, s4, v22, -v134
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v128
-  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v150
-  ; GCN-NEXT:    v_exp_f32_e32 v140, v17
-  ; GCN-NEXT:    v_perm_b32 v17, v149, v145, s5
+  ; GCN-NEXT:    v_perm_b32 v16, v143, v63, s5
+  ; GCN-NEXT:    v_fma_f32 v63, s4, v22, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v129
+  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v155
+  ; GCN-NEXT:    v_exp_f32_e32 v142, v17
+  ; GCN-NEXT:    v_perm_b32 v17, v147, v145, s5
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b64 v136, v[36:37]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127]
   ; GCN-NEXT:    v_pack_b32_f16 v33, v45, v22
-  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v60
+  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v58
   ; GCN-NEXT:    v_exp_f32_e32 v144, v22
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -828,22 +828,22 @@
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_add_u32_e32 v20, v132, v20
   ; GCN-NEXT:    v_add_u32_e32 v21, v132, v21
-  ; GCN-NEXT:    v_pack_b32_f16 v32, v61, v44
+  ; GCN-NEXT:    v_pack_b32_f16 v32, v59, v44
   ; GCN-NEXT:    buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx2 v[60:61], v21, s[0:3], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[58:59], v21, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v166
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v162
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[38:39], v[32:33], v[64:79]
   ; GCN-NEXT:    v_exp_f32_e32 v132, v16
   ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v62
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v167
-  ; GCN-NEXT:    v_fma_f32 v141, s4, v23, -v134
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v163
+  ; GCN-NEXT:    v_fma_f32 v143, s4, v23, -v134
   ; GCN-NEXT:    ds_read_b128 v[20:23], v139
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
@@ -852,20 +852,20 @@
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[46:47], v[32:33], v[80:95]
   ; GCN-NEXT:    v_exp_f32_e32 v62, v16
-  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v131
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v63
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v46, v130
   ; GCN-NEXT:    v_fma_f32 v47, s4, v25, -v134
-  ; GCN-NEXT:    v_fma_f32 v131, s4, v26, -v134
-  ; GCN-NEXT:    v_fma_f32 v149, s4, v4, -v134
+  ; GCN-NEXT:    v_fma_f32 v63, s4, v26, -v134
+  ; GCN-NEXT:    v_fma_f32 v147, s4, v4, -v134
   ; GCN-NEXT:    ; implicit-def: $sgpr0
   ; GCN-NEXT:    v_perm_b32 v4, v42, v40, s5
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[32:33], v[96:111]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v140
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v142
   ; GCN-NEXT:    v_exp_f32_e32 v145, v16
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v144
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[34:35], v[32:33], v[112:127]
   ; GCN-NEXT:    v_pack_b32_f16 v33, v18, v16
-  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v141
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v143
   ; GCN-NEXT:    v_pack_b32_f16 v32, v17, v46
   ; GCN-NEXT:    v_exp_f32_e32 v35, v16
   ; GCN-NEXT:    ds_read_b128 v[16:19], v139 offset:1152
@@ -887,11 +887,11 @@
   ; GCN-NEXT:    v_fma_f32 v37, s4, v29, -v134
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v34, v46
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[32:33], v[96:111]
-  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v131
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v63
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v145
-  ; GCN-NEXT:    v_exp_f32_e32 v141, v16
+  ; GCN-NEXT:    v_exp_f32_e32 v143, v16
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v35
-  ; GCN-NEXT:    v_fma_f32 v131, s4, v30, -v134
+  ; GCN-NEXT:    v_fma_f32 v63, s4, v30, -v134
   ; GCN-NEXT:    v_pack_b32_f16 v17, v17, v16
   ; GCN-NEXT:    v_pack_b32_f16 v16, v21, v36
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[112:127], v[24:25], v[32:33], v[112:127]
@@ -899,25 +899,25 @@
   ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v28
   ; GCN-NEXT:    v_fma_f32 v32, s4, v31, -v134
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79]
-  ; GCN-NEXT:    ds_read_b128 v[20:23], v57
+  ; GCN-NEXT:    ds_read_b128 v[20:23], v140
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_exp_f32_e32 v36, v24
   ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v37
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v37, v47
-  ; GCN-NEXT:    ds_read_b128 v[28:31], v57 offset:576
+  ; GCN-NEXT:    ds_read_b128 v[28:31], v140 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[38:39], v[16:17], v[80:95]
   ; GCN-NEXT:    v_fma_f32 v38, s4, v1, -v134
-  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v131
+  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v63
   ; GCN-NEXT:    v_exp_f32_e32 v39, v24
   ; GCN-NEXT:    v_pack_b32_f16 v24, v34, v37
-  ; GCN-NEXT:    v_fma_f32 v131, s4, v2, -v134
+  ; GCN-NEXT:    v_fma_f32 v63, s4, v2, -v134
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v37, v36
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[18:19], v[16:17], v[96:111]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v141
-  ; GCN-NEXT:    v_exp_f32_e32 v148, v1
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v143
+  ; GCN-NEXT:    v_exp_f32_e32 v146, v1
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v33
   ; GCN-NEXT:    v_pack_b32_f16 v25, v18, v1
   ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v32
@@ -925,25 +925,25 @@
   ; GCN-NEXT:    v_fma_f32 v32, s4, v3, -v134
   ; GCN-NEXT:    v_exp_f32_e32 v34, v1
   ; GCN-NEXT:    v_perm_b32 v26, v43, v41, s8
-  ; GCN-NEXT:    v_perm_b32 v27, v61, v45, s8
+  ; GCN-NEXT:    v_perm_b32 v27, v59, v45, s8
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[20:21], v[24:25], v[64:79]
   ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v0
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v57 offset:1152
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v140 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[16:19], v57 offset:1728
+  ; GCN-NEXT:    ds_read_b128 v[16:19], v140 offset:1728
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mul_f32_e32 v21, 0x3fb8aa3b, v38
-  ; GCN-NEXT:    v_exp_f32_e32 v150, v20
+  ; GCN-NEXT:    v_exp_f32_e32 v155, v20
   ; GCN-NEXT:    v_perm_b32 v20, v42, v40, s8
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v40, v148
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v40, v146
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[28:29], v[24:25], v[80:95]
   ; GCN-NEXT:    v_exp_f32_e32 v38, v21
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v39
   ; GCN-NEXT:    v_fma_f32 v29, s4, v5, -v134
-  ; GCN-NEXT:    v_perm_b32 v5, v60, v44, s5
-  ; GCN-NEXT:    v_perm_b32 v21, v60, v44, s8
+  ; GCN-NEXT:    v_perm_b32 v5, v58, v44, s5
+  ; GCN-NEXT:    v_perm_b32 v21, v58, v44, s8
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
@@ -953,9 +953,9 @@
   ; GCN-NEXT:    v_perm_b32 v0, v43, v41, s5
   ; GCN-NEXT:    v_fma_f32 v41, s4, v6, -v134
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v34
-  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v131
+  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v63
   ; GCN-NEXT:    v_exp_f32_e32 v42, v1
-  ; GCN-NEXT:    v_perm_b32 v1, v61, v45, s5
+  ; GCN-NEXT:    v_perm_b32 v1, v59, v45, s5
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    ds_write_b64 v136, v[20:21]
@@ -979,10 +979,10 @@
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79]
-  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v149
+  ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v147
   ; GCN-NEXT:    v_exp_f32_e32 v26, v0
   ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v29
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v150
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v155
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v27, v38
   ; GCN-NEXT:    ds_read_b128 v[20:23], v139 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1034,10 +1034,10 @@
   ; GCN-NEXT:    v_exp_f32_e32 v21, v9
   ; GCN-NEXT:    v_fma_f32 v8, s4, v15, -v134
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79]
-  ; GCN-NEXT:    ds_read_b128 v[4:7], v57
+  ; GCN-NEXT:    ds_read_b128 v[4:7], v140
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[12:15], v57 offset:576
+  ; GCN-NEXT:    ds_read_b128 v[12:15], v140 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v24
@@ -1063,33 +1063,33 @@
   ; GCN-NEXT:    v_add_f32_e32 v3, v54, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v55, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v56, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v58, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v163, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v164, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v59, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v160, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v162, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v151, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v153, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v141, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v165, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v161, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v159, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v152, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v167, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v153, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v168, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v170, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v151, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v148, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v154, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v155, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v157, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v146, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v147, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v143, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v161, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v149, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v150, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v156, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v129, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v142, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v63, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v159, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v60, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v61, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v152, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v158, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v57, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v128, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v167, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v131, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v160, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v129, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v163, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v130, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v140, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v142, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v144, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v132, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v62, v3
@@ -1097,14 +1097,14 @@
   ; GCN-NEXT:    v_add_f32_e32 v3, v35, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v46, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v47, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v141, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v143, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v33, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v36, v3
   ; GCN-NEXT:    v_add_f32_e32 v3, v39, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v148, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v146, v3
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95]
   ; GCN-NEXT:    v_add_f32_e32 v3, v34, v3
-  ; GCN-NEXT:    v_add_f32_e32 v3, v150, v3
+  ; GCN-NEXT:    v_add_f32_e32 v3, v155, v3
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v10
   ; GCN-NEXT:    v_cvt_f16_f32_e32 v11, v2
   ; GCN-NEXT:    v_add_f32_e32 v3, v38, v3
@@ -1129,17 +1129,18 @@
   ; GCN-NEXT:    v_add_f32_e32 v4, v10, v0
   ; GCN-NEXT:    ds_bpermute_b32 v5, v133, v4
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v57 offset:1152
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v140 offset:1152
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_add_f32_e32 v2, v4, v5
   ; GCN-NEXT:    ds_bpermute_b32 v3, v133, v2
+  ; GCN-NEXT:    ; implicit-def: $vgpr4
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[8:9], v[96:111]
+  ; GCN-NEXT:    v_mov_b32_e32 v0, v4
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_cndmask_b32_e64 v0, v3, v2, s[6:7]
-  ; GCN-NEXT:    ; implicit-def: $vgpr4
-  ; GCN-NEXT:    v_fmac_f32_e32 v0, v4, v48
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v57 offset:1728
+  ; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, v2, s[6:7]
+  ; GCN-NEXT:    v_fmac_f32_e32 v1, v0, v48
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v140 offset:1728
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
index 0887fdf0844b0..be97a1e82fcf2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
@@ -10,25 +10,24 @@
   ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
   ; GCN-NEXT:    v_readfirstlane_b32 s20, v2
   ; GCN-NEXT:    ; implicit-def: $sgpr4
-  ; GCN-NEXT:    ; implicit-def: $vgpr3
+  ; GCN-NEXT:    ; implicit-def: $vgpr64
   ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
   ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3
-  ; GCN-NEXT:    ; implicit-def: $vgpr50
+  ; GCN-NEXT:    ; implicit-def: $vgpr76
   ; GCN-NEXT:    ; implicit-def: $sgpr16_sgpr17_sgpr18_sgpr19
   ; GCN-NEXT:    ; implicit-def: $vgpr49
   ; GCN-NEXT:    ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43
-  ; GCN-NEXT:    ; implicit-def: $vgpr51
-  ; GCN-NEXT:    ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65
-  ; GCN-NEXT:    ; implicit-def: $vgpr76
+  ; GCN-NEXT:    ; implicit-def: $vgpr50
   ; GCN-NEXT:    ; implicit-def: $vgpr77
   ; GCN-NEXT:    ; implicit-def: $vgpr78
   ; GCN-NEXT:    ; implicit-def: $vgpr79
   ; GCN-NEXT:    ; implicit-def: $vgpr80
-  ; GCN-NEXT:    ; implicit-def: $vgpr91
+  ; GCN-NEXT:    ; implicit-def: $vgpr81
+  ; GCN-NEXT:    ; implicit-def: $vgpr103
   ; GCN-NEXT:    ; kill: killed $sgpr16_sgpr17_sgpr18_sgpr19
   ; GCN-NEXT:    ; iglp_opt mask(0x00000002)
   ; GCN-NEXT:    s_nop 1
-  ; GCN-NEXT:    v_lshl_add_u32 v2, s20, 4, v3
+  ; GCN-NEXT:    v_lshl_add_u32 v2, s20, 4, v64
   ; GCN-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s4, v2, v[0:1]
   ; GCN-NEXT:    buffer_load_dwordx4 v[0:3], v4, s[0:3], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -36,8 +35,9 @@
   ; GCN-NEXT:    s_lshl_b32 s4, s20, 7
   ; GCN-NEXT:    ; implicit-def: $vgpr5
   ; GCN-NEXT:    v_add_lshl_u32 v48, v5, s4, 1
-  ; GCN-NEXT:    v_add_u32_e32 v76, s20, v76
-  ; GCN-NEXT:    v_and_b32_e32 v76, 0x1fffffff, v76
+  ; GCN-NEXT:    ; implicit-def: $vgpr62_vgpr63_vgpr64_vgpr65
+  ; GCN-NEXT:    v_add_u32_e32 v77, s20, v77
+  ; GCN-NEXT:    v_and_b32_e32 v77, 0x1fffffff, v77
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    ds_write_b128 v48, v[0:3]
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
@@ -48,8 +48,8 @@
   ; GCN-NEXT:    ; implicit-def: $vgpr1
   ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
   ; GCN-NEXT:    ; implicit-def: $sgpr6
-  ; GCN-NEXT:    v_add_u32_e32 v0, v0, v50
-  ; GCN-NEXT:    v_add_u32_e32 v1, v1, v50
+  ; GCN-NEXT:    v_add_u32_e32 v0, v0, v76
+  ; GCN-NEXT:    v_add_u32_e32 v1, v1, v76
   ; GCN-NEXT:    buffer_load_dwordx2 v[72:73], v0, s[16:19], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
@@ -68,22 +68,22 @@
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[36:37], v[40:41], 0
   ; GCN-NEXT:    ; kill: killed $vgpr1
   ; GCN-NEXT:    ; kill: killed $vgpr0
-  ; GCN-NEXT:    v_mul_lo_u32 v76, v76, s6
-  ; GCN-NEXT:    v_add_lshl_u32 v76, v77, v76, 1
-  ; GCN-NEXT:    v_lshl_add_u32 v77, v78, 1, v76
-  ; GCN-NEXT:    ; implicit-def: $sgpr5
+  ; GCN-NEXT:    v_mul_lo_u32 v77, v77, s6
+  ; GCN-NEXT:    v_add_lshl_u32 v77, v78, v77, 1
   ; GCN-NEXT:    v_lshl_add_u32 v78, v79, 1, v77
+  ; GCN-NEXT:    ; implicit-def: $sgpr5
+  ; GCN-NEXT:    v_lshl_add_u32 v79, v80, 1, v78
   ; GCN-NEXT:    ; implicit-def: $sgpr2
   ; GCN-NEXT:    ; implicit-def: $sgpr3
-  ; GCN-NEXT:    v_lshl_add_u32 v79, v80, 1, v78
+  ; GCN-NEXT:    v_lshl_add_u32 v80, v81, 1, v79
   ; GCN-NEXT:    ; implicit-def: $sgpr0_sgpr1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[44:45], v[40:41], 0
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[38:39], v[42:43], v[16:31]
-  ; GCN-NEXT:    ds_read_b128 v[36:39], v51
+  ; GCN-NEXT:    ds_read_b128 v[36:39], v50
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[46:47], v[42:43], v[0:15]
-  ; GCN-NEXT:    ds_read_b128 v[44:47], v51 offset:512
+  ; GCN-NEXT:    ds_read_b128 v[44:47], v50 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ; implicit-def: $vgpr40_vgpr41_vgpr42_vgpr43
@@ -107,20 +107,20 @@
   ; GCN-NEXT:    ds_read_b128 v[40:43], v49 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[68:71], v51
+  ; GCN-NEXT:    ds_read_b128 v[68:71], v50
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[32:33], v[36:37], v[16:31]
   ; GCN-NEXT:    ; implicit-def: $vgpr32
   ; GCN-NEXT:    ; implicit-def: $vgpr33
-  ; GCN-NEXT:    v_add_u32_e32 v82, v32, v50
-  ; GCN-NEXT:    v_add_u32_e32 v83, v33, v50
-  ; GCN-NEXT:    ; kill: killed $vgpr82
+  ; GCN-NEXT:    v_add_u32_e32 v83, v32, v76
+  ; GCN-NEXT:    v_add_u32_e32 v76, v33, v76
   ; GCN-NEXT:    ; kill: killed $vgpr83
+  ; GCN-NEXT:    ; kill: killed $vgpr76
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[34:35], v[38:39], v[16:31]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[40:41], v[36:37], v[0:15]
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[68:69], v[62:63], v[16:31]
-  ; GCN-NEXT:    ds_read_b128 v[66:69], v51 offset:512
+  ; GCN-NEXT:    ds_read_b128 v[66:69], v50 offset:512
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
@@ -131,20 +131,20 @@
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[66:67], v[62:63], v[0:15]
   ; GCN-NEXT:    ; implicit-def: $vgpr66
   ; GCN-NEXT:    ; implicit-def: $vgpr67
-  ; GCN-NEXT:    v_max_f32_e32 v81, v67, v67
+  ; GCN-NEXT:    v_max_f32_e32 v82, v67, v67
   ; GCN-NEXT:    ; implicit-def: $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[16:31], v[70:71], v[64:65], v[16:31]
   ; GCN-NEXT:    v_perm_b32 v70, v74, v72, s2
   ; GCN-NEXT:    v_perm_b32 v71, v74, v72, s3
   ; GCN-NEXT:    v_perm_b32 v72, v75, v73, s2
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
-  ; GCN-NEXT:    ds_write_b32 v76, v70
+  ; GCN-NEXT:    ds_write_b32 v77, v70
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v77, v71
+  ; GCN-NEXT:    ds_write_b32 v78, v71
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v78, v72
+  ; GCN-NEXT:    ds_write_b32 v79, v72
   ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v20
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[0:15], v[68:69], v[64:65], v[0:15]
   ; GCN-NEXT:    v_mul_f32_e32 v64, s4, v16
@@ -152,11 +152,11 @@
   ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v18
   ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v19
   ; GCN-NEXT:    v_max3_f32 v64, v64, s5, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v21
+  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v21
   ; GCN-NEXT:    v_max3_f32 v64, v64, v68, v69
   ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v22
   ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v23
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v80
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v81
   ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v24
   ; GCN-NEXT:    v_mul_f32_e32 v87, s4, v25
   ; GCN-NEXT:    v_max3_f32 v64, v64, v84, v85
@@ -166,12 +166,12 @@
   ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v28
   ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v29
   ; GCN-NEXT:    v_max3_f32 v64, v64, v65, v68
-  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v30
+  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v30
   ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v31
   ; GCN-NEXT:    v_max3_f32 v64, v64, v69, v74
   ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v0
   ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v1
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v80, v84
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v81, v84
   ; GCN-NEXT:    v_mul_f32_e32 v87, s4, v2
   ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v3
   ; GCN-NEXT:    v_max3_f32 v64, v64, v85, v86
@@ -179,315 +179,315 @@
   ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v5
   ; GCN-NEXT:    v_max3_f32 v64, v64, v87, v65
   ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v6
-  ; GCN-NEXT:    v_mul_f32_e32 v80, s4, v7
+  ; GCN-NEXT:    v_mul_f32_e32 v81, s4, v7
   ; GCN-NEXT:    v_max3_f32 v64, v64, v68, v69
   ; GCN-NEXT:    v_mul_f32_e32 v84, s4, v8
   ; GCN-NEXT:    v_mul_f32_e32 v85, s4, v9
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v80
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v74, v81
   ; GCN-NEXT:    v_mul_f32_e32 v86, s4, v10
   ; GCN-NEXT:    v_mul_f32_e32 v65, s4, v11
   ; GCN-NEXT:    v_max3_f32 v64, v64, v84, v85
   ; GCN-NEXT:    v_mul_f32_e32 v87, s4, v12
   ; GCN-NEXT:    v_mul_f32_e32 v68, s4, v13
   ; GCN-NEXT:    v_max3_f32 v64, v64, v86, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v14
-  ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v15
   ; GCN-NEXT:    v_max3_f32 v64, v64, v87, v68
-  ; GCN-NEXT:    v_max3_f32 v64, v64, v69, v74
-  ; GCN-NEXT:    ds_bpermute_b32 v65, v66, v64
   ; GCN-NEXT:    v_perm_b32 v68, v75, v73, s3
+  ; GCN-NEXT:    v_mul_f32_e32 v69, s4, v14
+  ; GCN-NEXT:    v_mul_f32_e32 v74, s4, v15
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v79, v68
-  ; GCN-NEXT:    ; implicit-def: $vgpr84
-  ; GCN-NEXT:    v_max_f32_e32 v65, v65, v65
-  ; GCN-NEXT:    v_max_f32_e32 v70, v64, v65
+  ; GCN-NEXT:    ds_write_b32 v80, v68
+  ; GCN-NEXT:    v_max3_f32 v64, v64, v69, v74
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    buffer_load_dwordx2 v[64:65], v82, s[16:19], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    buffer_load_dwordx2 v[68:69], v83, s[16:19], 0 offen sc0 sc1
+  ; GCN-NEXT:    buffer_load_dwordx2 v[70:71], v76, s[16:19], 0 offen sc0 sc1
   ; GCN-NEXT:    s_waitcnt vmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_bpermute_b32 v71, v66, v70
+  ; GCN-NEXT:    ds_bpermute_b32 v65, v66, v64
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
+  ; GCN-NEXT:    ; implicit-def: $vgpr87
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_cndmask_b32_e64 v70, v71, v70, s[0:1]
-  ; GCN-NEXT:    v_max_f32_e32 v70, v70, v70
-  ; GCN-NEXT:    v_max_f32_e32 v72, v81, v70
-  ; GCN-NEXT:    v_fma_f32 v16, s4, v16, -v72
-  ; GCN-NEXT:    v_fma_f32 v18, s4, v18, -v72
-  ; GCN-NEXT:    v_fma_f32 v19, s4, v19, -v72
+  ; GCN-NEXT:    v_max_f32_e32 v65, v65, v65
+  ; GCN-NEXT:    v_max_f32_e32 v64, v64, v65
+  ; GCN-NEXT:    ds_bpermute_b32 v65, v66, v64
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    v_cndmask_b32_e64 v64, v65, v64, s[0:1]
+  ; GCN-NEXT:    v_max_f32_e32 v64, v64, v64
+  ; GCN-NEXT:    v_max_f32_e32 v65, v82, v64
+  ; GCN-NEXT:    v_fma_f32 v16, s4, v16, -v65
+  ; GCN-NEXT:    v_fma_f32 v17, s4, v17, -v65
+  ; GCN-NEXT:    v_fma_f32 v18, s4, v18, -v65
+  ; GCN-NEXT:    v_fma_f32 v19, s4, v19, -v65
   ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v17
   ; GCN-NEXT:    v_mul_f32_e32 v18, 0x3fb8aa3b, v18
   ; GCN-NEXT:    v_mul_f32_e32 v19, 0x3fb8aa3b, v19
-  ; GCN-NEXT:    v_fma_f32 v17, s4, v17, -v72
-  ; GCN-NEXT:    v_fma_f32 v20, s4, v20, -v72
-  ; GCN-NEXT:    v_fma_f32 v21, s4, v21, -v72
-  ; GCN-NEXT:    v_fma_f32 v22, s4, v22, -v72
-  ; GCN-NEXT:    v_fma_f32 v23, s4, v23, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v73, v16
-  ; GCN-NEXT:    v_exp_f32_e32 v74, v18
-  ; GCN-NEXT:    v_exp_f32_e32 v75, v19
+  ; GCN-NEXT:    v_fma_f32 v20, s4, v20, -v65
+  ; GCN-NEXT:    v_fma_f32 v21, s4, v21, -v65
+  ; GCN-NEXT:    v_fma_f32 v22, s4, v22, -v65
+  ; GCN-NEXT:    v_fma_f32 v23, s4, v23, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v72, v16
+  ; GCN-NEXT:    v_exp_f32_e32 v73, v17
+  ; GCN-NEXT:    v_exp_f32_e32 v81, v18
+  ; GCN-NEXT:    v_exp_f32_e32 v82, v19
   ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v20
   ; GCN-NEXT:    v_mul_f32_e32 v21, 0x3fb8aa3b, v21
   ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v22
-  ; GCN-NEXT:    v_exp_f32_e32 v80, v20
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v73
-  ; GCN-NEXT:    v_fma_f32 v18, s4, v24, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v81, v21
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v74
-  ; GCN-NEXT:    v_fma_f32 v20, s4, v25, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v82, v22
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v75
-  ; GCN-NEXT:    v_mul_f32_e32 v17, 0x3fb8aa3b, v17
-  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
-  ; GCN-NEXT:    v_fma_f32 v26, s4, v26, -v72
-  ; GCN-NEXT:    v_pack_b32_f16 v71, v21, v22
-  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v18
-  ; GCN-NEXT:    v_sub_f32_e32 v24, v67, v72
-  ; GCN-NEXT:    v_exp_f32_e32 v83, v23
-  ; GCN-NEXT:    v_fma_f32 v67, s4, v27, -v72
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v16, v72
+  ; GCN-NEXT:    v_fma_f32 v17, s4, v24, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v83, v20
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v73
+  ; GCN-NEXT:    v_fma_f32 v19, s4, v25, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v84, v21
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v81
+  ; GCN-NEXT:    v_fma_f32 v26, s4, v26, -v65
   ; GCN-NEXT:    v_exp_f32_e32 v85, v22
-  ; GCN-NEXT:    v_exp_f32_e32 v17, v17
-  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v24
-  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v20
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v17
-  ; GCN-NEXT:    v_fma_f32 v87, s4, v29, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v88, v23
-  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v72
-  ; GCN-NEXT:    v_pack_b32_f16 v70, v16, v19
-  ; GCN-NEXT:    ds_read_b128 v[18:21], v84
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v82
+  ; GCN-NEXT:    v_pack_b32_f16 v24, v16, v18
+  ; GCN-NEXT:    v_sub_f32_e32 v22, v67, v65
+  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
+  ; GCN-NEXT:    v_pack_b32_f16 v25, v20, v21
+  ; GCN-NEXT:    v_mul_f32_e32 v20, 0x3fb8aa3b, v17
+  ; GCN-NEXT:    v_mul_f32_e32 v21, 0x3fb8aa3b, v19
+  ; GCN-NEXT:    ds_read_b128 v[16:19], v87
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_exp_f32_e32 v16, v24
-  ; GCN-NEXT:    ds_read_b128 v[22:25], v84 offset:576
+  ; GCN-NEXT:    v_mul_f32_e32 v22, 0x3fb8aa3b, v22
+  ; GCN-NEXT:    v_fma_f32 v67, s4, v27, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v86, v23
+  ; GCN-NEXT:    v_exp_f32_e32 v64, v22
+  ; GCN-NEXT:    v_mul_f32_e32 v67, 0x3fb8aa3b, v67
+  ; GCN-NEXT:    v_pk_mul_f32 v[48:49], v[48:49], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[50:51], v[50:51], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[52:53], v[52:53], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[54:55], v[54:55], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[56:57], v[56:57], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[58:59], v[58:59], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[60:61], v[60:61], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[62:63], v[62:63], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[32:33], v[32:33], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[34:35], v[34:35], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[16:17], v[24:25], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v16, 0, v72
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v76, v83
+  ; GCN-NEXT:    v_fma_f32 v88, s4, v28, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v89, v20
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v90, v84
+  ; GCN-NEXT:    v_fma_f32 v91, s4, v29, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v92, v21
+  ; GCN-NEXT:    ds_read_b128 v[20:23], v87 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_pk_mul_f32 v[48:49], v[48:49], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[50:51], v[50:51], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[52:53], v[52:53], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[54:55], v[54:55], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[56:57], v[56:57], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[58:59], v[58:59], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[60:61], v[60:61], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[62:63], v[62:63], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[32:33], v[32:33], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[34:35], v[34:35], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[36:37], v[36:37], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[38:39], v[38:39], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[40:41], v[40:41], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[42:43], v[42:43], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[44:45], v[44:45], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_pk_mul_f32 v[46:47], v[46:47], v[16:17] op_sel_hi:[1,0]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v18, 0, v73
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v89, v83
-  ; GCN-NEXT:    v_fma_f32 v73, s4, v28, -v72
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v80
-  ; GCN-NEXT:    v_fma_f32 v1, s4, v1, -v72
-  ; GCN-NEXT:    v_perm_b32 v90, v69, v65, s2
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v17, v18
-  ; GCN-NEXT:    v_mul_f32_e32 v18, 0x3fb8aa3b, v26
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v86, v81
-  ; GCN-NEXT:    v_fma_f32 v23, s4, v30, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v30, v18
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v22, v82
-  ; GCN-NEXT:    v_fma_f32 v18, s4, v31, -v72
-  ; GCN-NEXT:    v_perm_b32 v31, v68, v64, s2
-  ; GCN-NEXT:    v_perm_b32 v64, v68, v64, s3
-  ; GCN-NEXT:    v_perm_b32 v65, v69, v65, s3
-  ; GCN-NEXT:    ds_read_b128 v[26:29], v91
+  ; GCN-NEXT:    v_pk_mul_f32 v[36:37], v[36:37], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[38:39], v[38:39], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[40:41], v[40:41], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[42:43], v[42:43], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[44:45], v[44:45], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_pk_mul_f32 v[46:47], v[46:47], v[64:65] op_sel_hi:[1,0]
+  ; GCN-NEXT:    v_perm_b32 v99, v70, v68, s2
+  ; GCN-NEXT:    v_perm_b32 v100, v70, v68, s3
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[24:25], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v93, v73, v16
+  ; GCN-NEXT:    v_mul_f32_e32 v16, 0x3fb8aa3b, v26
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v94, v85
+  ; GCN-NEXT:    v_fma_f32 v95, s4, v30, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v96, v16
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v97, v86
+  ; GCN-NEXT:    v_fma_f32 v98, s4, v31, -v65
+  ; GCN-NEXT:    v_perm_b32 v101, v71, v69, s2
+  ; GCN-NEXT:    v_perm_b32 v102, v71, v69, s3
+  ; GCN-NEXT:    ds_read_b128 v[68:71], v103
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    ds_read_b128 v[68:71], v91 offset:576
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v103 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
-  ; GCN-NEXT:    ds_write_b32 v76, v31
-  ; GCN-NEXT:    v_mul_f32_e32 v31, 0x3fb8aa3b, v67
-  ; GCN-NEXT:    v_exp_f32_e32 v31, v31
-  ; GCN-NEXT:    v_mul_f32_e32 v67, 0x3fb8aa3b, v18
-  ; GCN-NEXT:    v_pack_b32_f16 v18, v19, v86
-  ; GCN-NEXT:    v_pack_b32_f16 v19, v22, v89
+  ; GCN-NEXT:    ds_write_b32 v77, v99
+  ; GCN-NEXT:    v_exp_f32_e32 v67, v67
+  ; GCN-NEXT:    v_pack_b32_f16 v76, v76, v90
+  ; GCN-NEXT:    v_pack_b32_f16 v77, v94, v97
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v77, v64
+  ; GCN-NEXT:    ds_write_b32 v78, v100
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v78, v90
+  ; GCN-NEXT:    ds_write_b32 v79, v101
+  ; GCN-NEXT:    v_mul_f32_e32 v78, 0x3fb8aa3b, v88
+  ; GCN-NEXT:    v_mul_f32_e32 v79, 0x3fb8aa3b, v91
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[76:77], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v81, v81, v93
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v90, v89
+  ; GCN-NEXT:    v_fma_f32 v0, s4, v0, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v91, v78
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v78, v92
+  ; GCN-NEXT:    v_fma_f32 v1, s4, v1, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v93, v79
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[76:77], v[32:47]
   ; GCN-NEXT:    buffer_wbl2 sc0 sc1
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_write_b32 v79, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v64, 0x3fb8aa3b, v73
-  ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v87
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v74, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v20, v85
-  ; GCN-NEXT:    v_fma_f32 v2, s4, v2, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v22, v64
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v21, v88
-  ; GCN-NEXT:    v_exp_f32_e32 v64, v65
-  ; GCN-NEXT:    v_mul_f32_e32 v23, 0x3fb8aa3b, v23
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v75, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v30
-  ; GCN-NEXT:    v_fma_f32 v24, s4, v3, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v23, v23
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v31
+  ; GCN-NEXT:    ds_write_b32 v80, v102
+  ; GCN-NEXT:    v_mul_f32_e32 v80, 0x3fb8aa3b, v95
+  ; GCN-NEXT:    v_add_f32_e32 v76, v82, v81
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v77, v96
+  ; GCN-NEXT:    v_fma_f32 v2, s4, v2, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v80, v80
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v79, v67
+  ; GCN-NEXT:    v_mul_f32_e32 v88, 0x3fb8aa3b, v98
+  ; GCN-NEXT:    v_fma_f32 v81, s4, v3, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v82, v88
   ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v0
-  ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v1
-  ; GCN-NEXT:    v_pack_b32_f16 v0, v20, v21
-  ; GCN-NEXT:    v_pack_b32_f16 v1, v18, v19
-  ; GCN-NEXT:    v_fma_f32 v6, s4, v6, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v25, v67
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[26:27], v[0:1], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v80, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v18, v22
-  ; GCN-NEXT:    v_fma_f32 v26, s4, v4, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v27, v3
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v64
-  ; GCN-NEXT:    v_fma_f32 v67, s4, v5, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v65, v65
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[0:1], v[32:47]
+  ; GCN-NEXT:    v_mul_f32_e32 v88, 0x3fb8aa3b, v1
+  ; GCN-NEXT:    v_pack_b32_f16 v0, v90, v78
+  ; GCN-NEXT:    v_pack_b32_f16 v1, v77, v79
   ; GCN-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v2
-  ; GCN-NEXT:    v_add_f32_e32 v17, v81, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v23
-  ; GCN-NEXT:    v_fma_f32 v7, s4, v7, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v68, v2
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v19, v25
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_mul_f32_e32 v24, 0x3fb8aa3b, v24
+  ; GCN-NEXT:    ; implicit-def: $sgpr2
+  ; GCN-NEXT:    s_nop 0
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[68:69], v[0:1], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v68, v83, v76
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v91
+  ; GCN-NEXT:    v_fma_f32 v83, s4, v4, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v90, v3
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v93
+  ; GCN-NEXT:    v_fma_f32 v94, s4, v5, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v88, v88
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[0:1], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v68, v84, v68
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v80
+  ; GCN-NEXT:    v_fma_f32 v6, s4, v6, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v72, v2
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v73, v82
+  ; GCN-NEXT:    v_pack_b32_f16 v4, v69, v4
+  ; GCN-NEXT:    v_mul_f32_e32 v69, 0x3fb8aa3b, v81
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v84
+  ; GCN-NEXT:    ds_read_b128 v[0:3], v87
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_pack_b32_f16 v4, v18, v4
-  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v19
-  ; GCN-NEXT:    v_exp_f32_e32 v24, v24
-  ; GCN-NEXT:    ds_read_b128 v[18:21], v84 offset:576
+  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v73
+  ; GCN-NEXT:    v_fma_f32 v7, s4, v7, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v73, v69
+  ; GCN-NEXT:    ds_read_b128 v[76:79], v87 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mul_f32_e32 v26, 0x3fb8aa3b, v26
-  ; GCN-NEXT:    v_mul_f32_e32 v67, 0x3fb8aa3b, v67
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[4:5], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v82, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v27
-  ; GCN-NEXT:    v_exp_f32_e32 v26, v26
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v29, v65
-  ; GCN-NEXT:    v_fma_f32 v10, s4, v10, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v67, v67
+  ; GCN-NEXT:    v_mul_f32_e32 v69, 0x3fb8aa3b, v83
+  ; GCN-NEXT:    v_mul_f32_e32 v81, 0x3fb8aa3b, v94
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[70:71], v[4:5], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v68, v85, v68
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v70, v90
+  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v71, v69
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v88
+  ; GCN-NEXT:    v_fma_f32 v9, s4, v9, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v81, v81
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[4:5], v[32:47]
   ; GCN-NEXT:    v_mul_f32_e32 v6, 0x3fb8aa3b, v6
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[4:5], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v17, v83, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v68
-  ; GCN-NEXT:    v_exp_f32_e32 v6, v6
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v24
+  ; GCN-NEXT:    v_add_f32_e32 v68, v86, v68
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v72
+  ; GCN-NEXT:    v_fma_f32 v10, s4, v10, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v74, v6
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v6, v73
   ; GCN-NEXT:    v_mul_f32_e32 v7, 0x3fb8aa3b, v7
-  ; GCN-NEXT:    v_exp_f32_e32 v7, v7
-  ; GCN-NEXT:    v_pack_b32_f16 v4, v28, v29
-  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v69
-  ; GCN-NEXT:    ; implicit-def: $sgpr2
-  ; GCN-NEXT:    s_nop 1
+  ; GCN-NEXT:    v_fma_f32 v75, s4, v11, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v83, v7
+  ; GCN-NEXT:    v_pack_b32_f16 v4, v70, v69
+  ; GCN-NEXT:    v_pack_b32_f16 v5, v5, v6
+  ; GCN-NEXT:    v_mul_f32_e32 v7, 0x3fb8aa3b, v8
+  ; GCN-NEXT:    v_mul_f32_e32 v8, 0x3fb8aa3b, v9
   ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[4:5], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v0, v85, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v17, v26
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v28, v67
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[18:19], v[4:5], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v4, v88, v0
+  ; GCN-NEXT:    v_add_f32_e32 v0, v89, v68
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v68, v71
+  ; GCN-NEXT:    v_fma_f32 v70, s4, v12, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v84, v7
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v85, v81
+  ; GCN-NEXT:    v_fma_f32 v86, s4, v13, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v87, v8
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[76:77], v[4:5], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v76, v92, v0
   ; GCN-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v10
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v1, v6
-  ; GCN-NEXT:    v_exp_f32_e32 v10, v0
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v7
-  ; GCN-NEXT:    v_pack_b32_f16 v1, v1, v0
-  ; GCN-NEXT:    v_pack_b32_f16 v0, v17, v28
-  ; GCN-NEXT:    s_nop 1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v2, v30, v4
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[0:1], v[32:47]
-  ; GCN-NEXT:    v_add_f32_e32 v0, v31, v2
-  ; GCN-NEXT:    v_add_f32_e32 v0, v22, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v64, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v23, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v25, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v27, v0
-  ; GCN-NEXT:    v_fma_f32 v8, s4, v8, -v72
-  ; GCN-NEXT:    v_add_f32_e32 v0, v65, v0
-  ; GCN-NEXT:    v_fma_f32 v9, s4, v9, -v72
-  ; GCN-NEXT:    v_mul_f32_e32 v8, 0x3fb8aa3b, v8
-  ; GCN-NEXT:    v_add_f32_e32 v0, v68, v0
-  ; GCN-NEXT:    v_fma_f32 v11, s4, v11, -v72
-  ; GCN-NEXT:    v_mul_f32_e32 v9, 0x3fb8aa3b, v9
-  ; GCN-NEXT:    v_fma_f32 v12, s4, v12, -v72
-  ; GCN-NEXT:    v_fma_f32 v13, s4, v13, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v8, v8
-  ; GCN-NEXT:    v_add_f32_e32 v0, v24, v0
-  ; GCN-NEXT:    v_fma_f32 v5, s4, v14, -v72
-  ; GCN-NEXT:    v_exp_f32_e32 v9, v9
-  ; GCN-NEXT:    v_add_f32_e32 v0, v26, v0
-  ; GCN-NEXT:    v_add_f32_e32 v0, v67, v0
-  ; GCN-NEXT:    v_fma_f32 v14, s4, v15, -v72
-  ; GCN-NEXT:    v_mul_f32_e32 v11, 0x3fb8aa3b, v11
-  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v12
-  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v5
-  ; GCN-NEXT:    v_add_f32_e32 v0, v6, v0
-  ; GCN-NEXT:    v_exp_f32_e32 v11, v11
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v8
-  ; GCN-NEXT:    v_exp_f32_e32 v12, v3
-  ; GCN-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v13
-  ; GCN-NEXT:    v_exp_f32_e32 v17, v1
-  ; GCN-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v14
-  ; GCN-NEXT:    v_add_f32_e32 v0, v7, v0
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v13, v9
-  ; GCN-NEXT:    v_exp_f32_e32 v15, v3
-  ; GCN-NEXT:    v_exp_f32_e32 v18, v1
-  ; GCN-NEXT:    v_add_f32_e32 v6, v8, v0
-  ; GCN-NEXT:    ds_read_b128 v[0:3], v91
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v69, v74
+  ; GCN-NEXT:    v_fma_f32 v77, s4, v14, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v89, v0
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v92, v83
+  ; GCN-NEXT:    v_pack_b32_f16 v68, v68, v85
+  ; GCN-NEXT:    v_mul_f32_e32 v75, 0x3fb8aa3b, v75
+  ; GCN-NEXT:    v_mul_f32_e32 v70, 0x3fb8aa3b, v70
+  ; GCN-NEXT:    v_pack_b32_f16 v69, v69, v92
+  ; GCN-NEXT:    v_fma_f32 v65, s4, v15, -v65
+  ; GCN-NEXT:    v_exp_f32_e32 v75, v75
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[68:69], v[48:63]
+  ; GCN-NEXT:    v_add_f32_e32 v76, v96, v76
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v85, v84
+  ; GCN-NEXT:    v_exp_f32_e32 v92, v70
+  ; GCN-NEXT:    v_mul_f32_e32 v70, 0x3fb8aa3b, v86
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v86, v87
+  ; GCN-NEXT:    v_exp_f32_e32 v94, v70
+  ; GCN-NEXT:    v_mul_f32_e32 v65, 0x3fb8aa3b, v65
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[78:79], v[68:69], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v67, v67, v76
+  ; GCN-NEXT:    v_add_f32_e32 v67, v91, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v93, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v80, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v82, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v90, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v88, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v72, v67
+  ; GCN-NEXT:    v_mul_f32_e32 v68, 0x3fb8aa3b, v77
+  ; GCN-NEXT:    v_add_f32_e32 v67, v73, v67
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v76, v89
+  ; GCN-NEXT:    v_exp_f32_e32 v78, v68
+  ; GCN-NEXT:    v_add_f32_e32 v67, v71, v67
+  ; GCN-NEXT:    ds_read_b128 v[68:71], v103
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v5, v10
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v14, v11
-  ; GCN-NEXT:    v_add_f32_e32 v6, v9, v6
-  ; GCN-NEXT:    v_pack_b32_f16 v8, v4, v13
-  ; GCN-NEXT:    v_add_f32_e32 v6, v10, v6
-  ; GCN-NEXT:    v_pack_b32_f16 v9, v5, v14
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v7, v18
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v10, v15
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63]
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v0, v17
-  ; GCN-NEXT:    v_cvt_f16_f32_e32 v4, v12
-  ; GCN-NEXT:    v_add_f32_e32 v6, v11, v6
-  ; GCN-NEXT:    v_add_f32_e32 v6, v12, v6
-  ; GCN-NEXT:    v_add_f32_e32 v1, v15, v6
-  ; GCN-NEXT:    v_add_f32_e32 v11, v17, v1
-  ; GCN-NEXT:    v_pack_b32_f16 v1, v0, v7
-  ; GCN-NEXT:    v_pack_b32_f16 v0, v4, v10
-  ; GCN-NEXT:    ds_read_b128 v[4:7], v91 offset:576
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v77, v75
+  ; GCN-NEXT:    v_exp_f32_e32 v65, v65
+  ; GCN-NEXT:    v_add_f32_e32 v67, v81, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v74, v67
+  ; GCN-NEXT:    v_pack_b32_f16 v77, v76, v77
+  ; GCN-NEXT:    v_pack_b32_f16 v76, v85, v86
+  ; GCN-NEXT:    v_add_f32_e32 v67, v83, v67
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v72, v65
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v73, v94
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[68:69], v[76:77], v[48:63]
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v68, v78
+  ; GCN-NEXT:    v_cvt_f16_f32_e32 v74, v92
+  ; GCN-NEXT:    v_add_f32_e32 v67, v84, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v87, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v89, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v75, v67
+  ; GCN-NEXT:    v_pack_b32_f16 v69, v68, v72
+  ; GCN-NEXT:    v_pack_b32_f16 v68, v74, v73
+  ; GCN-NEXT:    ds_read_b128 v[72:75], v103 offset:576
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
   ; GCN-NEXT:    buffer_inv sc0 sc1
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[4:5], v[8:9], v[32:47]
+  ; GCN-NEXT:    v_add_f32_e32 v67, v92, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v94, v67
+  ; GCN-NEXT:    v_add_f32_e32 v67, v78, v67
+  ; GCN-NEXT:    v_add_f32_e32 v65, v65, v67
+  ; GCN-NEXT:    ds_bpermute_b32 v67, v66, v65
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[72:73], v[76:77], v[32:47]
+  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+  ; GCN-NEXT:    v_add_f32_e32 v65, v65, v67
+  ; GCN-NEXT:    ds_bpermute_b32 v66, v66, v65
   ; GCN-NEXT:    ;;#ASMSTART
   ; GCN-NEXT:    s_waitcnt vmcnt(8)
   ; GCN-NEXT:    ;;#ASMEND
-  ; GCN-NEXT:    v_mov_b32_e32 v4, 0
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47]
-  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
-  ; GCN-NEXT:    v_add_f32_e32 v2, v18, v11
-  ; GCN-NEXT:    ds_bpermute_b32 v3, v66, v2
-  ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_add_f32_e32 v2, v2, v3
-  ; GCN-NEXT:    ds_bpermute_b32 v3, v66, v2
+  ; GCN-NEXT:    v_mov_b32_e32 v67, 0
   ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-  ; GCN-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[0:1]
-  ; GCN-NEXT:    v_fmac_f32_e32 v2, v4, v16
+  ; GCN-NEXT:    v_cndmask_b32_e64 v65, v66, v65, s[0:1]
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[48:63], v[70:71], v[68:69], v[48:63]
+  ; GCN-NEXT:    v_fmac_f32_e32 v65, v67, v64
+  ; GCN-NEXT:    v_mfma_f32_32x32x8_f16 v[32:47], v[74:75], v[68:69], v[32:47]
   ; GCN-NEXT:    s_endpgm
   attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index ff77d5ccbe312..b0b0272700f59 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -435,37 +435,37 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0
 ; GFX90A-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k:
 ; GFX90A-VGPR:       ; %bb.0: ; %bb
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, 0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, 1
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v5
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 2
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 1
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v9
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v8, 2
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    v_mfma_f32_4x4x4bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT:    v_mfma_f32_4x4x4bf16_1k v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX90A-VGPR-NEXT:    s_nop 4
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v5
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 2
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_bf16 v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 4
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -660,38 +660,38 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
 ; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k:
 ; GFX90A-VGPR:       ; %bb.0: ; %bb
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, 0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, 1
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v5
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 2
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, 1
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v9
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v8, 2
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    v_mfma_f32_16x16x16bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT:    v_mfma_f32_16x16x16bf16_1k v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX90A-VGPR-NEXT:    s_nop 7
 ; GFX90A-VGPR-NEXT:    s_nop 2
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v5
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 2
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_bf16 v[4:7], v[4:5], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 6
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -1338,27 +1338,27 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v1, 64
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[16:17], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v1
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v1
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v1
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[18:19], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[12:13], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
-; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[8:15], v[16:17], v[18:19], v[8:15]
+; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[8:15], v[16:17], v[18:19], v[8:15] cbsz:1 abid:2 blgp:3
 ; GFX90A-VGPR-NEXT:    s_nop 7
 ; GFX90A-VGPR-NEXT:    s_nop 7
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits:
@@ -1367,27 +1367,27 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 64
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[16:17], s[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v1
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], s[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[18:19], s[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
-; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[8:15], v[16:17], v[18:19], v[8:15]
+; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[8:15], v[16:17], v[18:19], v[8:15] cbsz:1 abid:2 neg:[1,1,0]
 ; GFX942-VGPR-NEXT:    s_nop 7
 ; GFX942-VGPR-NEXT:    s_nop 7
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> splat (double bitcast (i64 274877906944 to double)), i32 0, i32 0, i32 0)
@@ -1679,27 +1679,27 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, 0x3ff00000
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v12, s2
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v13, s3
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v18, s2
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v19, s3
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v1, v0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v1, v0
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[16:17], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[12:13], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[8:15], v[18:19], v[16:17], v[8:15]
 ; GFX90A-VGPR-NEXT:    s_nop 7
 ; GFX90A-VGPR-NEXT:    s_nop 7
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_imm:
@@ -1708,27 +1708,27 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, 0x3ff00000
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v12, s2
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v13, s3
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v18, s2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v19, s3
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], s[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[16:17], s[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[8:15], v[18:19], v[16:17], v[8:15]
 ; GFX942-VGPR-NEXT:    s_nop 7
 ; GFX942-VGPR-NEXT:    s_nop 7
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0)
@@ -1797,27 +1797,27 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v1, 0x405ec000
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v12, s2
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v13, s3
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v18, s2
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v19, s3
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v1
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v1
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v1
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[14:15], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[16:17], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[12:13], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[0:1], v[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[8:15], v[18:19], v[16:17], v[8:15]
 ; GFX90A-VGPR-NEXT:    s_nop 7
 ; GFX90A-VGPR-NEXT:    s_nop 7
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX90A-VGPR-NEXT:    s_endpgm
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
@@ -1826,27 +1826,27 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 0x405ec000
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v12, s2
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v13, s3
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v18, s2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v19, s3
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v1
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], s[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[16:17], s[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[0:1]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[8:15], v[18:19], v[16:17], v[8:15]
 ; GFX942-VGPR-NEXT:    s_nop 7
 ; GFX942-VGPR-NEXT:    s_nop 7
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[12:15], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[8:11], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 123.0, double 123.0, double 123.0, double 123.0>, i32 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
index beda16c17a5c9..da92372b9a86b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
@@ -31,26 +31,26 @@ declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32>, <4 x i3
 declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)
 
 define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 2
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 4
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, 3
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, 4
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, 3
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT:    s_nop 6
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_mfma_i32_16x16x32i8:
 ; GFX942-GISEL:       ; %bb.0: ; %bb
@@ -73,47 +73,26 @@ define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 {
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
-; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 2
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, 1
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 4
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 3
-; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX942-AGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-AGPRCD-SDAG-NEXT:    v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-AGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 2
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 4
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, 3
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, 2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, 1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, 4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, 3
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_mfma_i32_16x16x32i8:
 ; GFX950-GISEL:       ; %bb.0: ; %bb
@@ -135,7 +114,26 @@ define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 {
 ; GFX950-GISEL-NEXT:    s_nop 6
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX950-GISEL-NEXT:    s_endpgm
-;
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT:    s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT:    v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT:    s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
 ; GFX950-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
@@ -319,26 +317,26 @@ bb:
 }
 
 define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 2
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 4
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, 3
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, 4
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, 3
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT:    s_nop 6
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
 ; GFX942-GISEL:       ; %bb.0: ; %bb
@@ -361,47 +359,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg)
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
-; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 2
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, 1
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 4
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 3
-; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX942-AGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-AGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-AGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 2
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 4
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, 3
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, 2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, 1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, 4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, 3
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
 ; GFX950-GISEL:       ; %bb.0: ; %bb
@@ -423,7 +400,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg)
 ; GFX950-GISEL-NEXT:    s_nop 6
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX950-GISEL-NEXT:    s_endpgm
-;
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT:    s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT:    s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
 ; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
@@ -452,26 +448,26 @@ bb:
 }
 
 define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 2
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 4
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, 3
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, 4
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, 3
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT:    s_nop 6
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
 ; GFX942-GISEL:       ; %bb.0: ; %bb
@@ -494,47 +490,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg)
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
-; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 2
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, 1
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 4
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 3
-; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX942-AGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-AGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-AGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 2
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 4
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, 3
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, 2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, 1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, 4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, 3
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
 ; GFX950-GISEL:       ; %bb.0: ; %bb
@@ -556,7 +531,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg)
 ; GFX950-GISEL-NEXT:    s_nop 6
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX950-GISEL-NEXT:    s_endpgm
-;
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT:    s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT:    s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
 ; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
@@ -585,26 +579,26 @@ bb:
 }
 
 define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 2
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 4
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, 3
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, 4
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, 3
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT:    s_nop 6
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
 ; GFX942-GISEL:       ; %bb.0: ; %bb
@@ -627,47 +621,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg)
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
-; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 2
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, 1
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 4
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 3
-; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX942-AGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-AGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-AGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 2
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 4
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, 3
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, 2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, 1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, 4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, 3
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
 ; GFX950-GISEL:       ; %bb.0: ; %bb
@@ -689,7 +662,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg)
 ; GFX950-GISEL-NEXT:    s_nop 6
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX950-GISEL-NEXT:    s_endpgm
-;
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT:    s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT:    s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
 ; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
@@ -718,26 +710,26 @@ bb:
 }
 
 define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 2
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 4
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, 3
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX942-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, 4
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, 3
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT:    s_nop 6
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
 ; GFX942-GISEL:       ; %bb.0: ; %bb
@@ -760,47 +752,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg)
 ; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
-; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 2
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, 1
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 4
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 3
-; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX942-AGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-AGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-AGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 2
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 4
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, 3
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; GFX950-VGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, 2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, 1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, 4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, 3
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX950-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
 ; GFX950-GISEL:       ; %bb.0: ; %bb
@@ -822,7 +793,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg)
 ; GFX950-GISEL-NEXT:    s_nop 6
 ; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX950-GISEL-NEXT:    s_endpgm
-;
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT:    s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT:    v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT:    s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
 ; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
@@ -1471,46 +1461,85 @@ bb:
 }
 
 define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_f32_16x16x32_f16:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x44
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v7, s6
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v6, v[8:11], s[8:9]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_smfmac_f32_16x16x32_f16:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x44
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, s6
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    s_nop 6
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v6, v[8:11], s[8:9]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: test_smfmac_f32_16x16x32_f16:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX942-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x44
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v6, s6
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT:    s_nop 5
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[8:9]
+; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_f16:
-; GFX942-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v6, s6
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_smfmac_f32_16x16x32_f16:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x44
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v6, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v7, s6
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v6, v[8:11], s[8:9]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
+; GFX950-GISEL-LABEL: test_smfmac_f32_16x16x32_f16:
+; GFX950-GISEL:       ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX950-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x44
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, s6
+; GFX950-GISEL-NEXT:    s_nop 1
+; GFX950-GISEL-NEXT:    v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT:    s_nop 6
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[8:9]
+; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_f16:
 ; GFX942-AGPRCD:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
@@ -1533,47 +1562,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <
 ; GFX942-AGPRCD-NEXT:    s_nop 5
 ; GFX942-AGPRCD-NEXT:    global_store_dwordx4 v0, a[0:3], s[8:9]
 ; GFX942-AGPRCD-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_f32_16x16x32_f16:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x44
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v6, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v7, s6
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v6, v[8:11], s[8:9]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_f16:
-; GFX950-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v6, s6
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT:    v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-LABEL: test_smfmac_f32_16x16x32_f16:
 ; GFX950-AGPRCD:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
@@ -1604,66 +1592,125 @@ bb:
 }
 
 define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <4 x half> %a, <8 x half> %b, i32 %idx) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_f16:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dword s24, s[4:5], 0x44
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v22, s24
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_smfmac_f32_32x32x16_f16:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    s_load_dword s24, s[4:5], 0x44
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
+; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s24
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-SDAG-NEXT:    s_nop 7
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: test_smfmac_f32_32x32x16_f16:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_load_dword s24, s[4:5], 0x44
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
+; GFX942-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v22, s24
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-GISEL-NEXT:    s_nop 7
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_f16:
-; GFX942-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dword s24, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v22, s24
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 7
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; GFX942-VGPRCD-GISEL-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_smfmac_f32_32x32x16_f16:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    s_load_dword s24, s[4:5], 0x44
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
+; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s24
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    s_nop 2
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
+; GFX950-GISEL-LABEL: test_smfmac_f32_32x32x16_f16:
+; GFX950-GISEL:       ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX950-GISEL-NEXT:    s_load_dword s24, s[4:5], 0x44
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
+; GFX950-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v22, s24
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-GISEL-NEXT:    s_nop 1
+; GFX950-GISEL-NEXT:    v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GFX950-GISEL-NEXT:    s_nop 7
+; GFX950-GISEL-NEXT:    s_nop 2
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_f16:
 ; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -1701,7 +1748,6 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[24:25] offset:16
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[24:25]
 ; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_f16:
 ; GFX942-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-GISEL-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -1739,67 +1785,6 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[24:25] offset:32
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[12:15], s[24:25] offset:48
 ; GFX942-AGPRCD-GISEL-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_f16:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dword s24, s[4:5], 0x44
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v22, s24
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 2
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_f16:
-; GFX950-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dword s24, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v22, s24
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT:    v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 7
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 2
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; GFX950-VGPRCD-GISEL-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_f16:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -1837,7 +1822,6 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[24:25] offset:16
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[24:25]
 ; GFX950-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_f16:
 ; GFX950-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-GISEL-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -1883,46 +1867,85 @@ bb:
 }
 
 define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_f32_16x16x32_bf16:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x44
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v6, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v7, s6
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v6, v[8:11], s[8:9]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_smfmac_f32_16x16x32_bf16:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x44
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, s6
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    s_nop 6
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v6, v[8:11], s[8:9]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX942-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x44
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v6, s6
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT:    s_nop 5
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[8:9]
+; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16:
-; GFX942-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v6, s6
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_smfmac_f32_16x16x32_bf16:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x44
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v6, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v7, s6
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v6, v[8:11], s[8:9]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
+; GFX950-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16:
+; GFX950-GISEL:       ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX950-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x44
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, s6
+; GFX950-GISEL-NEXT:    s_nop 1
+; GFX950-GISEL-NEXT:    v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT:    s_nop 6
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[8:9]
+; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_bf16:
 ; GFX942-AGPRCD:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
@@ -1945,47 +1968,6 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg,
 ; GFX942-AGPRCD-NEXT:    s_nop 5
 ; GFX942-AGPRCD-NEXT:    global_store_dwordx4 v0, a[0:3], s[8:9]
 ; GFX942-AGPRCD-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_f32_16x16x32_bf16:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x44
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v6, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v7, s6
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v6, v[8:11], s[8:9]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16:
-; GFX950-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v6, s6
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT:    v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-LABEL: test_smfmac_f32_16x16x32_bf16:
 ; GFX950-AGPRCD:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
@@ -2016,66 +1998,125 @@ bb:
 }
 
 define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg, <4 x i16> %a, <8 x i16> %b, i32 %idx) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dword s24, s[4:5], 0x44
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v22, s24
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    s_load_dword s24, s[4:5], 0x44
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
+; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s24
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-SDAG-NEXT:    s_nop 7
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_load_dword s24, s[4:5], 0x44
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
+; GFX942-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v22, s24
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-GISEL-NEXT:    s_nop 7
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16:
-; GFX942-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dword s24, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v22, s24
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 7
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; GFX942-VGPRCD-GISEL-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    s_load_dword s24, s[4:5], 0x44
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
+; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s24
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    s_nop 2
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
+; GFX950-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16:
+; GFX950-GISEL:       ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
+; GFX950-GISEL-NEXT:    s_load_dword s24, s[4:5], 0x44
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
+; GFX950-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v22, s24
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-GISEL-NEXT:    s_nop 1
+; GFX950-GISEL-NEXT:    v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GFX950-GISEL-NEXT:    s_nop 7
+; GFX950-GISEL-NEXT:    s_nop 2
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16:
 ; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -2113,7 +2154,6 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[24:25] offset:16
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[24:25]
 ; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16:
 ; GFX942-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-GISEL-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -2151,67 +2191,6 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[24:25] offset:32
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[12:15], s[24:25] offset:48
 ; GFX942-AGPRCD-GISEL-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dword s24, s[4:5], 0x44
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v22, s24
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 2
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16:
-; GFX950-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dword s24, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v22, s24
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT:    v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 7
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 2
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[16:17]
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
-; GFX950-VGPRCD-GISEL-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_f32_32x32x16_bf16:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -2249,7 +2228,6 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[24:25] offset:16
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[24:25]
 ; GFX950-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_f32_32x32x16_bf16:
 ; GFX950-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-GISEL-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
@@ -2295,53 +2273,99 @@ bb:
 }
 
 define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v10, s8
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v11, s9
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, s11
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, s12
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, s13
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, s14
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v10, s8
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v11, s9
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, s12
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, s13
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s14
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    s_nop 6
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX942-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX942-GISEL-NEXT:    s_mov_b32 s4, s2
+; GFX942-GISEL-NEXT:    s_mov_b32 s5, s3
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT:    s_nop 5
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
-; GFX942-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s4, s2
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s5, s3
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v6, s14
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
-; GFX942-VGPRCD-GISEL-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v10, s8
+; GFX950-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v11, s9
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, s12
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, s13
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, s14
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
+; GFX950-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
+; GFX950-GISEL:       ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX950-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX950-GISEL-NEXT:    s_mov_b32 s4, s2
+; GFX950-GISEL-NEXT:    s_mov_b32 s5, s3
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX950-GISEL-NEXT:    s_nop 1
+; GFX950-GISEL-NEXT:    v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT:    s_nop 6
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
 ; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -2367,7 +2391,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2
 ; GFX942-AGPRCD-SDAG-NEXT:    s_nop 5
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
 ; GFX942-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
@@ -2398,54 +2421,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2
 ; GFX942-AGPRCD-GISEL-NEXT:    s_nop 5
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX942-AGPRCD-GISEL-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v10, s8
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v11, s9
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, s10
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, s11
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, s12
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, s13
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, s14
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
-; GFX950-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s4, s2
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s5, s3
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v6, s14
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT:    v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
-; GFX950-VGPRCD-GISEL-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -2471,7 +2446,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2
 ; GFX950-AGPRCD-SDAG-NEXT:    s_nop 6
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX950-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
 ; GFX950-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
@@ -2510,73 +2484,139 @@ bb:
 }
 
 define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_i8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v22, s16
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v23, s17
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v20, s20
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v21, s21
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, s22
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_smfmac_i32_32x32x32_i8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s16
+; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v23, s17
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v20, s20
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v21, s21
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, s22
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    s_nop 7
+; GFX942-SDAG-NEXT:    s_nop 2
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_i8:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX942-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
+; GFX942-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-GISEL-NEXT:    s_mov_b32 s20, s18
+; GFX942-GISEL-NEXT:    s_mov_b32 s21, s19
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v22, s26
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-GISEL-NEXT:    s_nop 7
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_i8:
-; GFX942-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s20, s18
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s21, s19
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v22, s26
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 7
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-VGPRCD-GISEL-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_smfmac_i32_32x32x32_i8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s16
+; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v23, s17
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, s20
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, s21
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s22
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    s_nop 3
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
+; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_i8:
+; GFX950-GISEL:       ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX950-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
+; GFX950-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-GISEL-NEXT:    s_mov_b32 s20, s18
+; GFX950-GISEL-NEXT:    s_mov_b32 s21, s19
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v22, s26
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-GISEL-NEXT:    s_nop 1
+; GFX950-GISEL-NEXT:    v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GFX950-GISEL-NEXT:    s_nop 7
+; GFX950-GISEL-NEXT:    s_nop 2
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_i8:
 ; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -2618,7 +2658,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_i8:
 ; GFX942-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -2661,74 +2700,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; GFX942-AGPRCD-GISEL-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_i8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v22, s16
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v23, s17
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v20, s20
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v21, s21
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, s22
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 2
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_i8:
-; GFX950-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s20, s18
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s21, s19
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v22, s26
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT:    v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 7
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 2
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-VGPRCD-GISEL-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_i8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -2770,7 +2741,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX950-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_i8:
 ; GFX950-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -2821,53 +2791,99 @@ bb:
 }
 
 define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v10, s8
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v11, s9
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, s11
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, s12
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, s13
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, s14
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v10, s8
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v11, s9
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, s12
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, s13
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s14
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    s_nop 6
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX942-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX942-GISEL-NEXT:    s_mov_b32 s4, s2
+; GFX942-GISEL-NEXT:    s_mov_b32 s5, s3
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT:    s_nop 5
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
-; GFX942-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s4, s2
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s5, s3
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v6, s14
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
-; GFX942-VGPRCD-GISEL-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v10, s8
+; GFX950-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v11, s9
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, s12
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, s13
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, s14
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
+; GFX950-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
+; GFX950-GISEL:       ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX950-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX950-GISEL-NEXT:    s_mov_b32 s4, s2
+; GFX950-GISEL-NEXT:    s_mov_b32 s5, s3
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX950-GISEL-NEXT:    s_nop 1
+; GFX950-GISEL-NEXT:    v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT:    s_nop 6
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
 ; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -2893,7 +2909,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-SDAG-NEXT:    s_nop 5
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
 ; GFX942-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
@@ -2924,54 +2939,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-GISEL-NEXT:    s_nop 5
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX942-AGPRCD-GISEL-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v10, s8
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v11, s9
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, s10
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, s11
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, s12
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, s13
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, s14
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
-; GFX950-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s4, s2
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s5, s3
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v6, s14
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT:    v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
-; GFX950-VGPRCD-GISEL-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -2997,7 +2964,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar
 ; GFX950-AGPRCD-SDAG-NEXT:    s_nop 6
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX950-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
 ; GFX950-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
@@ -3036,53 +3002,99 @@ bb:
 }
 
 define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v10, s8
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v11, s9
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, s11
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, s12
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, s13
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, s14
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v10, s8
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v11, s9
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, s12
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, s13
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s14
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    s_nop 6
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX942-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX942-GISEL-NEXT:    s_mov_b32 s4, s2
+; GFX942-GISEL-NEXT:    s_mov_b32 s5, s3
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT:    s_nop 5
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
-; GFX942-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s4, s2
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s5, s3
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v6, s14
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
-; GFX942-VGPRCD-GISEL-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v10, s8
+; GFX950-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v11, s9
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, s12
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, s13
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, s14
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
+; GFX950-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
+; GFX950-GISEL:       ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX950-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX950-GISEL-NEXT:    s_mov_b32 s4, s2
+; GFX950-GISEL-NEXT:    s_mov_b32 s5, s3
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX950-GISEL-NEXT:    s_nop 1
+; GFX950-GISEL-NEXT:    v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT:    s_nop 6
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
 ; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -3108,7 +3120,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-SDAG-NEXT:    s_nop 5
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
 ; GFX942-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
@@ -3139,54 +3150,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-GISEL-NEXT:    s_nop 5
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX942-AGPRCD-GISEL-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v10, s8
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v11, s9
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, s10
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, s11
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, s12
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, s13
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, s14
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
-; GFX950-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s4, s2
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s5, s3
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v6, s14
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT:    v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
-; GFX950-VGPRCD-GISEL-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -3212,7 +3175,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar
 ; GFX950-AGPRCD-SDAG-NEXT:    s_nop 6
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX950-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
 ; GFX950-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
@@ -3251,53 +3213,99 @@ bb:
 }
 
 define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v10, s8
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v11, s9
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, s11
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, s12
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, s13
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, s14
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v10, s8
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v11, s9
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, s12
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, s13
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s14
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    s_nop 6
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX942-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX942-GISEL-NEXT:    s_mov_b32 s4, s2
+; GFX942-GISEL-NEXT:    s_mov_b32 s5, s3
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT:    s_nop 5
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
-; GFX942-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s4, s2
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s5, s3
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v6, s14
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
-; GFX942-VGPRCD-GISEL-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v10, s8
+; GFX950-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v11, s9
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, s12
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, s13
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, s14
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
+; GFX950-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
+; GFX950-GISEL:       ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX950-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX950-GISEL-NEXT:    s_mov_b32 s4, s2
+; GFX950-GISEL-NEXT:    s_mov_b32 s5, s3
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX950-GISEL-NEXT:    s_nop 1
+; GFX950-GISEL-NEXT:    v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT:    s_nop 6
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
 ; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -3323,7 +3331,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-SDAG-NEXT:    s_nop 5
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
 ; GFX942-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
@@ -3354,54 +3361,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-GISEL-NEXT:    s_nop 5
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX942-AGPRCD-GISEL-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v10, s8
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v11, s9
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, s10
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, s11
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, s12
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, s13
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, s14
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
-; GFX950-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s4, s2
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s5, s3
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v6, s14
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT:    v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
-; GFX950-VGPRCD-GISEL-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -3427,7 +3386,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar
 ; GFX950-AGPRCD-SDAG-NEXT:    s_nop 6
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX950-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
 ; GFX950-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
@@ -3466,53 +3424,99 @@ bb:
 }
 
 define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v10, s8
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v11, s9
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, s10
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, s11
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, s12
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, s13
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, s14
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v10, s8
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v11, s9
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, s12
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, s13
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s14
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    s_nop 6
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX942-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX942-GISEL-NEXT:    s_mov_b32 s4, s2
+; GFX942-GISEL-NEXT:    s_mov_b32 s5, s3
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT:    s_nop 5
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
-; GFX942-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s4, s2
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s5, s3
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v6, s14
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
-; GFX942-VGPRCD-GISEL-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v10, s8
+; GFX950-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v11, s9
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, s10
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, s11
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, s12
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, s13
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, s14
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
+; GFX950-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
+; GFX950-GISEL:       ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
+; GFX950-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
+; GFX950-GISEL-NEXT:    s_mov_b32 s4, s2
+; GFX950-GISEL-NEXT:    s_mov_b32 s5, s3
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, s14
+; GFX950-GISEL-NEXT:    s_nop 1
+; GFX950-GISEL-NEXT:    v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT:    s_nop 6
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
+; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
 ; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -3538,7 +3542,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-SDAG-NEXT:    s_nop 5
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
 ; GFX942-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
@@ -3569,54 +3572,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-GISEL-NEXT:    s_nop 5
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
 ; GFX942-AGPRCD-GISEL-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x2c
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v10, s8
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v11, s9
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v2, s10
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v3, s11
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v4, s12
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v5, s13
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v1, s14
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
-; GFX950-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x24
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dword s14, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[12:13], 0x0
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s4, s2
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s5, s3
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v6, s14
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT:    v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v0, 0
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[12:13]
-; GFX950-VGPRCD-GISEL-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -3642,7 +3597,6 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar
 ; GFX950-AGPRCD-SDAG-NEXT:    s_nop 6
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX950-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
 ; GFX950-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
@@ -3681,73 +3635,139 @@ bb:
 }
 
 define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v22, s16
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v23, s17
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v20, s20
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v21, s21
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, s22
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s16
+; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v23, s17
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v20, s20
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v21, s21
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, s22
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    s_nop 7
+; GFX942-SDAG-NEXT:    s_nop 2
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX942-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
+; GFX942-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-GISEL-NEXT:    s_mov_b32 s20, s18
+; GFX942-GISEL-NEXT:    s_mov_b32 s21, s19
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v22, s26
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-GISEL-NEXT:    s_nop 7
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
-; GFX942-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s20, s18
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s21, s19
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v22, s26
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 7
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-VGPRCD-GISEL-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s16
+; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v23, s17
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, s20
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, s21
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s22
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    s_nop 3
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
+; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
+; GFX950-GISEL:       ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX950-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
+; GFX950-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-GISEL-NEXT:    s_mov_b32 s20, s18
+; GFX950-GISEL-NEXT:    s_mov_b32 s21, s19
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v22, s26
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-GISEL-NEXT:    s_nop 1
+; GFX950-GISEL-NEXT:    v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GFX950-GISEL-NEXT:    s_nop 7
+; GFX950-GISEL-NEXT:    s_nop 2
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
 ; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -3789,7 +3809,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
 ; GFX942-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -3832,74 +3851,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; GFX942-AGPRCD-GISEL-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v22, s16
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v23, s17
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v20, s20
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v21, s21
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, s22
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 2
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
-; GFX950-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s20, s18
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s21, s19
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v22, s26
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT:    v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 7
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 2
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-VGPRCD-GISEL-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -3941,7 +3892,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX950-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_bf8:
 ; GFX950-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -3992,73 +3942,139 @@ bb:
 }
 
 define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v22, s16
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v23, s17
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v20, s20
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v21, s21
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, s22
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s16
+; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v23, s17
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v20, s20
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v21, s21
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, s22
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    s_nop 7
+; GFX942-SDAG-NEXT:    s_nop 2
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX942-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
+; GFX942-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-GISEL-NEXT:    s_mov_b32 s20, s18
+; GFX942-GISEL-NEXT:    s_mov_b32 s21, s19
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v22, s26
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-GISEL-NEXT:    s_nop 7
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
-; GFX942-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s20, s18
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s21, s19
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v22, s26
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 7
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-VGPRCD-GISEL-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s16
+; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v23, s17
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, s20
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, s21
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s22
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    s_nop 3
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
+; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
+; GFX950-GISEL:       ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX950-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
+; GFX950-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-GISEL-NEXT:    s_mov_b32 s20, s18
+; GFX950-GISEL-NEXT:    s_mov_b32 s21, s19
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v22, s26
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-GISEL-NEXT:    s_nop 1
+; GFX950-GISEL-NEXT:    v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GFX950-GISEL-NEXT:    s_nop 7
+; GFX950-GISEL-NEXT:    s_nop 2
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
 ; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -4100,7 +4116,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
 ; GFX942-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -4143,74 +4158,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; GFX942-AGPRCD-GISEL-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v22, s16
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v23, s17
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v20, s20
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v21, s21
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, s22
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 2
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
-; GFX950-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s20, s18
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s21, s19
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v22, s26
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT:    v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 7
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 2
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-VGPRCD-GISEL-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -4252,7 +4199,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX950-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_bf8_fp8:
 ; GFX950-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -4303,73 +4249,139 @@ bb:
 }
 
 define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v22, s16
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v23, s17
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v20, s20
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v21, s21
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, s22
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s16
+; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v23, s17
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v20, s20
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v21, s21
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, s22
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    s_nop 7
+; GFX942-SDAG-NEXT:    s_nop 2
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX942-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
+; GFX942-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-GISEL-NEXT:    s_mov_b32 s20, s18
+; GFX942-GISEL-NEXT:    s_mov_b32 s21, s19
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v22, s26
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-GISEL-NEXT:    s_nop 7
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
-; GFX942-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s20, s18
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s21, s19
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v22, s26
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 7
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-VGPRCD-GISEL-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s16
+; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v23, s17
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, s20
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, s21
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s22
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    s_nop 3
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
+; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
+; GFX950-GISEL:       ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX950-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
+; GFX950-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-GISEL-NEXT:    s_mov_b32 s20, s18
+; GFX950-GISEL-NEXT:    s_mov_b32 s21, s19
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v22, s26
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-GISEL-NEXT:    s_nop 1
+; GFX950-GISEL-NEXT:    v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GFX950-GISEL-NEXT:    s_nop 7
+; GFX950-GISEL-NEXT:    s_nop 2
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
 ; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -4411,7 +4423,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
 ; GFX942-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -4454,74 +4465,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; GFX942-AGPRCD-GISEL-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v22, s16
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v23, s17
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v20, s20
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v21, s21
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, s22
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 2
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
-; GFX950-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s20, s18
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s21, s19
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v22, s26
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT:    v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 7
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 2
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-VGPRCD-GISEL-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -4563,7 +4506,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX950-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_bf8:
 ; GFX950-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -4614,73 +4556,139 @@ bb:
 }
 
 define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %arg, <2 x i32> %a, <4 x i32> %b, i32 %idx) #0 {
-; GFX942-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
-; GFX942-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v22, s16
-; GFX942-VGPRCD-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v23, s17
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v20, s20
-; GFX942-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v21, s21
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, s22
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX942-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX942-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX942-VGPRCD-SDAG-NEXT:    s_endpgm
+; GFX942-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
+; GFX942-SDAG:       ; %bb.0: ; %bb
+; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s16
+; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v23, s17
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v20, s20
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v21, s21
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, s22
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    s_nop 7
+; GFX942-SDAG-NEXT:    s_nop 2
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
+; GFX942-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
+; GFX942-GISEL:       ; %bb.0: ; %bb
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX942-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
+; GFX942-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX942-GISEL-NEXT:    s_mov_b32 s20, s18
+; GFX942-GISEL-NEXT:    s_mov_b32 s21, s19
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v22, s26
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-GISEL-NEXT:    s_nop 7
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX942-GISEL-NEXT:    s_endpgm
 ;
-; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
-; GFX942-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
-; GFX942-VGPRCD-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s20, s18
-; GFX942-VGPRCD-GISEL-NEXT:    s_mov_b32 s21, s19
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v22, s26
-; GFX942-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX942-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 7
-; GFX942-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX942-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX942-VGPRCD-GISEL-NEXT:    s_endpgm
+; GFX950-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
+; GFX950-SDAG:       ; %bb.0: ; %bb
+; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
+; GFX950-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, 0
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s16
+; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v23, s17
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, s20
+; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, s21
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s22
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-SDAG-NEXT:    s_nop 1
+; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    s_nop 7
+; GFX950-SDAG-NEXT:    s_nop 3
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[12:15], s[24:25] offset:48
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[8:11], s[24:25] offset:32
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[4:7], s[24:25] offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v17, v[0:3], s[24:25]
+; GFX950-SDAG-NEXT:    s_endpgm
 ;
+; GFX950-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
+; GFX950-GISEL:       ; %bb.0: ; %bb
+; GFX950-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
+; GFX950-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
+; GFX950-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
+; GFX950-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
+; GFX950-GISEL-NEXT:    s_mov_b32 s20, s18
+; GFX950-GISEL-NEXT:    s_mov_b32 s21, s19
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v22, s26
+; GFX950-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX950-GISEL-NEXT:    s_nop 1
+; GFX950-GISEL-NEXT:    v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, 0
+; GFX950-GISEL-NEXT:    s_nop 7
+; GFX950-GISEL-NEXT:    s_nop 2
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
+; GFX950-GISEL-NEXT:    s_endpgm
 ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
 ; GFX942-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -4722,7 +4730,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX942-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX942-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX942-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
 ; GFX942-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX942-AGPRCD-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -4765,74 +4772,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; GFX942-AGPRCD-GISEL-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; GFX942-AGPRCD-GISEL-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
-; GFX950-VGPRCD-SDAG:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x2c
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v22, s16
-; GFX950-VGPRCD-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v23, s17
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v20, s20
-; GFX950-VGPRCD-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v21, s21
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, s22
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT:    v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
-; GFX950-VGPRCD-SDAG-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT:    s_nop 2
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-VGPRCD-SDAG-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX950-VGPRCD-SDAG-NEXT:    s_endpgm
-;
-; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
-; GFX950-VGPRCD-GISEL:       ; %bb.0: ; %bb
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x2c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[24:25], s[4:5], 0x24
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx2 s[22:23], s[4:5], 0x3c
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dword s26, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
-; GFX950-VGPRCD-GISEL-NEXT:    s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s20, s18
-; GFX950-VGPRCD-GISEL-NEXT:    s_mov_b32 s21, s19
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v22, s26
-; GFX950-VGPRCD-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT:    v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
-; GFX950-VGPRCD-GISEL-NEXT:    v_mov_b32_e32 v16, 0
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 7
-; GFX950-VGPRCD-GISEL-NEXT:    s_nop 2
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[0:3], s[24:25]
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[4:7], s[24:25] offset:16
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[8:11], s[24:25] offset:32
-; GFX950-VGPRCD-GISEL-NEXT:    global_store_dwordx4 v16, v[12:15], s[24:25] offset:48
-; GFX950-VGPRCD-GISEL-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
 ; GFX950-AGPRCD-SDAG:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -4874,7 +4813,6 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GFX950-AGPRCD-SDAG-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
 ; GFX950-AGPRCD-SDAG-NEXT:    s_endpgm
-;
 ; GFX950-AGPRCD-GISEL-LABEL: test_smfmac_i32_32x32x32_fp8_fp8:
 ; GFX950-AGPRCD-GISEL:       ; %bb.0: ; %bb
 ; GFX950-AGPRCD-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
@@ -4928,5 +4866,9 @@ attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX942: {{.*}}
 ; GFX942-VGPRCD: {{.*}}
+; GFX942-VGPRCD-GISEL: {{.*}}
+; GFX942-VGPRCD-SDAG: {{.*}}
 ; GFX950: {{.*}}
 ; GFX950-VGPRCD: {{.*}}
+; GFX950-VGPRCD-GISEL: {{.*}}
+; GFX950-VGPRCD-SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
index 284ced1727b7e..29342cbbe4419 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
@@ -271,28 +271,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg
 ; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31]
 ; GCN-NEXT:    v_mov_b32_e32 v42, s22
 ; GCN-NEXT:    v_mov_b32_e32 v43, s23
+; GCN-NEXT:    v_mov_b32_e32 v32, s16
+; GCN-NEXT:    v_mov_b32_e32 v33, s17
+; GCN-NEXT:    v_mov_b32_e32 v34, s18
+; GCN-NEXT:    v_mov_b32_e32 v35, s19
 ; GCN-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_nop 2
-; GCN-NEXT:    v_mov_b32_e32 v16, s16
-; GCN-NEXT:    v_mov_b32_e32 v17, s17
-; GCN-NEXT:    v_mov_b32_e32 v18, s18
-; GCN-NEXT:    v_mov_b32_e32 v19, s19
-; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mov_b32_e32 v16, s12
-; GCN-NEXT:    v_mov_b32_e32 v17, s13
-; GCN-NEXT:    v_mov_b32_e32 v18, s14
-; GCN-NEXT:    v_mov_b32_e32 v19, s15
-; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT:    v_mov_b32_e32 v32, s12
+; GCN-NEXT:    v_mov_b32_e32 v33, s13
+; GCN-NEXT:    v_mov_b32_e32 v34, s14
+; GCN-NEXT:    v_mov_b32_e32 v35, s15
+; GCN-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mov_b32_e32 v16, s8
-; GCN-NEXT:    v_mov_b32_e32 v17, s9
-; GCN-NEXT:    v_mov_b32_e32 v18, s10
-; GCN-NEXT:    v_mov_b32_e32 v19, s11
-; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; GCN-NEXT:    v_mov_b32_e32 v32, s8
+; GCN-NEXT:    v_mov_b32_e32 v33, s9
+; GCN-NEXT:    v_mov_b32_e32 v34, s10
+; GCN-NEXT:    v_mov_b32_e32 v35, s11
+; GCN-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -334,28 +333,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa
 ; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
 ; GCN-NEXT:    v_mov_b32_e32 v42, s22
 ; GCN-NEXT:    v_mov_b32_e32 v43, s23
+; GCN-NEXT:    v_mov_b32_e32 v32, s16
+; GCN-NEXT:    v_mov_b32_e32 v33, s17
+; GCN-NEXT:    v_mov_b32_e32 v34, s18
+; GCN-NEXT:    v_mov_b32_e32 v35, s19
 ; GCN-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_nop 2
-; GCN-NEXT:    v_mov_b32_e32 v16, s16
-; GCN-NEXT:    v_mov_b32_e32 v17, s17
-; GCN-NEXT:    v_mov_b32_e32 v18, s18
-; GCN-NEXT:    v_mov_b32_e32 v19, s19
-; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mov_b32_e32 v16, s12
-; GCN-NEXT:    v_mov_b32_e32 v17, s13
-; GCN-NEXT:    v_mov_b32_e32 v18, s14
-; GCN-NEXT:    v_mov_b32_e32 v19, s15
-; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT:    v_mov_b32_e32 v32, s12
+; GCN-NEXT:    v_mov_b32_e32 v33, s13
+; GCN-NEXT:    v_mov_b32_e32 v34, s14
+; GCN-NEXT:    v_mov_b32_e32 v35, s15
+; GCN-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mov_b32_e32 v16, s8
-; GCN-NEXT:    v_mov_b32_e32 v17, s9
-; GCN-NEXT:    v_mov_b32_e32 v18, s10
-; GCN-NEXT:    v_mov_b32_e32 v19, s11
-; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; GCN-NEXT:    v_mov_b32_e32 v32, s8
+; GCN-NEXT:    v_mov_b32_e32 v33, s9
+; GCN-NEXT:    v_mov_b32_e32 v34, s10
+; GCN-NEXT:    v_mov_b32_e32 v35, s11
+; GCN-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 078a043b94604..6fdecdf7087e1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -1518,28 +1518,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
 ; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
 ; SDAG-NEXT:    v_mov_b32_e32 v42, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v43, s23
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
 ; SDAG-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    s_nop 2
-; SDAG-NEXT:    v_mov_b32_e32 v16, s16
-; SDAG-NEXT:    v_mov_b32_e32 v17, s17
-; SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s12
+; SDAG-NEXT:    v_mov_b32_e32 v33, s13
+; SDAG-NEXT:    v_mov_b32_e32 v34, s14
+; SDAG-NEXT:    v_mov_b32_e32 v35, s15
+; SDAG-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s8
+; SDAG-NEXT:    v_mov_b32_e32 v33, s9
+; SDAG-NEXT:    v_mov_b32_e32 v34, s10
+; SDAG-NEXT:    v_mov_b32_e32 v35, s11
+; SDAG-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -1621,28 +1620,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
 ; HEURRC-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
 ; HEURRC-NEXT:    v_mov_b32_e32 v42, s22
 ; HEURRC-NEXT:    v_mov_b32_e32 v43, s23
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s16
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s17
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s18
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s19
 ; HEURRC-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    s_nop 2
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s16
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s17
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s18
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s19
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s12
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s13
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s14
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s15
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s12
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s13
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s14
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s15
+; HEURRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s8
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s9
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s10
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s11
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s8
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s9
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s10
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s11
+; HEURRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
@@ -1678,28 +1676,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
 ; VGPRRC-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
 ; VGPRRC-NEXT:    v_mov_b32_e32 v42, s22
 ; VGPRRC-NEXT:    v_mov_b32_e32 v43, s23
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s16
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s17
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s18
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s19
 ; VGPRRC-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    s_nop 2
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s16
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s17
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s18
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s12
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s13
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s14
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s12
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s13
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s14
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s15
+; VGPRRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s8
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s9
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s10
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s8
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s9
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s10
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s11
+; VGPRRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
@@ -1860,28 +1857,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
 ; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
 ; SDAG-NEXT:    v_mov_b32_e32 v42, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v43, s23
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
 ; SDAG-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    s_nop 2
-; SDAG-NEXT:    v_mov_b32_e32 v16, s16
-; SDAG-NEXT:    v_mov_b32_e32 v17, s17
-; SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s12
+; SDAG-NEXT:    v_mov_b32_e32 v33, s13
+; SDAG-NEXT:    v_mov_b32_e32 v34, s14
+; SDAG-NEXT:    v_mov_b32_e32 v35, s15
+; SDAG-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s8
+; SDAG-NEXT:    v_mov_b32_e32 v33, s9
+; SDAG-NEXT:    v_mov_b32_e32 v34, s10
+; SDAG-NEXT:    v_mov_b32_e32 v35, s11
+; SDAG-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -1963,28 +1959,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
 ; HEURRC-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
 ; HEURRC-NEXT:    v_mov_b32_e32 v42, s22
 ; HEURRC-NEXT:    v_mov_b32_e32 v43, s23
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s16
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s17
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s18
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s19
 ; HEURRC-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    s_nop 2
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s16
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s17
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s18
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s19
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s12
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s13
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s14
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s15
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s12
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s13
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s14
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s15
+; HEURRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s8
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s9
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s10
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s11
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s8
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s9
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s10
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s11
+; HEURRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
@@ -2020,28 +2015,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
 ; VGPRRC-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
 ; VGPRRC-NEXT:    v_mov_b32_e32 v42, s22
 ; VGPRRC-NEXT:    v_mov_b32_e32 v43, s23
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s16
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s17
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s18
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s19
 ; VGPRRC-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    s_nop 2
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s16
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s17
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s18
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s12
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s13
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s14
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s12
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s13
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s14
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s15
+; VGPRRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] offset:16 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s8
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s9
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s10
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s8
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s9
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s10
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s11
+; VGPRRC-NEXT:    global_store_dwordx4 v44, v[32:35], s[0:1] sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
@@ -3205,19 +3199,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[36:37], 16
+; VGPRRC-NEXT:    v_mov_b64_e32 v[38:39], 0
 ; VGPRRC-NEXT:    s_nop 7
-; VGPRRC-NEXT:    s_nop 3
+; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], 16
-; VGPRRC-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[4:7], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], 0
-; VGPRRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s16
@@ -3238,14 +3230,14 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s9
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s10
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s12
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s13
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s14
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_endpgm
 ; AGPR-LABEL: test_mfma_i32_32x32x32_i8:
@@ -3619,19 +3611,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31] cbsz:2 abid:3 blgp:1
+; VGPRRC-NEXT:    v_mov_b64_e32 v[36:37], 16
+; VGPRRC-NEXT:    v_mov_b64_e32 v[38:39], 0
 ; VGPRRC-NEXT:    s_nop 7
-; VGPRRC-NEXT:    s_nop 3
+; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], 16
-; VGPRRC-NEXT:    global_store_dwordx4 v[8:9], v[4:7], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[4:7], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], 0
-; VGPRRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s16
@@ -3652,14 +3642,14 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s9
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s10
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s12
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s13
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s14
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_endpgm
 ; AGPR-LABEL: test_mfma_i32_32x32x32_i8__flags:
@@ -4178,33 +4168,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
-; SDAG-NEXT:    s_nop 6
-; SDAG-NEXT:    v_mov_b32_e32 v16, s20
-; SDAG-NEXT:    v_mov_b32_e32 v17, s21
-; SDAG-NEXT:    v_mov_b32_e32 v18, s22
-; SDAG-NEXT:    v_mov_b32_e32 v19, s23
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s20
+; SDAG-NEXT:    v_mov_b32_e32 v33, s21
+; SDAG-NEXT:    v_mov_b32_e32 v34, s22
+; SDAG-NEXT:    v_mov_b32_e32 v35, s23
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s16
-; SDAG-NEXT:    v_mov_b32_e32 v17, s17
-; SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s12
+; SDAG-NEXT:    v_mov_b32_e32 v33, s13
+; SDAG-NEXT:    v_mov_b32_e32 v34, s14
+; SDAG-NEXT:    v_mov_b32_e32 v35, s15
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s8
+; SDAG-NEXT:    v_mov_b32_e32 v33, s9
+; SDAG-NEXT:    v_mov_b32_e32 v34, s10
+; SDAG-NEXT:    v_mov_b32_e32 v35, s11
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -4288,33 +4277,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
 ; HEURRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; HEURRC-NEXT:    s_nop 1
 ; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
-; HEURRC-NEXT:    s_nop 6
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s20
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s21
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s22
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s23
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s20
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s21
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s22
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s23
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s16
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s17
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s18
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s19
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s16
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s17
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s18
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s19
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s12
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s13
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s14
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s15
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s12
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s13
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s14
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s15
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s8
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s9
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s10
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s11
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s8
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s9
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s10
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s11
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
@@ -4352,33 +4340,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
-; VGPRRC-NEXT:    s_nop 6
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s20
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s21
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s22
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s23
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s20
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s21
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s22
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s23
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s16
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s17
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s18
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s16
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s17
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s18
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s19
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s12
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s13
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s14
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s12
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s13
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s14
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s15
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s8
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s9
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s10
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s8
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s9
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s10
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s11
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
@@ -4555,33 +4542,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; SDAG-NEXT:    s_nop 6
-; SDAG-NEXT:    v_mov_b32_e32 v16, s20
-; SDAG-NEXT:    v_mov_b32_e32 v17, s21
-; SDAG-NEXT:    v_mov_b32_e32 v18, s22
-; SDAG-NEXT:    v_mov_b32_e32 v19, s23
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s20
+; SDAG-NEXT:    v_mov_b32_e32 v33, s21
+; SDAG-NEXT:    v_mov_b32_e32 v34, s22
+; SDAG-NEXT:    v_mov_b32_e32 v35, s23
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s16
-; SDAG-NEXT:    v_mov_b32_e32 v17, s17
-; SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s12
+; SDAG-NEXT:    v_mov_b32_e32 v33, s13
+; SDAG-NEXT:    v_mov_b32_e32 v34, s14
+; SDAG-NEXT:    v_mov_b32_e32 v35, s15
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s8
+; SDAG-NEXT:    v_mov_b32_e32 v33, s9
+; SDAG-NEXT:    v_mov_b32_e32 v34, s10
+; SDAG-NEXT:    v_mov_b32_e32 v35, s11
+; SDAG-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -4665,33 +4651,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
 ; HEURRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; HEURRC-NEXT:    s_nop 1
 ; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; HEURRC-NEXT:    s_nop 6
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s20
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s21
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s22
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s23
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s20
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s21
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s22
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s23
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s16
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s17
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s18
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s19
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s16
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s17
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s18
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s19
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s12
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s13
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s14
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s15
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s12
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s13
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s14
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s15
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s8
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s9
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s10
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s11
-; HEURRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s8
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s9
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s10
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s11
+; HEURRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
@@ -4729,33 +4714,32 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; VGPRRC-NEXT:    s_nop 1
 ; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; VGPRRC-NEXT:    s_nop 6
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s20
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s21
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s22
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s23
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s20
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s21
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s22
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s23
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:48 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s16
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s17
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s18
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s16
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s17
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s18
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s19
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s12
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s13
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s14
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s12
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s13
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s14
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s15
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] offset:16 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s8
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s9
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s10
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s8
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s9
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s10
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s11
+; VGPRRC-NEXT:    global_store_dwordx4 v40, v[32:35], s[0:1] sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir
new file mode 100644
index 0000000000000..271b36fad2bb4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.hint.hazard.barrier.gfx942.mir
@@ -0,0 +1,1292 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -stop-after=virtregrewriter,2 -amdgpu-avoid-hazard-hint-for-mfma=false %s -o - | FileCheck -check-prefix=GFX942_WITHOUT %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=machine-scheduler -stop-after=virtregrewriter,2 -amdgpu-avoid-hazard-hint-for-mfma=true %s -o - | FileCheck -check-prefix=GFX942_WITH %s
+
+--- |
+  target triple = "amdgcn-amd-amdhsa"
+
+  define amdgpu_kernel void @test_software_pipelining() #0 {
+    bb.0:
+      ret void
+  }
+
+  attributes #0 = {nounwind "amdgpu-waves-per-eu"="2"  "amdgpu-agpr-alloc"="0" "frame-pointer"="none"}
+
+...
+---
+name:            test_software_pipelining
+body:             |
+  bb.0:
+    ; GFX942_WITHOUT-LABEL: name: test_software_pipelining
+    ; GFX942_WITHOUT: renamable $vgpr115 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr109 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr110 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr76 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr108 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr52 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr100 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr111 = V_ADD_U32_e32 4096, $vgpr100, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr101 = V_ADD_U32_e32 $vgpr76, killed $vgpr52, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr112 = V_ADD_U32_e32 4096, $vgpr101, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr112, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr111, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr52_vgpr53, $vgpr80_vgpr81, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = DS_READ_B128_gfx9 renamable $vgpr108, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr54_vgpr55, $vgpr82_vgpr83, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr52_vgpr53, $vgpr92_vgpr93, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr54_vgpr55, $vgpr94_vgpr95, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr52_vgpr53, $vgpr80_vgpr81, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = DS_READ_B128_gfx9 renamable $vgpr108, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr54_vgpr55, $vgpr82_vgpr83, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr52_vgpr53, $vgpr92_vgpr93, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr54_vgpr55, $vgpr94_vgpr95, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr68_vgpr69, $vgpr80_vgpr81, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr76, killed $vgpr0, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr100, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr70_vgpr71, $vgpr82_vgpr83, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr68_vgpr69, $vgpr92_vgpr93, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr70_vgpr71, $vgpr94_vgpr95, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = DS_READ_B128_gfx9 renamable $vgpr108, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr72_vgpr73, $vgpr80_vgpr81, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr101, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr74_vgpr75, $vgpr82_vgpr83, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr72_vgpr73, $vgpr92_vgpr93, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr74_vgpr75, $vgpr94_vgpr95, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr108, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr80_vgpr81, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr72_vgpr73_vgpr74_vgpr75, 16384, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr66_vgpr67, $vgpr82_vgpr83, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr64_vgpr65, $vgpr92_vgpr93, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr66_vgpr67, $vgpr94_vgpr95, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = DS_READ_B128_gfx9 renamable $vgpr108, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr80_vgpr81, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr56_vgpr57_vgpr58_vgpr59, 20480, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr82_vgpr83, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr92_vgpr93, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr94_vgpr95, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128_gfx9 renamable $vgpr108, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr48_vgpr49, $vgpr80_vgpr81, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr100, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr50_vgpr51, $vgpr82_vgpr83, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr48_vgpr49, $vgpr92_vgpr93, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr50_vgpr51, $vgpr94_vgpr95, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128_gfx9 renamable $vgpr110, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr80_vgpr81, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr101, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr26_vgpr27, killed $vgpr82_vgpr83, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr92_vgpr93, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr26_vgpr27, killed $vgpr94_vgpr95, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr120 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = DS_READ_B128_gfx9 renamable $vgpr120, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr104_vgpr105_vgpr106_vgpr107 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr104_vgpr105, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr116_vgpr117_vgpr118_vgpr119 = DS_READ_B128_gfx9 renamable $vgpr120, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr22_vgpr23, $vgpr106_vgpr107, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr100_vgpr101, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr22_vgpr23, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr104_vgpr105, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128_gfx9 renamable $vgpr120, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr18_vgpr19, $vgpr106_vgpr107, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr100_vgpr101, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr18_vgpr19, $vgpr102_vgpr103, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr116_vgpr117, $vgpr104_vgpr105, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr16 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr114 = V_ADD_U32_e32 $vgpr115, killed $vgpr16, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr114, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr118_vgpr119, $vgpr106_vgpr107, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr116_vgpr117, $vgpr100_vgpr101, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr118_vgpr119, $vgpr102_vgpr103, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = DS_READ_B128_gfx9 renamable $vgpr120, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr104_vgpr105, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr113 = V_ADD_U32_e32 $vgpr115, killed $vgpr20, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr113, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr26_vgpr27, $vgpr106_vgpr107, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr100_vgpr101, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr26_vgpr27, $vgpr102_vgpr103, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr116_vgpr117_vgpr118_vgpr119 = DS_READ_B128_gfx9 renamable $vgpr120, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr76_vgpr77, $vgpr104_vgpr105, killed $vgpr96_vgpr97_vgpr98_vgpr99, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr84_vgpr85_vgpr86_vgpr87, 24576, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr78_vgpr79, $vgpr106_vgpr107, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr76_vgpr77, $vgpr100_vgpr101, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr78_vgpr79, $vgpr102_vgpr103, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr120, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr116_vgpr117, $vgpr104_vgpr105, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr84_vgpr85_vgpr86_vgpr87, 28672, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr118_vgpr119, $vgpr106_vgpr107, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr116_vgpr117, $vgpr100_vgpr101, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr118_vgpr119, $vgpr102_vgpr103, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr118_vgpr119_vgpr120_vgpr121 = DS_READ_B128_gfx9 killed renamable $vgpr120, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr104_vgpr105, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr56 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr116 = V_ADD_U32_e32 $vgpr115, killed $vgpr56, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr116, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr106_vgpr107, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr100_vgpr101, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr72 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr115 = V_ADD_U32_e32 killed $vgpr115, killed $vgpr72, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr102_vgpr103, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr115, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: S_WAITCNT 49279
+    ; GFX942_WITHOUT-NEXT: S_BARRIER
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr108, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr118_vgpr119, $vgpr104_vgpr105, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr120_vgpr121, killed $vgpr106_vgpr107, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr118_vgpr119, $vgpr100_vgpr101, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr120_vgpr121, killed $vgpr102_vgpr103, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr108, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_BARRIER 0
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr104_vgpr105_vgpr106_vgpr107 = DS_READ_B128_gfx9 renamable $vgpr108, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr108, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr104_vgpr105, $vgpr0_vgpr1, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr96_vgpr97_vgpr98_vgpr99 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr111, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr2_vgpr3, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr104_vgpr105, $vgpr4_vgpr5, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr106_vgpr107, $vgpr6_vgpr7, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr108, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr104_vgpr105_vgpr106_vgpr107 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr112, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr108, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = DS_READ_B128_gfx9 renamable $vgpr108, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr20_vgpr21_vgpr22_vgpr23, 4096, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = DS_READ_B128_gfx9 renamable $vgpr108, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr68_vgpr69, $vgpr0_vgpr1, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr96_vgpr97_vgpr98_vgpr99 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr114, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr70_vgpr71, $vgpr2_vgpr3, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr68_vgpr69, $vgpr4_vgpr5, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr70_vgpr71, $vgpr6_vgpr7, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = DS_READ_B128_gfx9 killed renamable $vgpr110, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr0_vgpr1, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr96_vgpr97_vgpr98_vgpr99 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr113, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr86_vgpr87, killed $vgpr2_vgpr3, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr4_vgpr5, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr86_vgpr87, killed $vgpr6_vgpr7, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr92 = IMPLICIT_DEF
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = DS_READ_B128_gfx9 renamable $vgpr92, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr72_vgpr73, $vgpr8_vgpr9, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = DS_READ_B128_gfx9 renamable $vgpr92, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr74_vgpr75, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr72_vgpr73, $vgpr12_vgpr13, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr74_vgpr75, $vgpr14_vgpr15, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr80_vgpr81, $vgpr8_vgpr9, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = DS_READ_B128_gfx9 renamable $vgpr92, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr82_vgpr83, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr80_vgpr81, $vgpr12_vgpr13, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr82_vgpr83, $vgpr14_vgpr15, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr8_vgpr9, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr44_vgpr45_vgpr46_vgpr47 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr116, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr86_vgpr87, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr12_vgpr13, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr86_vgpr87, $vgpr14_vgpr15, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128_gfx9 renamable $vgpr92, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr40_vgpr41, $vgpr8_vgpr9, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr115, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr42_vgpr43, $vgpr10_vgpr11, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr40_vgpr41, $vgpr12_vgpr13, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr42_vgpr43, $vgpr14_vgpr15, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = DS_READ_B128_gfx9 renamable $vgpr92, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr8_vgpr9, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr109, killed renamable $vgpr56_vgpr57_vgpr58_vgpr59, 8192, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr30_vgpr31, $vgpr10_vgpr11, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr12_vgpr13, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr30_vgpr31, $vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = DS_READ_B128_gfx9 renamable $vgpr92, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr8_vgpr9, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: DS_WRITE_B128_gfx9 killed renamable $vgpr109, killed renamable $vgpr60_vgpr61_vgpr62_vgpr63, 12288, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr26_vgpr27, $vgpr10_vgpr11, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr24_vgpr25, $vgpr12_vgpr13, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr26_vgpr27, $vgpr14_vgpr15, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128_gfx9 killed renamable $vgpr92, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr8_vgpr9, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr18_vgpr19, $vgpr10_vgpr11, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr12_vgpr13, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr18_vgpr19, $vgpr14_vgpr15, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: S_WAITCNT 49279
+    ; GFX942_WITHOUT-NEXT: S_BARRIER
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = DS_READ_B128_gfx9 renamable $vgpr108, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr8_vgpr9, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr22_vgpr23, killed $vgpr10_vgpr11, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr12_vgpr13, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr22_vgpr23, killed $vgpr14_vgpr15, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITHOUT-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = DS_READ_B128_gfx9 killed renamable $vgpr108, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITHOUT-NEXT: SCHED_BARRIER 0
+    ; GFX942_WITHOUT-NEXT: S_ENDPGM 0
+    ;
+    ; GFX942_WITH-LABEL: name: test_software_pipelining
+    ; GFX942_WITH: renamable $vgpr96 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr121 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr122 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr52 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr120 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr97 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr123 = V_ADD_U32_e32 4096, $vgpr97, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr102 = V_ADD_U32_e32 $vgpr52, killed $vgpr0, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr124 = V_ADD_U32_e32 4096, $vgpr102, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr124, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr123, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr0_vgpr1, $vgpr80_vgpr81, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = DS_READ_B128_gfx9 renamable $vgpr120, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr2_vgpr3, $vgpr82_vgpr83, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr0_vgpr1, $vgpr92_vgpr93, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr2_vgpr3, $vgpr94_vgpr95, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr0_vgpr1, $vgpr80_vgpr81, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = DS_READ_B128_gfx9 renamable $vgpr120, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr2_vgpr3, $vgpr82_vgpr83, killed $vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr0_vgpr1, $vgpr92_vgpr93, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr2_vgpr3, $vgpr94_vgpr95, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr4_vgpr5, $vgpr80_vgpr81, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr0 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: dead renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr52, killed $vgpr0, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr97, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr6_vgpr7, $vgpr82_vgpr83, killed $vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr4_vgpr5, $vgpr92_vgpr93, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr6_vgpr7, $vgpr94_vgpr95, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr98_vgpr99_vgpr100_vgpr101 = DS_READ_B128_gfx9 renamable $vgpr120, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr80_vgpr81, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr102, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr104_vgpr105_vgpr106_vgpr107 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr90_vgpr91, $vgpr82_vgpr83, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr92_vgpr93, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr90_vgpr91, $vgpr94_vgpr95, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = DS_READ_B128_gfx9 renamable $vgpr120, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr80_vgpr81, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr108_vgpr109_vgpr110_vgpr111 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr108_vgpr109_vgpr110_vgpr111, 16384, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr116_vgpr117_vgpr118_vgpr119 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr82_vgpr83, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr92_vgpr93, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr100_vgpr101, $vgpr94_vgpr95, killed $vgpr60_vgpr61_vgpr62_vgpr63, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr98_vgpr99_vgpr100_vgpr101 = DS_READ_B128_gfx9 renamable $vgpr120, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr80_vgpr81, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr108_vgpr109_vgpr110_vgpr111 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr108_vgpr109_vgpr110_vgpr111, 20480, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr108_vgpr109_vgpr110_vgpr111 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr90_vgpr91, $vgpr82_vgpr83, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr92_vgpr93, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr90_vgpr91, $vgpr94_vgpr95, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr112_vgpr113_vgpr114_vgpr115 = DS_READ_B128_gfx9 renamable $vgpr120, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr80_vgpr81, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr8_vgpr9_vgpr10_vgpr11 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr97, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr82_vgpr83, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr92_vgpr93, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr100_vgpr101, $vgpr94_vgpr95, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = DS_READ_B128_gfx9 renamable $vgpr122, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr112_vgpr113, $vgpr80_vgpr81, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr12_vgpr13_vgpr14_vgpr15 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr102, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr114_vgpr115, killed $vgpr82_vgpr83, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr112_vgpr113, $vgpr92_vgpr93, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr114_vgpr115, killed $vgpr94_vgpr95, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr97 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = DS_READ_B128_gfx9 renamable $vgpr97, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr112_vgpr113_vgpr114_vgpr115 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr112_vgpr113, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = DS_READ_B128_gfx9 renamable $vgpr97, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr18_vgpr19, $vgpr114_vgpr115, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr16_vgpr17, $vgpr100_vgpr101, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr18_vgpr19, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr112_vgpr113, killed $vgpr56_vgpr57_vgpr58_vgpr59, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = DS_READ_B128_gfx9 renamable $vgpr97, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr22_vgpr23, $vgpr114_vgpr115, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr20_vgpr21, $vgpr100_vgpr101, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr22_vgpr23, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr112_vgpr113, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr16 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr126 = V_ADD_U32_e32 $vgpr96, killed $vgpr16, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr126, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr30_vgpr31, $vgpr114_vgpr115, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr28_vgpr29, $vgpr100_vgpr101, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr30_vgpr31, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr97, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr112_vgpr113, killed $vgpr104_vgpr105_vgpr106_vgpr107, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr20 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr125 = V_ADD_U32_e32 $vgpr96, killed $vgpr20, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr125, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr58_vgpr59, $vgpr114_vgpr115, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr100_vgpr101, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr58_vgpr59, $vgpr102_vgpr103, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = DS_READ_B128_gfx9 renamable $vgpr97, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr112_vgpr113, killed $vgpr116_vgpr117_vgpr118_vgpr119, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr84_vgpr85_vgpr86_vgpr87, 24576, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr114_vgpr115, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr100_vgpr101, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr102_vgpr103, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = DS_READ_B128_gfx9 renamable $vgpr97, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr112_vgpr113, killed $vgpr108_vgpr109_vgpr110_vgpr111, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr104_vgpr105_vgpr106_vgpr107 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr104_vgpr105_vgpr106_vgpr107, 28672, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr58_vgpr59, $vgpr114_vgpr115, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr100_vgpr101, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr58_vgpr59, $vgpr102_vgpr103, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr106_vgpr107_vgpr108_vgpr109 = DS_READ_B128_gfx9 killed renamable $vgpr97, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr112_vgpr113, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr56 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr104 = V_ADD_U32_e32 $vgpr96, killed $vgpr56, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr104, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr62_vgpr63, $vgpr114_vgpr115, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr60_vgpr61, $vgpr100_vgpr101, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr60 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr127 = V_ADD_U32_e32 killed $vgpr96, killed $vgpr60, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr62_vgpr63, $vgpr102_vgpr103, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr60_vgpr61_vgpr62_vgpr63 = BUFFER_LOAD_DWORDX4_OFFEN renamable $vgpr127, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: S_WAITCNT 49279
+    ; GFX942_WITH-NEXT: S_BARRIER
+    ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr120, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr112_vgpr113, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr108_vgpr109, killed $vgpr114_vgpr115, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr100_vgpr101, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr92_vgpr93_vgpr94_vgpr95 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr108_vgpr109, killed $vgpr102_vgpr103, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr120, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_BARRIER 0
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr112_vgpr113_vgpr114_vgpr115 = DS_READ_B128_gfx9 renamable $vgpr120, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr120, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr48_vgpr49_vgpr50_vgpr51 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr44_vgpr45_vgpr46_vgpr47 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr112_vgpr113, $vgpr0_vgpr1, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr96_vgpr97_vgpr98_vgpr99 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr123, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr52_vgpr53_vgpr54_vgpr55 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr114_vgpr115, $vgpr2_vgpr3, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr112_vgpr113, $vgpr4_vgpr5, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr114_vgpr115, $vgpr6_vgpr7, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr120, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr106_vgpr107_vgpr108_vgpr109 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr124, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr120, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr64_vgpr65_vgpr66_vgpr67 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr120, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr84_vgpr85_vgpr86_vgpr87, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr20_vgpr21_vgpr22_vgpr23, 4096, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr20_vgpr21_vgpr22_vgpr23 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr2_vgpr3, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr68_vgpr69_vgpr70_vgpr71 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr6_vgpr7, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr120, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr0_vgpr1, killed $vgpr88_vgpr89_vgpr90_vgpr91, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr106_vgpr107_vgpr108_vgpr109 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr126, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr76_vgpr77_vgpr78_vgpr79 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr2_vgpr3, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr4_vgpr5, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr72_vgpr73_vgpr74_vgpr75 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr6_vgpr7, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 killed renamable $vgpr122, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr0_vgpr1, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr106_vgpr107_vgpr108_vgpr109 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr125, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, killed $vgpr2_vgpr3, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr80_vgpr81_vgpr82_vgpr83 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr4_vgpr5, killed $vgpr92_vgpr93_vgpr94_vgpr95, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, killed $vgpr6_vgpr7, killed $vgpr80_vgpr81_vgpr82_vgpr83, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr105 = IMPLICIT_DEF
+    ; GFX942_WITH-NEXT: renamable $vgpr100_vgpr101_vgpr102_vgpr103 = DS_READ_B128_gfx9 renamable $vgpr105, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr8_vgpr9, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr106_vgpr107_vgpr108_vgpr109 = DS_READ_B128_gfx9 renamable $vgpr105, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr12_vgpr13, killed $vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr14_vgpr15, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr8_vgpr9, killed $vgpr48_vgpr49_vgpr50_vgpr51, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr96_vgpr97_vgpr98_vgpr99 = DS_READ_B128_gfx9 renamable $vgpr105, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr102_vgpr103, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr100_vgpr101, $vgpr12_vgpr13, killed $vgpr44_vgpr45_vgpr46_vgpr47, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr102_vgpr103, $vgpr14_vgpr15, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr8_vgpr9, killed $vgpr52_vgpr53_vgpr54_vgpr55, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr84_vgpr85_vgpr86_vgpr87 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr104, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: dead renamable $vgpr36_vgpr37_vgpr38_vgpr39 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr108_vgpr109, $vgpr10_vgpr11, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr106_vgpr107, $vgpr12_vgpr13, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr108_vgpr109, $vgpr14_vgpr15, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr84_vgpr85_vgpr86_vgpr87 = DS_READ_B128_gfx9 renamable $vgpr105, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr8_vgpr9, killed $vgpr32_vgpr33_vgpr34_vgpr35, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr88_vgpr89_vgpr90_vgpr91 = BUFFER_LOAD_DWORDX4_OFFEN killed renamable $vgpr127, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    ; GFX942_WITH-NEXT: dead renamable $vgpr28_vgpr29_vgpr30_vgpr31 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr98_vgpr99, $vgpr10_vgpr11, killed $vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr96_vgpr97, $vgpr12_vgpr13, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr24_vgpr25_vgpr26_vgpr27 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr98_vgpr99, $vgpr14_vgpr15, killed $vgpr24_vgpr25_vgpr26_vgpr27, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr88_vgpr89_vgpr90_vgpr91 = DS_READ_B128_gfx9 renamable $vgpr105, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr8_vgpr9, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 renamable $vgpr121, killed renamable $vgpr56_vgpr57_vgpr58_vgpr59, 8192, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr86_vgpr87, $vgpr10_vgpr11, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr84_vgpr85, $vgpr12_vgpr13, killed $vgpr64_vgpr65_vgpr66_vgpr67, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr86_vgpr87, $vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr56_vgpr57_vgpr58_vgpr59 = DS_READ_B128_gfx9 renamable $vgpr105, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr8_vgpr9, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: DS_WRITE_B128_gfx9 killed renamable $vgpr121, killed renamable $vgpr60_vgpr61_vgpr62_vgpr63, 12288, 0, implicit $exec :: (store (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr90_vgpr91, $vgpr10_vgpr11, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr88_vgpr89, $vgpr12_vgpr13, killed $vgpr68_vgpr69_vgpr70_vgpr71, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr90_vgpr91, $vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = DS_READ_B128_gfx9 killed renamable $vgpr105, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr8_vgpr9, killed $vgpr76_vgpr77_vgpr78_vgpr79, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr58_vgpr59, $vgpr10_vgpr11, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr56_vgpr57, $vgpr12_vgpr13, killed $vgpr72_vgpr73_vgpr74_vgpr75, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr16_vgpr17_vgpr18_vgpr19 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr58_vgpr59, $vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: S_WAITCNT 49279
+    ; GFX942_WITH-NEXT: S_BARRIER
+    ; GFX942_WITH-NEXT: dead renamable $vgpr44_vgpr45_vgpr46_vgpr47 = DS_READ_B128_gfx9 renamable $vgpr120, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr40_vgpr41, $vgpr8_vgpr9, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr42_vgpr43, killed $vgpr10_vgpr11, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 $vgpr40_vgpr41, $vgpr12_vgpr13, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr0_vgpr1_vgpr2_vgpr3 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 killed $vgpr42_vgpr43, killed $vgpr14_vgpr15, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, 0, implicit $mode, implicit $exec
+    ; GFX942_WITH-NEXT: dead renamable $vgpr8_vgpr9_vgpr10_vgpr11 = DS_READ_B128_gfx9 killed renamable $vgpr120, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 512, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 32, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 8, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_GROUP_BARRIER 256, 1, 0
+    ; GFX942_WITH-NEXT: SCHED_BARRIER 0
+    ; GFX942_WITH-NEXT: S_ENDPGM 0
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vgpr_32 = IMPLICIT_DEF
+    %2:vgpr_32 = IMPLICIT_DEF
+    %3:vgpr_32 = IMPLICIT_DEF
+    %4:vgpr_32 = IMPLICIT_DEF
+    %5:sgpr_128 = IMPLICIT_DEF
+    %6:sgpr_128 = IMPLICIT_DEF
+    %7:vgpr_32 = IMPLICIT_DEF
+    %8:vreg_128_align2 = IMPLICIT_DEF
+    %9:vreg_128_align2 = IMPLICIT_DEF
+    %10:vreg_128_align2 = IMPLICIT_DEF
+    %11:vreg_128_align2 = IMPLICIT_DEF
+    %12:vreg_128_align2 = IMPLICIT_DEF
+    %13:vreg_128_align2 = IMPLICIT_DEF
+    %14:vreg_128_align2 = IMPLICIT_DEF
+    %15:vreg_128_align2 = IMPLICIT_DEF
+    %16:vreg_128_align2 = IMPLICIT_DEF
+    %17:vreg_128_align2 = IMPLICIT_DEF
+    %18:vreg_128_align2 = IMPLICIT_DEF
+    %19:vreg_128_align2 = IMPLICIT_DEF
+    %20:vreg_128_align2 = IMPLICIT_DEF
+    %21:vreg_128_align2 = IMPLICIT_DEF
+    %22:vreg_128_align2 = IMPLICIT_DEF
+    %23:vreg_128_align2 = IMPLICIT_DEF
+    %25:vgpr_32 = IMPLICIT_DEF
+    %24:vgpr_32 = V_ADD_U32_e32 4096, %25:vgpr_32, implicit $exec
+    %27:vgpr_32 = V_ADD_U32_e32 %3:vgpr_32, %7:vgpr_32, implicit $exec
+    %26:vgpr_32 = V_ADD_U32_e32 4096, %27:vgpr_32, implicit $exec
+    %28:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %26:vgpr_32, %6:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %29:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %24:vgpr_32, %6:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %31:vreg_128_align2 = IMPLICIT_DEF
+    %30:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %23:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %32:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    %33:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %30:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %34:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %22:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %35:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %31.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %34:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %37:vreg_128_align2 = IMPLICIT_DEF
+    %36:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %21:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %38:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    %39:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %36:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %40:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %20:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %41:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %37.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %40:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %42:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %19:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %43:vgpr_32 = IMPLICIT_DEF
+    %925:vgpr_32 = V_ADD_U32_e32 %3:vgpr_32, %43:vgpr_32, implicit $exec
+    %44:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %25:vgpr_32, %6:sgpr_128, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %45:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %42:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %46:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %18:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %47:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %32.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %46:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %48:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
+    %49:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %17:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %50:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %27:vgpr_32, %6:sgpr_128, 0, 2048, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %51:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %49:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %52:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %16:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %53:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %38.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %52:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %54:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
+    %55:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %48.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %15:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %56:vreg_128_align2 = IMPLICIT_DEF
+    DS_WRITE_B128_gfx9 %1:vgpr_32, %56:vreg_128_align2, 16384, 0, implicit $exec :: (store (s128), addrspace 3)
+    %57:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %48.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %55:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %58:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %48.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %14:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %59:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %48.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %58:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %60:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
+    %61:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %54.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %13:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %62:vreg_128_align2 = IMPLICIT_DEF
+    DS_WRITE_B128_gfx9 %1:vgpr_32, %62:vreg_128_align2, 20480, 0, implicit $exec :: (store (s128), addrspace 3)
+    %63:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %54.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %61:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %64:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %54.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %12:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %65:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %54.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %64:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %66:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
+    %67:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %60.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %11:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %68:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %25:vgpr_32, %6:sgpr_128, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %69:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %60.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %67:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %70:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %60.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %10:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %71:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %60.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %70:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %72:vreg_128_align2 = DS_READ_B128_gfx9 %2:vgpr_32, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+    %73:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %66.sub0_sub1:vreg_128_align2, %29.sub0_sub1:vreg_128_align2, %9:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %74:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %27:vgpr_32, %6:sgpr_128, 0, 3072, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %75:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %66.sub2_sub3:vreg_128_align2, %29.sub2_sub3:vreg_128_align2, %73:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %76:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %66.sub0_sub1:vreg_128_align2, %28.sub0_sub1:vreg_128_align2, %8:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %77:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %66.sub2_sub3:vreg_128_align2, %28.sub2_sub3:vreg_128_align2, %76:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %79:vgpr_32 = IMPLICIT_DEF
+    %78:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
+    %81:vreg_128_align2 = IMPLICIT_DEF
+    %80:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %72.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %33:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %82:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 4096, 0, implicit $exec :: (load (s128), addrspace 3)
+    %83:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %72.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %80:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %85:vreg_128_align2 = IMPLICIT_DEF
+    %84:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %72.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %35:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %86:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %72.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %84:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %87:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %78.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %39:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %88:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 6144, 0, implicit $exec :: (load (s128), addrspace 3)
+    %89:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %78.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %87:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %90:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %78.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %41:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %91:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %78.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %90:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %92:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %82.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %45:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %94:vgpr_32 = IMPLICIT_DEF
+    %93:vgpr_32 = V_ADD_U32_e32 %0:vgpr_32, %94:vgpr_32, implicit $exec
+    %95:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %93:vgpr_32, %5:sgpr_128, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %96:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %82.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %92:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %97:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %82.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %47:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %98:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %82.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %97:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %99:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 8192, 0, implicit $exec :: (load (s128), addrspace 3)
+    %100:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %88.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %51:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %102:vgpr_32 = IMPLICIT_DEF
+    %101:vgpr_32 = V_ADD_U32_e32 %0:vgpr_32, %102:vgpr_32, implicit $exec
+    %103:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %101:vgpr_32, %5:sgpr_128, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %104:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %88.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %100:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %105:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %88.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %53:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %106:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %88.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %105:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %107:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 10240, 0, implicit $exec :: (load (s128), addrspace 3)
+    %108:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %99.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %57:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %109:vreg_128_align2 = IMPLICIT_DEF
+    DS_WRITE_B128_gfx9 %1:vgpr_32, %109:vreg_128_align2, 24576, 0, implicit $exec :: (store (s128), addrspace 3)
+    %110:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %99.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %108:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %111:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %99.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %59:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %112:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %99.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %111:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %113:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 12288, 0, implicit $exec :: (load (s128), addrspace 3)
+    %114:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %107.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %63:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %115:vreg_128_align2 = IMPLICIT_DEF
+    DS_WRITE_B128_gfx9 %1:vgpr_32, %115:vreg_128_align2, 28672, 0, implicit $exec :: (store (s128), addrspace 3)
+    %116:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %107.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %114:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %117:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %107.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %65:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %118:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %107.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %117:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %119:vreg_128_align2 = DS_READ_B128_gfx9 %79:vgpr_32, 14336, 0, implicit $exec :: (load (s128), addrspace 3)
+    %120:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %113.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %69:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %122:vgpr_32 = IMPLICIT_DEF
+    %121:vgpr_32 = V_ADD_U32_e32 %0:vgpr_32, %122:vgpr_32, implicit $exec
+    %123:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %121:vgpr_32, %5:sgpr_128, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %124:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %113.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %120:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %125:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %113.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %71:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %127:vgpr_32 = IMPLICIT_DEF
+    %126:vgpr_32 = V_ADD_U32_e32 %0:vgpr_32, %127:vgpr_32, implicit $exec
+    %128:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %113.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %125:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %129:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %126:vgpr_32, %5:sgpr_128, 0, 256, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    S_WAITCNT 49279
+    S_BARRIER
+    %130:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
+    %131:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %119.sub0_sub1:vreg_128_align2, %81.sub0_sub1:vreg_128_align2, %75:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %132:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %119.sub2_sub3:vreg_128_align2, %81.sub2_sub3:vreg_128_align2, %131:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %133:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %119.sub0_sub1:vreg_128_align2, %85.sub0_sub1:vreg_128_align2, %77:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %134:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %119.sub2_sub3:vreg_128_align2, %85.sub2_sub3:vreg_128_align2, %133:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %135:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 512, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 512, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 512, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 512, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_BARRIER 0
+    %136:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %135.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %83:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %137:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
+    %138:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %135.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %136:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %139:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %135.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %86:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %140:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %135.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %139:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %141:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %130.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %89:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %142:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
+    %143:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %130.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %141:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %144:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %130.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %91:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %145:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %130.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %144:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %146:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %137.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %96:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %147:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %137.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %146:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %148:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %137.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %98:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %149:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %137.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %148:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %150:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
+    %151:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %142.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %104:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %152:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %142.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %151:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %153:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %142.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %106:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %154:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %142.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %153:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %155:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
+    %156:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %150.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %110:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    DS_WRITE_B128_gfx9 %1:vgpr_32, %95:vreg_128_align2, 0, 0, implicit $exec :: (store (s128), addrspace 3)
+    %157:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %150.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %156:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %158:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %150.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %112:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %159:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %150.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %158:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %160:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
+    %161:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %155.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %116:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    DS_WRITE_B128_gfx9 %1:vgpr_32, %103:vreg_128_align2, 4096, 0, implicit $exec :: (store (s128), addrspace 3)
+    %162:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %155.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %161:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %163:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %155.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %118:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %164:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %155.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %163:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %165:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
+    %166:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %160.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %124:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %981:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %24:vgpr_32, %6:sgpr_128, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %167:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %160.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %166:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %168:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %160.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %128:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %169:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %160.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %168:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %170:vreg_128_align2 = DS_READ_B128_gfx9 %2:vgpr_32, 16384, 0, implicit $exec :: (load (s128), addrspace 3)
+    %171:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %165.sub0_sub1:vreg_128_align2, %44.sub0_sub1:vreg_128_align2, %132:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %985:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %26:vgpr_32, %6:sgpr_128, 0, 1024, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %172:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %165.sub2_sub3:vreg_128_align2, %44.sub2_sub3:vreg_128_align2, %171:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %173:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %165.sub0_sub1:vreg_128_align2, %50.sub0_sub1:vreg_128_align2, %134:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %174:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %165.sub2_sub3:vreg_128_align2, %50.sub2_sub3:vreg_128_align2, %173:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %176:vgpr_32 = IMPLICIT_DEF
+    %175:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 18432, 0, implicit $exec :: (load (s128), addrspace 3)
+    %177:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %170.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %138:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %178:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 20480, 0, implicit $exec :: (load (s128), addrspace 3)
+    %179:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %170.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %177:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %180:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %170.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %140:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %962:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %170.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %180:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %182:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %175.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %143:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %183:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 22528, 0, implicit $exec :: (load (s128), addrspace 3)
+    %961:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %175.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %182:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %185:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %175.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %145:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %960:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %175.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %185:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %187:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %178.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %147:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %956:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %93:vgpr_32, %5:sgpr_128, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %959:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %178.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %187:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %189:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %178.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %149:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %958:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %178.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %189:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %191:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 24576, 0, implicit $exec :: (load (s128), addrspace 3)
+    %192:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %183.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %152:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %962:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %101:vgpr_32, %5:sgpr_128, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %957:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %183.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %192:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %194:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %183.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %154:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %956:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %183.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %194:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %196:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 26624, 0, implicit $exec :: (load (s128), addrspace 3)
+    %197:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %191.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %157:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    DS_WRITE_B128_gfx9 %1:vgpr_32, %123:vreg_128_align2, 8192, 0, implicit $exec :: (store (s128), addrspace 3)
+    %955:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %191.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %197:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %199:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %191.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %159:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %954:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %191.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %199:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %201:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 28672, 0, implicit $exec :: (load (s128), addrspace 3)
+    %202:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %196.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %162:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    DS_WRITE_B128_gfx9 %1:vgpr_32, %129:vreg_128_align2, 12288, 0, implicit $exec :: (store (s128), addrspace 3)
+    %953:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %196.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %202:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %204:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %196.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %164:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %952:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %196.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %204:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %206:vreg_128_align2 = DS_READ_B128_gfx9 %176:vgpr_32, 30720, 0, implicit $exec :: (load (s128), addrspace 3)
+    %207:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %201.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %167:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %910:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %121:vgpr_32, %5:sgpr_128, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    %951:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %201.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %207:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %209:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %201.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %169:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %950:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %201.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %209:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %911:vreg_128_align2 = BUFFER_LOAD_DWORDX4_OFFEN %126:vgpr_32, %5:sgpr_128, 0, 384, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+    S_WAITCNT 49279
+    S_BARRIER
+    %937:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 2048, 0, implicit $exec :: (load (s128), addrspace 3)
+    %211:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %206.sub0_sub1:vreg_128_align2, %68.sub0_sub1:vreg_128_align2, %172:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %949:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %206.sub2_sub3:vreg_128_align2, %68.sub2_sub3:vreg_128_align2, %211:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %213:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %206.sub0_sub1:vreg_128_align2, %74.sub0_sub1:vreg_128_align2, %174:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %948:vreg_128_align2 = contract V_MFMA_F32_16X16X32_FP8_FP8_vgprcd_e64 %206.sub2_sub3:vreg_128_align2, %74.sub2_sub3:vreg_128_align2, %213:vreg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %931:vreg_128_align2 = DS_READ_B128_gfx9 %4:vgpr_32, 0, 0, implicit $exec :: (load (s128), addrspace 3)
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 512, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 512, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 512, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 512, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 32, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 8, 1, 0
+    SCHED_GROUP_BARRIER 256, 1, 0
+    SCHED_BARRIER 0
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index 78be949baabac..454f6e0da1151 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -809,17 +809,17 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 {
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 3
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -1174,18 +1174,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x4_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x4_f32 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 7
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -2035,21 +2035,21 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr add
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4f16:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, s4
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, s5
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, s5
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, s6
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, s7
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, s6
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, s7
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x4_16b_f16 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 4
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -2433,21 +2433,21 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr
 ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16f16:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, s4
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, s5
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, s5
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, s6
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, s7
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, s6
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, s7
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_f32_16x16x16_f16 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 6
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -3357,17 +3357,17 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 {
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_i32_4x4x4_16b_i8 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_i32_4x4x4_16b_i8 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_nop 4
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x i32>, ptr addrspace(1) %arg
@@ -3547,19 +3547,19 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_k_src2_1(ptr addrspace(1)
 ;
 ; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
 ; GFX942-VGPR:       ; %bb.0:
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0x41
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_i32_4x4x4_16b_i8 v[0:3], v5, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPR-NEXT:    v_mfma_i32_4x4x4_16b_i8 v[4:7], v4, v5, v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 3
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
   %in.1 = load <4 x i32>, ptr addrspace(1) %arg
   %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> splat (i32 65), i32 1, i32 2, i32 3)
@@ -4372,7 +4372,7 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) %
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4381,9 +4381,9 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) %
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3]
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3]
 ; GFX942-VGPR-NEXT:    s_nop 3
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v6, v[0:3], s[6:7]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -5094,12 +5094,12 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(ptr addrspace(1) %arg) #0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[0:3]
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v0, v1, v[0:3]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 2
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 2.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
@@ -5628,6 +5628,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
 ; GFX942-VGPR:       ; %bb.0: ; %bb
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 1.0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v30, v1
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v31, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v1
@@ -5656,40 +5658,38 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v27, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v28, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v29, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v30, v1
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v31, v1
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[32:33], v[30:31]
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v34, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[30:31], v[28:29]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[28:29], v[26:27]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[26:27], v[24:25]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[24:25], v[22:23]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[22:23], v[20:21]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[20:21], v[18:19]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[18:19], v[16:17]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[16:17], v[14:15]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[14:15], v[12:13]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], v[10:11]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], v[8:9]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[62:63], v[30:31]
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v64, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[60:61], v[28:29]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[58:59], v[26:27]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[56:57], v[24:25]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[54:55], v[22:23]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[52:53], v[20:21]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[50:51], v[18:19]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[48:49], v[16:17]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[46:47], v[14:15]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[44:45], v[12:13]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[42:43], v[10:11]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[40:41], v[8:9]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[38:39], v[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[36:37], v[4:5]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[34:35], v[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[32:33], v[0:1]
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[2:33], v0, v34, v[2:33]
+; GFX942-VGPR-NEXT:    v_mfma_f32_32x32x1_2b_f32 v[32:63], v0, v64, v[32:63]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 7
 ; GFX942-VGPR-NEXT:    s_nop 7
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[30:33], s[0:1] offset:112
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[26:29], s[0:1] offset:96
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[22:25], s[0:1] offset:80
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[18:21], s[0:1] offset:64
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[14:17], s[0:1] offset:48
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[10:13], s[0:1] offset:32
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[6:9], s[0:1] offset:16
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[2:5], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[60:63], s[0:1] offset:112
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[56:59], s[0:1] offset:96
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[52:55], s[0:1] offset:80
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[48:51], s[0:1] offset:64
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[44:47], s[0:1] offset:48
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[40:43], s[0:1] offset:32
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[36:39], s[0:1] offset:16
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v1, v[32:35], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> <float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, i32 0, i32 0, i32 0)
@@ -5782,20 +5782,20 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(ptr addrspace(1) %ar
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 1.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-VGPR-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; GFX942-VGPR-NEXT:    v_lshlrev_b32_e32 v8, 4, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0x42f60000
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3]
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 2
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -5891,19 +5891,19 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(ptr addrspa
 ;
 ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code:
 ; GFX942-VGPR:       ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 1.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 1.0
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v0, 0x42f60000
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, 2.0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, 2.0
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_nop 0
-; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3]
+; GFX942-VGPR-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[4:7], v4, v5, v[0:3]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    s_nop 2
-; GFX942-VGPR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-VGPR-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1]
 ; GFX942-VGPR-NEXT:    s_endpgm
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index 24af3fa5ff9b7..d12002eaba6f1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -5027,44 +5027,42 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 6
-; SDAG-NEXT:    v_mov_b32_e32 v16, s20
-; SDAG-NEXT:    v_mov_b32_e32 v17, s21
-; SDAG-NEXT:    v_mov_b32_e32 v18, s22
-; SDAG-NEXT:    v_mov_b32_e32 v19, s23
-; SDAG-NEXT:    v_mov_b64_e32 v[20:21], 48
-; SDAG-NEXT:    global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s20
+; SDAG-NEXT:    v_mov_b32_e32 v33, s21
+; SDAG-NEXT:    v_mov_b32_e32 v34, s22
+; SDAG-NEXT:    v_mov_b32_e32 v35, s23
+; SDAG-NEXT:    v_mov_b64_e32 v[36:37], 48
+; SDAG-NEXT:    global_store_dwordx4 v[36:37], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[22:23], 32
-; SDAG-NEXT:    v_mov_b64_e32 v[24:25], 16
-; SDAG-NEXT:    v_mov_b32_e32 v16, s16
-; SDAG-NEXT:    v_mov_b32_e32 v17, s17
-; SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
+; SDAG-NEXT:    v_mov_b64_e32 v[38:39], 32
+; SDAG-NEXT:    v_mov_b64_e32 v[40:41], 16
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
+; SDAG-NEXT:    global_store_dwordx4 v[38:39], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[26:27], 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
+; SDAG-NEXT:    v_mov_b64_e32 v[42:43], 0
+; SDAG-NEXT:    v_mov_b32_e32 v32, s12
+; SDAG-NEXT:    v_mov_b32_e32 v33, s13
+; SDAG-NEXT:    v_mov_b32_e32 v34, s14
+; SDAG-NEXT:    v_mov_b32_e32 v35, s15
+; SDAG-NEXT:    global_store_dwordx4 v[40:41], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s8
+; SDAG-NEXT:    v_mov_b32_e32 v33, s9
+; SDAG-NEXT:    v_mov_b32_e32 v34, s10
+; SDAG-NEXT:    v_mov_b32_e32 v35, s11
+; SDAG-NEXT:    global_store_dwordx4 v[42:43], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[38:39], v[8:11], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[36:37], v[12:15], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[40:41], v[4:7], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -5072,6 +5070,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x0
 ; GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GISEL-NEXT:    v_mov_b64_e32 v[48:49], 0
+; GISEL-NEXT:    v_mov_b64_e32 v[50:51], 16
+; GISEL-NEXT:    v_mov_b64_e32 v[52:53], 32
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[36:37]
 ; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[38:39]
@@ -5089,28 +5090,33 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
 ; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
 ; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
 ; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
-; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mov_b64_e32 v[54:55], 48
+; GISEL-NEXT:    s_nop 0
 ; GISEL-NEXT:    v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
-; GISEL-NEXT:    v_mov_b64_e32 v[32:33], 0
-; GISEL-NEXT:    v_mov_b64_e32 v[34:35], 16
-; GISEL-NEXT:    v_mov_b64_e32 v[36:37], 32
-; GISEL-NEXT:    v_mov_b64_e32 v[38:39], 48
-; GISEL-NEXT:    global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
+; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[10:11]
+; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[8:9]
+; GISEL-NEXT:    v_mov_b64_e32 v[38:39], s[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[42:43], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[46:47], s[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[40:41], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[44:45], s[20:21]
+; GISEL-NEXT:    global_store_dwordx4 v[48:49], v[32:35], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[50:51], v[36:39], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[52:53], v[40:43], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[54:55], v[44:47], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[50:51], v[4:7], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[52:53], v[8:11], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[54:55], v[12:15], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_endpgm
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 0, i32 0, i32 0)
@@ -5123,71 +5129,72 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
 ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_load_dwordx16 s[12:27], s[4:5], 0x0
+; SDAG-NEXT:    v_mov_b64_e32 v[36:37], 48
+; SDAG-NEXT:    v_mov_b64_e32 v[38:39], 32
+; SDAG-NEXT:    v_mov_b64_e32 v[40:41], 16
+; SDAG-NEXT:    v_mov_b64_e32 v[42:43], 0
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    v_mov_b32_e32 v20, s16
-; SDAG-NEXT:    v_mov_b32_e32 v21, s17
-; SDAG-NEXT:    v_mov_b32_e32 v22, s18
-; SDAG-NEXT:    v_mov_b32_e32 v23, s19
-; SDAG-NEXT:    v_mov_b32_e32 v24, s20
-; SDAG-NEXT:    v_mov_b32_e32 v25, s21
-; SDAG-NEXT:    v_mov_b32_e32 v26, s22
-; SDAG-NEXT:    v_mov_b32_e32 v27, s23
+; SDAG-NEXT:    v_mov_b32_e32 v0, s12
+; SDAG-NEXT:    v_mov_b32_e32 v1, s13
+; SDAG-NEXT:    v_mov_b32_e32 v2, s14
+; SDAG-NEXT:    v_mov_b32_e32 v3, s15
+; SDAG-NEXT:    v_mov_b32_e32 v4, s16
+; SDAG-NEXT:    v_mov_b32_e32 v5, s17
+; SDAG-NEXT:    v_mov_b32_e32 v6, s18
+; SDAG-NEXT:    v_mov_b32_e32 v7, s19
+; SDAG-NEXT:    v_mov_b32_e32 v8, s20
+; SDAG-NEXT:    v_mov_b32_e32 v9, s21
+; SDAG-NEXT:    v_mov_b32_e32 v10, s22
+; SDAG-NEXT:    v_mov_b32_e32 v11, s23
 ; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
-; SDAG-NEXT:    v_mov_b32_e32 v28, s24
-; SDAG-NEXT:    v_mov_b32_e32 v29, s25
-; SDAG-NEXT:    v_mov_b32_e32 v30, s26
-; SDAG-NEXT:    v_mov_b32_e32 v31, s27
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
+; SDAG-NEXT:    v_mov_b32_e32 v13, s25
+; SDAG-NEXT:    v_mov_b32_e32 v14, s26
+; SDAG-NEXT:    v_mov_b32_e32 v15, s27
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[16:17]
-; SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[18:19]
-; SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[20:21]
-; SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[22:23]
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
-; SDAG-NEXT:    v_mov_b32_e32 v16, s20
-; SDAG-NEXT:    v_mov_b32_e32 v17, s21
-; SDAG-NEXT:    v_mov_b32_e32 v18, s22
-; SDAG-NEXT:    v_mov_b32_e32 v19, s23
-; SDAG-NEXT:    v_mov_b64_e32 v[20:21], 48
-; SDAG-NEXT:    global_store_dwordx4 v[20:21], v[16:19], off sc0 sc1
+; SDAG-NEXT:    v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT:    v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT:    v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT:    v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
+; SDAG-NEXT:    v_mov_b32_e32 v32, s20
+; SDAG-NEXT:    v_mov_b32_e32 v33, s21
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[0:7], v[8:15], v[16:31], 25, 42 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT:    v_mov_b32_e32 v34, s22
+; SDAG-NEXT:    v_mov_b32_e32 v35, s23
+; SDAG-NEXT:    global_store_dwordx4 v[36:37], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[22:23], 32
-; SDAG-NEXT:    v_mov_b64_e32 v[24:25], 16
-; SDAG-NEXT:    v_mov_b32_e32 v16, s16
-; SDAG-NEXT:    v_mov_b32_e32 v17, s17
-; SDAG-NEXT:    v_mov_b32_e32 v18, s18
-; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    global_store_dwordx4 v[22:23], v[16:19], off sc0 sc1
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    v_mov_b32_e32 v32, s16
+; SDAG-NEXT:    v_mov_b32_e32 v33, s17
+; SDAG-NEXT:    v_mov_b32_e32 v34, s18
+; SDAG-NEXT:    v_mov_b32_e32 v35, s19
+; SDAG-NEXT:    global_store_dwordx4 v[38:39], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[26:27], 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    global_store_dwordx4 v[24:25], v[16:19], off sc0 sc1
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    v_mov_b32_e32 v32, s12
+; SDAG-NEXT:    v_mov_b32_e32 v33, s13
+; SDAG-NEXT:    v_mov_b32_e32 v34, s14
+; SDAG-NEXT:    v_mov_b32_e32 v35, s15
+; SDAG-NEXT:    global_store_dwordx4 v[40:41], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v[26:27], v[16:19], off sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s8
+; SDAG-NEXT:    v_mov_b32_e32 v33, s9
+; SDAG-NEXT:    v_mov_b32_e32 v34, s10
+; SDAG-NEXT:    v_mov_b32_e32 v35, s11
+; SDAG-NEXT:    global_store_dwordx4 v[42:43], v[32:35], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[22:23], v[8:11], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[38:39], v[8:11], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[20:21], v[12:15], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[36:37], v[12:15], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[24:25], v[4:7], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[40:41], v[4:7], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -5195,53 +5202,52 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x0
 ; GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
-; GISEL-NEXT:    v_mov_b64_e32 v[32:33], 0
-; GISEL-NEXT:    v_mov_b64_e32 v[34:35], 16
-; GISEL-NEXT:    v_mov_b64_e32 v[36:37], 32
+; GISEL-NEXT:    v_mov_b64_e32 v[48:49], 0
+; GISEL-NEXT:    v_mov_b64_e32 v[50:51], 16
+; GISEL-NEXT:    v_mov_b64_e32 v[52:53], 32
 ; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[36:37]
-; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[38:39]
-; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[40:41]
-; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[42:43]
-; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[44:45]
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[46:47]
-; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[48:49]
-; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[50:51]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[38:39], 48
-; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], 25, 42 op_sel_hi:[0,0,0] blgp:2
-; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
-; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
-; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[36:37]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[38:39]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[40:41]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[42:43]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[44:45]
 ; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[46:47]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[48:49]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[50:51]
 ; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
-; GISEL-NEXT:    global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
+; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[10:11]
+; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[8:9]
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[0:7], v[8:15], v[16:31], 25, 42 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT:    v_mov_b64_e32 v[38:39], s[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[42:43], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[46:47], s[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[54:55], 48
+; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[40:41], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[44:45], s[20:21]
+; GISEL-NEXT:    global_store_dwordx4 v[48:49], v[32:35], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[50:51], v[36:39], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[52:53], v[40:43], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[54:55], v[44:47], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    s_nop 3
-; GISEL-NEXT:    global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
+; GISEL-NEXT:    s_nop 4
+; GISEL-NEXT:    global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[50:51], v[4:7], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[52:53], v[8:11], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
+; GISEL-NEXT:    global_store_dwordx4 v[54:55], v[12:15], off sc0 sc1
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_endpgm
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
index 198cac5834d1f..a0ebfb9674151 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
@@ -71,9 +71,9 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32_vgprcd(ptr addrspace(1) %ar
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-SDAG-NEXT:    s_nop 1
-; GFX942-SDAG-NEXT:    v_mfma_f32_16x16x8_xf32 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT:    v_mfma_f32_16x16x8_xf32 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
 ; GFX942-SDAG-NEXT:    s_nop 6
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v8, v[0:3], s[6:7]
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x8xf32_vgprcd:
@@ -87,14 +87,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32_vgprcd(ptr addrspace(1) %ar
 ; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX942-GISEL-NEXT:    s_mov_b32 s5, 4.0
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-GISEL-NEXT:    s_nop 1
-; GFX942-GISEL-NEXT:    v_mfma_f32_16x16x8_xf32 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, 0
-; GFX942-GISEL-NEXT:    s_nop 5
-; GFX942-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX942-GISEL-NEXT:    v_mfma_f32_16x16x8_xf32 v[4:7], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT:    s_nop 6
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7]
 ; GFX942-GISEL-NEXT:    s_endpgm
 bb:
   %in.1 = load <4 x float>, ptr addrspace(1) %arg
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index b25fe8392a60e..ab9c558daff26 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -445,11 +445,13 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0,
 ; SDAG-NEXT:    v_mov_b32_e32 v37, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v38, s2
 ; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v13, s25
-; SDAG-NEXT:    v_mov_b32_e32 v14, s26
-; SDAG-NEXT:    v_mov_b32_e32 v15, s27
-; SDAG-NEXT:    v_mov_b32_e32 v16, s28
-; SDAG-NEXT:    v_mov_b32_e32 v17, s29
+; SDAG-NEXT:    v_mov_b32_e32 v26, v10
+; SDAG-NEXT:    v_mov_b32_e32 v10, s24
+; SDAG-NEXT:    v_mov_b32_e32 v11, s25
+; SDAG-NEXT:    v_mov_b32_e32 v12, s26
+; SDAG-NEXT:    v_mov_b32_e32 v13, s27
+; SDAG-NEXT:    v_mov_b32_e32 v14, s28
+; SDAG-NEXT:    v_mov_b32_e32 v15, s29
 ; SDAG-NEXT:    v_mov_b32_e32 v28, s16
 ; SDAG-NEXT:    v_mov_b32_e32 v29, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v30, s18
@@ -458,37 +460,36 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0,
 ; SDAG-NEXT:    v_mov_b32_e32 v33, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v34, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v16, v0
+; SDAG-NEXT:    v_mov_b32_e32 v17, v1
+; SDAG-NEXT:    v_mov_b32_e32 v18, v2
+; SDAG-NEXT:    v_mov_b32_e32 v19, v3
+; SDAG-NEXT:    v_mov_b32_e32 v20, v4
+; SDAG-NEXT:    v_mov_b32_e32 v21, v5
+; SDAG-NEXT:    v_mov_b32_e32 v22, v6
+; SDAG-NEXT:    v_mov_b32_e32 v23, v7
+; SDAG-NEXT:    v_mov_b32_e32 v24, v8
+; SDAG-NEXT:    v_mov_b32_e32 v25, v9
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[10:25], v[36:39], v[28:35], v26
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_mov_b32_e32 v0, v10
+; SDAG-NEXT:    v_mov_b32_e32 v1, v11
+; SDAG-NEXT:    v_mov_b32_e32 v2, v12
+; SDAG-NEXT:    v_mov_b32_e32 v3, v13
+; SDAG-NEXT:    v_mov_b32_e32 v4, v14
+; SDAG-NEXT:    v_mov_b32_e32 v5, v15
+; SDAG-NEXT:    v_mov_b32_e32 v6, v16
+; SDAG-NEXT:    v_mov_b32_e32 v7, v17
+; SDAG-NEXT:    v_mov_b32_e32 v8, v18
+; SDAG-NEXT:    v_mov_b32_e32 v9, v19
+; SDAG-NEXT:    v_mov_b32_e32 v10, v20
+; SDAG-NEXT:    v_mov_b32_e32 v11, v21
+; SDAG-NEXT:    v_mov_b32_e32 v12, v22
+; SDAG-NEXT:    v_mov_b32_e32 v13, v23
+; SDAG-NEXT:    v_mov_b32_e32 v14, v24
+; SDAG-NEXT:    v_mov_b32_e32 v15, v25
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
@@ -793,11 +794,13 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg
 ; GCN-NEXT:    v_mov_b32_e32 v37, s1
 ; GCN-NEXT:    v_mov_b32_e32 v38, s2
 ; GCN-NEXT:    v_mov_b32_e32 v39, s3
-; GCN-NEXT:    v_mov_b32_e32 v13, s25
-; GCN-NEXT:    v_mov_b32_e32 v14, s26
-; GCN-NEXT:    v_mov_b32_e32 v15, s27
-; GCN-NEXT:    v_mov_b32_e32 v16, s28
-; GCN-NEXT:    v_mov_b32_e32 v17, s29
+; GCN-NEXT:    v_mov_b32_e32 v26, v10
+; GCN-NEXT:    v_mov_b32_e32 v10, s24
+; GCN-NEXT:    v_mov_b32_e32 v11, s25
+; GCN-NEXT:    v_mov_b32_e32 v12, s26
+; GCN-NEXT:    v_mov_b32_e32 v13, s27
+; GCN-NEXT:    v_mov_b32_e32 v14, s28
+; GCN-NEXT:    v_mov_b32_e32 v15, s29
 ; GCN-NEXT:    v_mov_b32_e32 v28, s16
 ; GCN-NEXT:    v_mov_b32_e32 v29, s17
 ; GCN-NEXT:    v_mov_b32_e32 v30, s18
@@ -806,37 +809,36 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg
 ; GCN-NEXT:    v_mov_b32_e32 v33, s21
 ; GCN-NEXT:    v_mov_b32_e32 v34, s22
 ; GCN-NEXT:    v_mov_b32_e32 v35, s23
-; GCN-NEXT:    v_mov_b32_e32 v12, s24
-; GCN-NEXT:    v_mov_b32_e32 v18, v0
-; GCN-NEXT:    v_mov_b32_e32 v19, v1
-; GCN-NEXT:    v_mov_b32_e32 v20, v2
-; GCN-NEXT:    v_mov_b32_e32 v21, v3
-; GCN-NEXT:    v_mov_b32_e32 v22, v4
-; GCN-NEXT:    v_mov_b32_e32 v23, v5
-; GCN-NEXT:    v_mov_b32_e32 v24, v6
-; GCN-NEXT:    v_mov_b32_e32 v25, v7
-; GCN-NEXT:    v_mov_b32_e32 v26, v8
-; GCN-NEXT:    v_mov_b32_e32 v27, v9
+; GCN-NEXT:    v_mov_b32_e32 v16, v0
+; GCN-NEXT:    v_mov_b32_e32 v17, v1
+; GCN-NEXT:    v_mov_b32_e32 v18, v2
+; GCN-NEXT:    v_mov_b32_e32 v19, v3
+; GCN-NEXT:    v_mov_b32_e32 v20, v4
+; GCN-NEXT:    v_mov_b32_e32 v21, v5
+; GCN-NEXT:    v_mov_b32_e32 v22, v6
+; GCN-NEXT:    v_mov_b32_e32 v23, v7
+; GCN-NEXT:    v_mov_b32_e32 v24, v8
+; GCN-NEXT:    v_mov_b32_e32 v25, v9
 ; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[36:39], v[28:35], v10
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[10:25], v[36:39], v[28:35], v26
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    v_mov_b32_e32 v4, v16
-; GCN-NEXT:    v_mov_b32_e32 v5, v17
-; GCN-NEXT:    v_mov_b32_e32 v6, v18
-; GCN-NEXT:    v_mov_b32_e32 v7, v19
-; GCN-NEXT:    v_mov_b32_e32 v8, v20
-; GCN-NEXT:    v_mov_b32_e32 v9, v21
-; GCN-NEXT:    v_mov_b32_e32 v10, v22
-; GCN-NEXT:    v_mov_b32_e32 v11, v23
-; GCN-NEXT:    v_mov_b32_e32 v12, v24
-; GCN-NEXT:    v_mov_b32_e32 v13, v25
-; GCN-NEXT:    v_mov_b32_e32 v14, v26
-; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    v_mov_b32_e32 v0, v10
+; GCN-NEXT:    v_mov_b32_e32 v1, v11
+; GCN-NEXT:    v_mov_b32_e32 v2, v12
+; GCN-NEXT:    v_mov_b32_e32 v3, v13
+; GCN-NEXT:    v_mov_b32_e32 v4, v14
+; GCN-NEXT:    v_mov_b32_e32 v5, v15
+; GCN-NEXT:    v_mov_b32_e32 v6, v16
+; GCN-NEXT:    v_mov_b32_e32 v7, v17
+; GCN-NEXT:    v_mov_b32_e32 v8, v18
+; GCN-NEXT:    v_mov_b32_e32 v9, v19
+; GCN-NEXT:    v_mov_b32_e32 v10, v20
+; GCN-NEXT:    v_mov_b32_e32 v11, v21
+; GCN-NEXT:    v_mov_b32_e32 v12, v22
+; GCN-NEXT:    v_mov_b32_e32 v13, v23
+; GCN-NEXT:    v_mov_b32_e32 v14, v24
+; GCN-NEXT:    v_mov_b32_e32 v15, v25
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
@@ -1295,11 +1297,13 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x
 ; SDAG-NEXT:    v_mov_b32_e32 v37, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v38, s2
 ; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v13, s25
-; SDAG-NEXT:    v_mov_b32_e32 v14, s26
-; SDAG-NEXT:    v_mov_b32_e32 v15, s27
-; SDAG-NEXT:    v_mov_b32_e32 v16, s28
-; SDAG-NEXT:    v_mov_b32_e32 v17, s29
+; SDAG-NEXT:    v_mov_b32_e32 v26, v10
+; SDAG-NEXT:    v_mov_b32_e32 v10, s24
+; SDAG-NEXT:    v_mov_b32_e32 v11, s25
+; SDAG-NEXT:    v_mov_b32_e32 v12, s26
+; SDAG-NEXT:    v_mov_b32_e32 v13, s27
+; SDAG-NEXT:    v_mov_b32_e32 v14, s28
+; SDAG-NEXT:    v_mov_b32_e32 v15, s29
 ; SDAG-NEXT:    v_mov_b32_e32 v28, s16
 ; SDAG-NEXT:    v_mov_b32_e32 v29, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v30, s18
@@ -1308,37 +1312,36 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x
 ; SDAG-NEXT:    v_mov_b32_e32 v33, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v34, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v16, v0
+; SDAG-NEXT:    v_mov_b32_e32 v17, v1
+; SDAG-NEXT:    v_mov_b32_e32 v18, v2
+; SDAG-NEXT:    v_mov_b32_e32 v19, v3
+; SDAG-NEXT:    v_mov_b32_e32 v20, v4
+; SDAG-NEXT:    v_mov_b32_e32 v21, v5
+; SDAG-NEXT:    v_mov_b32_e32 v22, v6
+; SDAG-NEXT:    v_mov_b32_e32 v23, v7
+; SDAG-NEXT:    v_mov_b32_e32 v24, v8
+; SDAG-NEXT:    v_mov_b32_e32 v25, v9
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[10:25], v[36:39], v[28:35], v26
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_mov_b32_e32 v0, v10
+; SDAG-NEXT:    v_mov_b32_e32 v1, v11
+; SDAG-NEXT:    v_mov_b32_e32 v2, v12
+; SDAG-NEXT:    v_mov_b32_e32 v3, v13
+; SDAG-NEXT:    v_mov_b32_e32 v4, v14
+; SDAG-NEXT:    v_mov_b32_e32 v5, v15
+; SDAG-NEXT:    v_mov_b32_e32 v6, v16
+; SDAG-NEXT:    v_mov_b32_e32 v7, v17
+; SDAG-NEXT:    v_mov_b32_e32 v8, v18
+; SDAG-NEXT:    v_mov_b32_e32 v9, v19
+; SDAG-NEXT:    v_mov_b32_e32 v10, v20
+; SDAG-NEXT:    v_mov_b32_e32 v11, v21
+; SDAG-NEXT:    v_mov_b32_e32 v12, v22
+; SDAG-NEXT:    v_mov_b32_e32 v13, v23
+; SDAG-NEXT:    v_mov_b32_e32 v14, v24
+; SDAG-NEXT:    v_mov_b32_e32 v15, v25
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
@@ -2344,11 +2347,13 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v37, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v38, s2
 ; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v13, s25
-; SDAG-NEXT:    v_mov_b32_e32 v14, s26
-; SDAG-NEXT:    v_mov_b32_e32 v15, s27
-; SDAG-NEXT:    v_mov_b32_e32 v16, s28
-; SDAG-NEXT:    v_mov_b32_e32 v17, s29
+; SDAG-NEXT:    v_mov_b32_e32 v26, v10
+; SDAG-NEXT:    v_mov_b32_e32 v10, s24
+; SDAG-NEXT:    v_mov_b32_e32 v11, s25
+; SDAG-NEXT:    v_mov_b32_e32 v12, s26
+; SDAG-NEXT:    v_mov_b32_e32 v13, s27
+; SDAG-NEXT:    v_mov_b32_e32 v14, s28
+; SDAG-NEXT:    v_mov_b32_e32 v15, s29
 ; SDAG-NEXT:    v_mov_b32_e32 v28, s16
 ; SDAG-NEXT:    v_mov_b32_e32 v29, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v30, s18
@@ -2357,37 +2362,36 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v33, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v34, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v16, v0
+; SDAG-NEXT:    v_mov_b32_e32 v17, v1
+; SDAG-NEXT:    v_mov_b32_e32 v18, v2
+; SDAG-NEXT:    v_mov_b32_e32 v19, v3
+; SDAG-NEXT:    v_mov_b32_e32 v20, v4
+; SDAG-NEXT:    v_mov_b32_e32 v21, v5
+; SDAG-NEXT:    v_mov_b32_e32 v22, v6
+; SDAG-NEXT:    v_mov_b32_e32 v23, v7
+; SDAG-NEXT:    v_mov_b32_e32 v24, v8
+; SDAG-NEXT:    v_mov_b32_e32 v25, v9
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[10:25], v[36:39], v[28:35], v26
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_mov_b32_e32 v0, v10
+; SDAG-NEXT:    v_mov_b32_e32 v1, v11
+; SDAG-NEXT:    v_mov_b32_e32 v2, v12
+; SDAG-NEXT:    v_mov_b32_e32 v3, v13
+; SDAG-NEXT:    v_mov_b32_e32 v4, v14
+; SDAG-NEXT:    v_mov_b32_e32 v5, v15
+; SDAG-NEXT:    v_mov_b32_e32 v6, v16
+; SDAG-NEXT:    v_mov_b32_e32 v7, v17
+; SDAG-NEXT:    v_mov_b32_e32 v8, v18
+; SDAG-NEXT:    v_mov_b32_e32 v9, v19
+; SDAG-NEXT:    v_mov_b32_e32 v10, v20
+; SDAG-NEXT:    v_mov_b32_e32 v11, v21
+; SDAG-NEXT:    v_mov_b32_e32 v12, v22
+; SDAG-NEXT:    v_mov_b32_e32 v13, v23
+; SDAG-NEXT:    v_mov_b32_e32 v14, v24
+; SDAG-NEXT:    v_mov_b32_e32 v15, v25
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
@@ -2717,11 +2721,13 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v37, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v38, s2
 ; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v13, s25
-; SDAG-NEXT:    v_mov_b32_e32 v14, s26
-; SDAG-NEXT:    v_mov_b32_e32 v15, s27
-; SDAG-NEXT:    v_mov_b32_e32 v16, s28
-; SDAG-NEXT:    v_mov_b32_e32 v17, s29
+; SDAG-NEXT:    v_mov_b32_e32 v26, v10
+; SDAG-NEXT:    v_mov_b32_e32 v10, s24
+; SDAG-NEXT:    v_mov_b32_e32 v11, s25
+; SDAG-NEXT:    v_mov_b32_e32 v12, s26
+; SDAG-NEXT:    v_mov_b32_e32 v13, s27
+; SDAG-NEXT:    v_mov_b32_e32 v14, s28
+; SDAG-NEXT:    v_mov_b32_e32 v15, s29
 ; SDAG-NEXT:    v_mov_b32_e32 v28, s16
 ; SDAG-NEXT:    v_mov_b32_e32 v29, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v30, s18
@@ -2730,37 +2736,36 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v33, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v34, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v16, v0
+; SDAG-NEXT:    v_mov_b32_e32 v17, v1
+; SDAG-NEXT:    v_mov_b32_e32 v18, v2
+; SDAG-NEXT:    v_mov_b32_e32 v19, v3
+; SDAG-NEXT:    v_mov_b32_e32 v20, v4
+; SDAG-NEXT:    v_mov_b32_e32 v21, v5
+; SDAG-NEXT:    v_mov_b32_e32 v22, v6
+; SDAG-NEXT:    v_mov_b32_e32 v23, v7
+; SDAG-NEXT:    v_mov_b32_e32 v24, v8
+; SDAG-NEXT:    v_mov_b32_e32 v25, v9
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[10:25], v[36:39], v[28:35], v26
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_mov_b32_e32 v0, v10
+; SDAG-NEXT:    v_mov_b32_e32 v1, v11
+; SDAG-NEXT:    v_mov_b32_e32 v2, v12
+; SDAG-NEXT:    v_mov_b32_e32 v3, v13
+; SDAG-NEXT:    v_mov_b32_e32 v4, v14
+; SDAG-NEXT:    v_mov_b32_e32 v5, v15
+; SDAG-NEXT:    v_mov_b32_e32 v6, v16
+; SDAG-NEXT:    v_mov_b32_e32 v7, v17
+; SDAG-NEXT:    v_mov_b32_e32 v8, v18
+; SDAG-NEXT:    v_mov_b32_e32 v9, v19
+; SDAG-NEXT:    v_mov_b32_e32 v10, v20
+; SDAG-NEXT:    v_mov_b32_e32 v11, v21
+; SDAG-NEXT:    v_mov_b32_e32 v12, v22
+; SDAG-NEXT:    v_mov_b32_e32 v13, v23
+; SDAG-NEXT:    v_mov_b32_e32 v14, v24
+; SDAG-NEXT:    v_mov_b32_e32 v15, v25
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
@@ -3090,11 +3095,13 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v37, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v38, s2
 ; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v13, s25
-; SDAG-NEXT:    v_mov_b32_e32 v14, s26
-; SDAG-NEXT:    v_mov_b32_e32 v15, s27
-; SDAG-NEXT:    v_mov_b32_e32 v16, s28
-; SDAG-NEXT:    v_mov_b32_e32 v17, s29
+; SDAG-NEXT:    v_mov_b32_e32 v26, v10
+; SDAG-NEXT:    v_mov_b32_e32 v10, s24
+; SDAG-NEXT:    v_mov_b32_e32 v11, s25
+; SDAG-NEXT:    v_mov_b32_e32 v12, s26
+; SDAG-NEXT:    v_mov_b32_e32 v13, s27
+; SDAG-NEXT:    v_mov_b32_e32 v14, s28
+; SDAG-NEXT:    v_mov_b32_e32 v15, s29
 ; SDAG-NEXT:    v_mov_b32_e32 v28, s16
 ; SDAG-NEXT:    v_mov_b32_e32 v29, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v30, s18
@@ -3103,37 +3110,36 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v33, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v34, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v16, v0
+; SDAG-NEXT:    v_mov_b32_e32 v17, v1
+; SDAG-NEXT:    v_mov_b32_e32 v18, v2
+; SDAG-NEXT:    v_mov_b32_e32 v19, v3
+; SDAG-NEXT:    v_mov_b32_e32 v20, v4
+; SDAG-NEXT:    v_mov_b32_e32 v21, v5
+; SDAG-NEXT:    v_mov_b32_e32 v22, v6
+; SDAG-NEXT:    v_mov_b32_e32 v23, v7
+; SDAG-NEXT:    v_mov_b32_e32 v24, v8
+; SDAG-NEXT:    v_mov_b32_e32 v25, v9
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[10:25], v[36:39], v[28:35], v26
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_mov_b32_e32 v0, v10
+; SDAG-NEXT:    v_mov_b32_e32 v1, v11
+; SDAG-NEXT:    v_mov_b32_e32 v2, v12
+; SDAG-NEXT:    v_mov_b32_e32 v3, v13
+; SDAG-NEXT:    v_mov_b32_e32 v4, v14
+; SDAG-NEXT:    v_mov_b32_e32 v5, v15
+; SDAG-NEXT:    v_mov_b32_e32 v6, v16
+; SDAG-NEXT:    v_mov_b32_e32 v7, v17
+; SDAG-NEXT:    v_mov_b32_e32 v8, v18
+; SDAG-NEXT:    v_mov_b32_e32 v9, v19
+; SDAG-NEXT:    v_mov_b32_e32 v10, v20
+; SDAG-NEXT:    v_mov_b32_e32 v11, v21
+; SDAG-NEXT:    v_mov_b32_e32 v12, v22
+; SDAG-NEXT:    v_mov_b32_e32 v13, v23
+; SDAG-NEXT:    v_mov_b32_e32 v14, v24
+; SDAG-NEXT:    v_mov_b32_e32 v15, v25
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
@@ -3463,11 +3469,13 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v37, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v38, s2
 ; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v13, s25
-; SDAG-NEXT:    v_mov_b32_e32 v14, s26
-; SDAG-NEXT:    v_mov_b32_e32 v15, s27
-; SDAG-NEXT:    v_mov_b32_e32 v16, s28
-; SDAG-NEXT:    v_mov_b32_e32 v17, s29
+; SDAG-NEXT:    v_mov_b32_e32 v26, v10
+; SDAG-NEXT:    v_mov_b32_e32 v10, s24
+; SDAG-NEXT:    v_mov_b32_e32 v11, s25
+; SDAG-NEXT:    v_mov_b32_e32 v12, s26
+; SDAG-NEXT:    v_mov_b32_e32 v13, s27
+; SDAG-NEXT:    v_mov_b32_e32 v14, s28
+; SDAG-NEXT:    v_mov_b32_e32 v15, s29
 ; SDAG-NEXT:    v_mov_b32_e32 v28, s16
 ; SDAG-NEXT:    v_mov_b32_e32 v29, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v30, s18
@@ -3476,37 +3484,36 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v33, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v34, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v16, v0
+; SDAG-NEXT:    v_mov_b32_e32 v17, v1
+; SDAG-NEXT:    v_mov_b32_e32 v18, v2
+; SDAG-NEXT:    v_mov_b32_e32 v19, v3
+; SDAG-NEXT:    v_mov_b32_e32 v20, v4
+; SDAG-NEXT:    v_mov_b32_e32 v21, v5
+; SDAG-NEXT:    v_mov_b32_e32 v22, v6
+; SDAG-NEXT:    v_mov_b32_e32 v23, v7
+; SDAG-NEXT:    v_mov_b32_e32 v24, v8
+; SDAG-NEXT:    v_mov_b32_e32 v25, v9
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[10:25], v[36:39], v[28:35], v26
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_mov_b32_e32 v0, v10
+; SDAG-NEXT:    v_mov_b32_e32 v1, v11
+; SDAG-NEXT:    v_mov_b32_e32 v2, v12
+; SDAG-NEXT:    v_mov_b32_e32 v3, v13
+; SDAG-NEXT:    v_mov_b32_e32 v4, v14
+; SDAG-NEXT:    v_mov_b32_e32 v5, v15
+; SDAG-NEXT:    v_mov_b32_e32 v6, v16
+; SDAG-NEXT:    v_mov_b32_e32 v7, v17
+; SDAG-NEXT:    v_mov_b32_e32 v8, v18
+; SDAG-NEXT:    v_mov_b32_e32 v9, v19
+; SDAG-NEXT:    v_mov_b32_e32 v10, v20
+; SDAG-NEXT:    v_mov_b32_e32 v11, v21
+; SDAG-NEXT:    v_mov_b32_e32 v12, v22
+; SDAG-NEXT:    v_mov_b32_e32 v13, v23
+; SDAG-NEXT:    v_mov_b32_e32 v14, v24
+; SDAG-NEXT:    v_mov_b32_e32 v15, v25
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
index 5f42abbeae253..f2c9639aac2f6 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
@@ -379,7 +379,7 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 {
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mov_b32_e32 v22, 0x7fc00000
 ; CHECK-NEXT:    v_mov_b64_e32 v[6:7], s[2:3]
 ; CHECK-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
 ; CHECK-NEXT:    s_mov_b32 s0, 0x3c003c00
@@ -388,69 +388,65 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 {
 ; CHECK-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
 ; CHECK-NEXT:    s_mov_b32 s0, 0x7e007e00
 ; CHECK-NEXT:    s_mov_b32 s1, s0
-; CHECK-NEXT:    v_mov_b64_e32 v[10:11], s[0:1]
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[12:13], v[4:7]
-; CHECK-NEXT:    s_nop 1
+; CHECK-NEXT:    v_mov_b32_e32 v23, v22
+; CHECK-NEXT:    v_mov_b32_e32 v24, v22
+; CHECK-NEXT:    v_mov_b32_e32 v25, v22
 ; CHECK-NEXT:    v_accvgpr_write_b32 a0, v0
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[18:21], v[8:9], v[10:11], v[4:7]
+; CHECK-NEXT:    v_mov_b64_e32 v[10:11], s[0:1]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[22:25], v[8:9], v[8:9], v[22:25]
 ; CHECK-NEXT:    v_accvgpr_write_b32 a1, v1
 ; CHECK-NEXT:    v_accvgpr_write_b32 a2, v2
 ; CHECK-NEXT:    v_accvgpr_write_b32 a3, v3
-; CHECK-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; CHECK-NEXT:    v_mov_b32_e32 v5, v4
-; CHECK-NEXT:    v_mov_b32_e32 v6, v4
-; CHECK-NEXT:    v_mov_b32_e32 v7, v4
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[8:9], v[14:17]
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[22:25], v[8:9], v[8:9], v[4:7]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[18:21], v[8:9], v[10:11], v[4:7]
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def v[4:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[12:13], v[4:7]
+; CHECK-NEXT:    v_mov_b64_e32 v[30:31], 0
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[26:29], v[8:9], v[8:9], v[4:7]
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[8:9], v[0:3]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[12:13], v[4:7]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[8:9], v[14:17]
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[22:25], v[8:9], v[8:9], v[22:25]
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[4:7], v[8:9], v[8:9], v[26:29]
 ; CHECK-NEXT:    s_nop 5
 ; CHECK-NEXT:    v_cvt_f16_f32_e32 v23, v14
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[8:9], v[18:21]
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[12:13], v[8:9], v[0:3]
-; CHECK-NEXT:    s_nop 1
+; CHECK-NEXT:    global_store_short v[30:31], v23, off
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[8:9], v[0:3]
+; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    v_accvgpr_read_b32 v19, a3
 ; CHECK-NEXT:    v_accvgpr_read_b32 v18, a2
-; CHECK-NEXT:    v_mov_b64_e32 v[20:21], 0
-; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_wbl2 sc0 sc1
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    buffer_inv sc0 sc1
 ; CHECK-NEXT:    v_accvgpr_read_b32 v17, a1
 ; CHECK-NEXT:    v_accvgpr_read_b32 v16, a0
 ; CHECK-NEXT:    v_cvt_f16_f32_e32 v15, v22
 ; CHECK-NEXT:    v_cvt_f16_f32_e32 v14, v14
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[16:19], v[8:9], v[8:9], v[16:19]
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v12, v0
-; CHECK-NEXT:    global_store_short v[20:21], v23, off
+; CHECK-NEXT:    global_store_short v[30:31], v15, off
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[10:11], v[8:9], v[4:7]
-; CHECK-NEXT:    global_store_short v[20:21], v15, off
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[12:13], v[8:9], v[0:3]
+; CHECK-NEXT:    global_store_short v[30:31], v14, off
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    global_store_short v[20:21], v14, off
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[8:11], v[10:11], v[8:9], v[4:7]
 ; CHECK-NEXT:    v_cvt_f16_f32_e32 v14, v16
+; CHECK-NEXT:    global_store_short v[30:31], v14, off
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v12, v0
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    global_store_short v[20:21], v14, off
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CHECK-NEXT:    buffer_wbl2 sc0 sc1
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    global_store_short v[20:21], v12, off
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v13, v8
+; CHECK-NEXT:    global_store_short v[30:31], v12, off
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    global_store_short v[20:21], v0, off
+; CHECK-NEXT:    global_store_short v[30:31], v13, off
 ; CHECK-NEXT:    s_endpgm
 entry:
   %k0 = call <4 x float> asm sideeffect "; def $0", "=s"()
@@ -520,13 +516,13 @@ define void @test_rewrite_mfma_subreg_insert1(float %arg0, float %arg1, ptr addr
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[2:5]
+; CHECK-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[6:9], v0, v1, v[2:5]
 ; CHECK-NEXT:    s_nop 3
-; CHECK-NEXT:    v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
+; CHECK-NEXT:    v_pk_mov_b32 v[0:1], v[6:7], v[8:9] op_sel:[1,0]
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    v_accvgpr_write_b32 a0, v0
 ; CHECK-NEXT:    v_accvgpr_write_b32 a1, v1
-; CHECK-NEXT:    v_accvgpr_write_b32 a2, v3
+; CHECK-NEXT:    v_accvgpr_write_b32 a2, v9
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use a[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
@@ -648,46 +644,14 @@ define amdgpu_kernel void @test_rewrite_mfma_direct_copy_from_agpr_class(ptr add
 ; CHECK-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
 ; CHECK-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1]
 ; CHECK-NEXT:    s_nop 7
-; CHECK-NEXT:    v_accvgpr_read_b32 v0, a0
-; CHECK-NEXT:    v_accvgpr_read_b32 v24, a24
-; CHECK-NEXT:    v_accvgpr_read_b32 v25, a25
-; CHECK-NEXT:    v_accvgpr_read_b32 v26, a26
-; CHECK-NEXT:    v_accvgpr_read_b32 v27, a27
-; CHECK-NEXT:    v_accvgpr_read_b32 v1, a1
-; CHECK-NEXT:    v_accvgpr_read_b32 v2, a2
-; CHECK-NEXT:    v_accvgpr_read_b32 v3, a3
-; CHECK-NEXT:    v_accvgpr_read_b32 v4, a4
-; CHECK-NEXT:    v_accvgpr_read_b32 v5, a5
-; CHECK-NEXT:    v_accvgpr_read_b32 v6, a6
-; CHECK-NEXT:    v_accvgpr_read_b32 v7, a7
-; CHECK-NEXT:    v_accvgpr_read_b32 v8, a8
-; CHECK-NEXT:    v_accvgpr_read_b32 v9, a9
-; CHECK-NEXT:    v_accvgpr_read_b32 v10, a10
-; CHECK-NEXT:    v_accvgpr_read_b32 v11, a11
-; CHECK-NEXT:    v_accvgpr_read_b32 v12, a12
-; CHECK-NEXT:    v_accvgpr_read_b32 v13, a13
-; CHECK-NEXT:    v_accvgpr_read_b32 v14, a14
-; CHECK-NEXT:    v_accvgpr_read_b32 v15, a15
-; CHECK-NEXT:    v_accvgpr_read_b32 v16, a16
-; CHECK-NEXT:    v_accvgpr_read_b32 v17, a17
-; CHECK-NEXT:    v_accvgpr_read_b32 v18, a18
-; CHECK-NEXT:    v_accvgpr_read_b32 v19, a19
-; CHECK-NEXT:    v_accvgpr_read_b32 v20, a20
-; CHECK-NEXT:    v_accvgpr_read_b32 v21, a21
-; CHECK-NEXT:    v_accvgpr_read_b32 v22, a22
-; CHECK-NEXT:    v_accvgpr_read_b32 v23, a23
-; CHECK-NEXT:    v_accvgpr_read_b32 v28, a28
-; CHECK-NEXT:    v_accvgpr_read_b32 v29, a29
-; CHECK-NEXT:    v_accvgpr_read_b32 v30, a30
-; CHECK-NEXT:    v_accvgpr_read_b32 v31, a31
-; CHECK-NEXT:    global_store_dwordx4 v32, v[24:27], s[2:3] offset:96
-; CHECK-NEXT:    global_store_dwordx4 v32, v[28:31], s[2:3] offset:112
-; CHECK-NEXT:    global_store_dwordx4 v32, v[16:19], s[2:3] offset:64
-; CHECK-NEXT:    global_store_dwordx4 v32, v[20:23], s[2:3] offset:80
-; CHECK-NEXT:    global_store_dwordx4 v32, v[8:11], s[2:3] offset:32
-; CHECK-NEXT:    global_store_dwordx4 v32, v[12:15], s[2:3] offset:48
-; CHECK-NEXT:    global_store_dwordx4 v32, v[0:3], s[2:3]
-; CHECK-NEXT:    global_store_dwordx4 v32, v[4:7], s[2:3] offset:16
+; CHECK-NEXT:    global_store_dwordx4 v32, a[24:27], s[2:3] offset:96
+; CHECK-NEXT:    global_store_dwordx4 v32, a[28:31], s[2:3] offset:112
+; CHECK-NEXT:    global_store_dwordx4 v32, a[16:19], s[2:3] offset:64
+; CHECK-NEXT:    global_store_dwordx4 v32, a[20:23], s[2:3] offset:80
+; CHECK-NEXT:    global_store_dwordx4 v32, a[8:11], s[2:3] offset:32
+; CHECK-NEXT:    global_store_dwordx4 v32, a[12:15], s[2:3] offset:48
+; CHECK-NEXT:    global_store_dwordx4 v32, a[0:3], s[2:3]
+; CHECK-NEXT:    global_store_dwordx4 v32, a[4:7], s[2:3] offset:16
 ; CHECK-NEXT:    s_endpgm
   %src2 = call <32 x float> asm sideeffect "; def $0", "=a"()
   %mai0 = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 2.0, float 4.0, <32 x float> %src2, i32 0, i32 0, i32 0)
@@ -770,16 +734,16 @@ define void @test_rewrite_mfma_copy_from_agpr_class_f64_4x4x4f64_chain(double %a
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mov_b32_e32 v12, v31
 ; CHECK-NEXT:    v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], v[2:3], a[0:1]
-; CHECK-NEXT:    v_and_b32_e32 v2, 0x3ff, v31
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0
-; CHECK-NEXT:    v_lshl_add_u64 v[2:3], v[8:9], 0, v[2:3]
+; CHECK-NEXT:    v_and_b32_e32 v12, 0x3ff, v12
+; CHECK-NEXT:    s_nop 2
 ; CHECK-NEXT:    v_mfma_f64_4x4x4_4b_f64 a[0:1], v[4:5], v[6:7], a[0:1]
-; CHECK-NEXT:    s_nop 7
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_store_dwordx2 v[2:3], a[0:1], off
+; CHECK-NEXT:    v_lshlrev_b32_e32 v4, 3, v12
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0
+; CHECK-NEXT:    v_lshl_add_u64 v[4:5], v[8:9], 0, v[4:5]
+; CHECK-NEXT:    s_nop 5
+; CHECK-NEXT:    global_store_dwordx2 v[4:5], a[0:1], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %src2 = call double asm sideeffect "; def $0", "=a"()
diff --git a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
index 122d46b39ff32..a8ceb62dad061 100644
--- a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
@@ -325,63 +325,53 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def v[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_mov_b32_e32 v6, 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:31]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def a[0:31]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    global_store_dwordx4 v0, v[60:63], s[16:17] offset:112
+; CHECK-NEXT:    global_store_dwordx4 v6, v[60:63], s[16:17] offset:112
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[56:59], s[16:17] offset:96
+; CHECK-NEXT:    global_store_dwordx4 v6, v[56:59], s[16:17] offset:96
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[52:55], s[16:17] offset:80
+; CHECK-NEXT:    global_store_dwordx4 v6, v[52:55], s[16:17] offset:80
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[48:51], s[16:17] offset:64
+; CHECK-NEXT:    global_store_dwordx4 v6, v[48:51], s[16:17] offset:64
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[44:47], s[16:17] offset:48
+; CHECK-NEXT:    global_store_dwordx4 v6, v[44:47], s[16:17] offset:48
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[40:43], s[16:17] offset:32
+; CHECK-NEXT:    global_store_dwordx4 v6, v[40:43], s[16:17] offset:32
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[36:39], s[16:17] offset:16
+; CHECK-NEXT:    global_store_dwordx4 v6, v[36:39], s[16:17] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[32:35], s[16:17]
+; CHECK-NEXT:    global_store_dwordx4 v6, v[32:35], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[56:59], s[16:17] offset:96
+; CHECK-NEXT:    global_store_dwordx4 v6, a[56:59], s[16:17] offset:96
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[60:63], s[16:17] offset:112
+; CHECK-NEXT:    global_store_dwordx4 v6, a[60:63], s[16:17] offset:112
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[48:51], s[16:17] offset:64
+; CHECK-NEXT:    global_store_dwordx4 v6, a[48:51], s[16:17] offset:64
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[52:55], s[16:17] offset:80
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[40:43], s[16:17] offset:32
+; CHECK-NEXT:    global_store_dwordx4 v6, a[52:55], s[16:17] offset:80
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[44:47], s[16:17] offset:48
+; CHECK-NEXT:    global_store_dwordx4 v6, a[40:43], s[16:17] offset:32
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[32:35], s[16:17]
+; CHECK-NEXT:    global_store_dwordx4 v6, a[44:47], s[16:17] offset:48
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, a[36:39], s[16:17] offset:16
+; CHECK-NEXT:    global_store_dwordx4 v6, a[32:35], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; CHECK-NEXT:    global_store_dwordx4 v6, a[36:39], s[16:17] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[2:5], s[16:17]
+; CHECK-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; CHECK-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; CHECK-NEXT:    global_store_dwordx4 v6, v[8:11], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v0, v[2:5], s[16:17]
+; CHECK-NEXT:    global_store_dwordx4 v6, v[0:3], s[16:17]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_load_dword a63, off, s[0:3], s32 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload



More information about the llvm-commits mailing list