[llvm] [AMDGPU] Add VRegToVreg hint for MFMA Dst and OpC (PR #185218)

Josh Hutton via llvm-commits llvm-commits at lists.llvm.org
Sat Mar 7 11:04:31 PST 2026


https://github.com/JoshHuttonCode created https://github.com/llvm/llvm-project/pull/185218

This adds virtual-register-to-virtual-register allocation hints (`AMDGPURI::VRegToVReg`) that encourage the register allocator to assign OpC (`src2`) and Dst (`vdst`) of an MFMA instruction to the same physical register. When a kernel uses close to the full register budget and must carry accumulation registers across loop iterations, using separate registers for OpC and Dst can make it impossible to assign all registers without spilling, so assigning both to the same physical register is the better register-allocation (RA) pattern.

This is a work in progress: I am still working on reproducing the problem in a reduced kernel.

>From eea6901963a22f1d345b0dfb446db46be55b32ea Mon Sep 17 00:00:00 2001
From: Josh Hutton <JoshHuttonEmail at gmail.com>
Date: Sat, 7 Mar 2026 10:41:32 -0800
Subject: [PATCH] [AMDGPU] Add VRegToVreg hint for MFMA Dst and OpC

---
 .../Target/AMDGPU/GCNPreRAOptimizations.cpp   |  52 ++
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp     |   9 +
 llvm/lib/Target/AMDGPU/SIRegisterInfo.h       |   2 +-
 .../GlobalISel/llvm.amdgcn.mfma.gfx90a.ll     |   6 +-
 .../AMDGPU/agpr-copy-no-free-registers.ll     | 278 +++----
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll   |  28 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 132 +++-
 ....amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll | 708 +++++++++++++++---
 .../AMDGPU/rewrite-vgpr-mfma-to-agpr.ll       |  47 +-
 .../test/CodeGen/AMDGPU/vni8-across-blocks.ll |   4 +-
 10 files changed, 943 insertions(+), 323 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index cd56887fd46a8..2d2c13693ba0a 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -35,6 +35,7 @@
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIRegisterInfo.h"
+#include "llvm/ADT/EquivalenceClasses.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/InitializePasses.h"
@@ -245,6 +246,57 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
   TRI = ST.getRegisterInfo();
 
   bool Changed = false;
+  if (ST.hasMAIInsts()) {
+    EquivalenceClasses<Register> MFMAHints;
+    for (const MachineBasicBlock &MBB : MF) {
+      for (const MachineInstr &MI : MBB) {
+        if (!SIInstrInfo::isMFMA(MI))
+          continue;
+        const MachineOperand *DstMO =
+            TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+        const MachineOperand *Src2MO =
+            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+        if (!DstMO || !Src2MO || !DstMO->isReg() || !Src2MO->isReg())
+          continue;
+        Register Dst = DstMO->getReg();
+        Register Src2 = Src2MO->getReg();
+        if (!Dst.isVirtual() || !Src2.isVirtual())
+          continue;
+        LLVM_DEBUG(dbgs() << "Setting hint for "; MI.dump());
+        LLVM_DEBUG(dbgs() << " Dst: "; DstMO->dump(); dbgs() << " Src2: ";
+                   Src2MO->dump());
+        MFMAHints.unionSets(Dst, Src2);
+      }
+    }
+
+    auto addHints = [&](const SmallVectorImpl<Register> &Members) {
+      for (Register A : Members) {
+        assert(A.isVirtual());
+        for (Register B : Members) {
+          assert(B.isVirtual());
+          if (A == B)
+            continue;
+
+          const TargetRegisterClass *ARC = MRI->getRegClass(A);
+          const TargetRegisterClass *BRC = MRI->getRegClass(B);
+
+          bool CompatibleRC = TRI->getCommonSubClass(ARC, BRC);
+          if (CompatibleRC)
+            MRI->setRegAllocationHint(A, AMDGPURI::VRegToVReg, B);
+        }
+      }
+    };
+
+    for (auto EC = MFMAHints.begin(); EC != MFMAHints.end(); ++EC) {
+      SmallVector<Register, 8> Members;
+
+      for (auto MI = MFMAHints.member_begin(**EC); MI != MFMAHints.member_end();
+           ++MI) {
+        Members.push_back(*MI);
+      }
+      addHints(Members);
+    }
+  }
 
   for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
     Register Reg = Register::index2VirtReg(I);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 99eb90b11182d..090445286cd71 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3890,6 +3890,15 @@ bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
     }
     return false;
   }
+  case AMDGPURI::VRegToVReg: {
+    Register Paired = Hint.second;
+    assert(Paired);
+    if (VRM && VRM->hasPhys(Paired)) {
+      auto PairedPhys = VRM->getPhys(Paired);
+      Hints.push_back(PairedPhys);
+    }
+    return false;
+  }
   default:
     return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
                                                      VRM);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 9d1a9eae75020..0200560424ad9 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -33,7 +33,7 @@ struct SGPRSpillBuilder;
 /// Register allocation hint types. Helps eliminate unneeded COPY with True16
 namespace AMDGPURI {
 
-enum { Size16 = 1, Size32 = 2 };
+enum { Size16 = 1, Size32 = 2, VRegToVReg = 3 };
 
 } // end namespace AMDGPURI
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
index 5c45565b4d047..42652ebeb8ce7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
@@ -203,10 +203,10 @@ define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f64_4x4x4f64 v[4:5], v[0:1], v[2:3], 0
 ; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 blgp:3
-; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    v_mfma_f64_4x4x4f64 v[4:5], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 blgp:3
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GCN-NEXT:    global_store_dwordx2 v0, v[4:5], s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double 0.0, i32 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index bb7beb8d0b9e2..81daffdc7b330 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -15,54 +15,54 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
 ; GFX908-NEXT:    ;;#ASMEND
 ; GFX908-NEXT:    v_accvgpr_read_b32 v39, a15
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a31, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a32, v39
 ; GFX908-NEXT:    v_accvgpr_read_b32 v39, a14
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a30, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a31, v39
 ; GFX908-NEXT:    v_accvgpr_read_b32 v39, a13
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a29, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a30, v39
 ; GFX908-NEXT:    v_accvgpr_read_b32 v39, a12
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a28, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a29, v39
 ; GFX908-NEXT:    v_accvgpr_read_b32 v39, a11
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a27, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a28, v39
 ; GFX908-NEXT:    v_accvgpr_read_b32 v39, a10
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a26, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a27, v39
 ; GFX908-NEXT:    v_accvgpr_read_b32 v39, a9
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a25, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a26, v39
 ; GFX908-NEXT:    v_accvgpr_read_b32 v39, a8
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a24, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a25, v39
 ; GFX908-NEXT:    v_accvgpr_read_b32 v39, a7
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a23, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a24, v39
 ; GFX908-NEXT:    v_accvgpr_read_b32 v39, a6
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a22, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a23, v39
 ; GFX908-NEXT:    v_accvgpr_read_b32 v39, a5
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a21, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a22, v39
 ; GFX908-NEXT:    v_accvgpr_read_b32 v39, a4
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a20, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a21, v39
 ; GFX908-NEXT:    v_accvgpr_read_b32 v39, a3
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a19, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a20, v39
 ; GFX908-NEXT:    v_accvgpr_read_b32 v39, a2
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a18, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a19, v39
 ; GFX908-NEXT:    v_accvgpr_read_b32 v39, a1
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a17, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a18, v39
 ; GFX908-NEXT:    v_accvgpr_read_b32 v39, a0
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a16, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a17, v39
 ; GFX908-NEXT:    s_nop 0
-; GFX908-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
+; GFX908-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[17:32]
 ; GFX908-NEXT:    s_nop 9
 ; GFX908-NEXT:    v_accvgpr_read_b32 v39, a0 ; Reload Reuse
 ; GFX908-NEXT:    v_accvgpr_read_b32 v38, a11 ; Reload Reuse
@@ -158,28 +158,30 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v33, v0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v34, a32 ; Reload Reuse
+; GFX90A-NEXT:    v_accvgpr_read_b32 v35, a33 ; Reload Reuse
 ; GFX90A-NEXT:    v_mov_b32_e32 v32, v1
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[0:31] a[0:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a31, a15
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a30, a14
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a29, a13
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a28, a12
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a27, a11
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a26, a10
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a25, a9
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a24, a8
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a23, a7
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a22, a6
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a21, a5
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a20, a4
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a19, a3
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a18, a2
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a17, a1
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a16, a0
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a33, a15
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a32, a14
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a31, a13
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a30, a12
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a29, a11
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a28, a10
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a27, a9
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a26, a8
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a25, a7
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a24, a6
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a23, a5
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a22, a4
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a21, a3
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a20, a2
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a19, a1
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a18, a0
 ; GFX90A-NEXT:    s_nop 1
-; GFX90A-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
+; GFX90A-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[18:33]
 ; GFX90A-NEXT:    s_nop 10
 ; GFX90A-NEXT:    buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX90A-NEXT:    s_nop 0
@@ -192,12 +194,12 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
 ; GFX90A-NEXT:    buffer_store_dword a7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
 ; GFX90A-NEXT:    buffer_store_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX90A-NEXT:    buffer_store_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX90A-NEXT:    v_accvgpr_read_b32 v39, a10 ; Reload Reuse
-; GFX90A-NEXT:    v_accvgpr_read_b32 v38, a11 ; Reload Reuse
-; GFX90A-NEXT:    v_accvgpr_read_b32 v37, a12 ; Reload Reuse
-; GFX90A-NEXT:    v_accvgpr_read_b32 v36, a13 ; Reload Reuse
-; GFX90A-NEXT:    v_accvgpr_read_b32 v35, a14 ; Reload Reuse
-; GFX90A-NEXT:    v_accvgpr_read_b32 v34, a15 ; Reload Reuse
+; GFX90A-NEXT:    buffer_store_dword a10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX90A-NEXT:    buffer_store_dword a11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX90A-NEXT:    v_accvgpr_read_b32 v39, a12 ; Reload Reuse
+; GFX90A-NEXT:    v_accvgpr_read_b32 v38, a13 ; Reload Reuse
+; GFX90A-NEXT:    v_accvgpr_read_b32 v37, a14 ; Reload Reuse
+; GFX90A-NEXT:    v_accvgpr_read_b32 v36, a15 ; Reload Reuse
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; copy
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -212,13 +214,15 @@ define void @no_free_vgprs_at_agpr_to_agpr_copy(float %v0, float %v1) #0 {
 ; GFX90A-NEXT:    buffer_load_dword a7, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    buffer_load_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    buffer_load_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v39 ; Reload Reuse
-; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v38 ; Reload Reuse
-; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v37 ; Reload Reuse
-; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v36 ; Reload Reuse
-; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v35 ; Reload Reuse
+; GFX90A-NEXT:    buffer_load_dword a10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX90A-NEXT:    buffer_load_dword a11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v39 ; Reload Reuse
+; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v38 ; Reload Reuse
+; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v37 ; Reload Reuse
+; GFX90A-NEXT:    v_accvgpr_write_b32 a33, v35 ; Reload Reuse
+; GFX90A-NEXT:    v_accvgpr_write_b32 a32, v34 ; Reload Reuse
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v34 ; Reload Reuse
+; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v36 ; Reload Reuse
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; copy
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -380,59 +384,59 @@ define void @v32_asm_def_use(float %v0, float %v1) #4 {
 ; GFX908-NEXT:    ; def v32
 ; GFX908-NEXT:    ;;#ASMEND
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a31, v35
+; GFX908-NEXT:    v_accvgpr_write_b32 a32, v35
 ; GFX908-NEXT:    v_accvgpr_read_b32 v35, a14
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a30, v35
+; GFX908-NEXT:    v_accvgpr_write_b32 a31, v35
 ; GFX908-NEXT:    v_accvgpr_read_b32 v35, a13
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a29, v35
+; GFX908-NEXT:    v_accvgpr_write_b32 a30, v35
 ; GFX908-NEXT:    v_accvgpr_read_b32 v35, a12
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a28, v35
+; GFX908-NEXT:    v_accvgpr_write_b32 a29, v35
 ; GFX908-NEXT:    v_accvgpr_read_b32 v35, a11
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a27, v35
+; GFX908-NEXT:    v_accvgpr_write_b32 a28, v35
 ; GFX908-NEXT:    v_accvgpr_read_b32 v35, a10
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a26, v35
+; GFX908-NEXT:    v_accvgpr_write_b32 a27, v35
 ; GFX908-NEXT:    v_accvgpr_read_b32 v35, a9
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a25, v35
+; GFX908-NEXT:    v_accvgpr_write_b32 a26, v35
 ; GFX908-NEXT:    v_accvgpr_read_b32 v35, a8
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a24, v35
+; GFX908-NEXT:    v_accvgpr_write_b32 a25, v35
 ; GFX908-NEXT:    v_accvgpr_read_b32 v35, a7
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a23, v35
+; GFX908-NEXT:    v_accvgpr_write_b32 a24, v35
 ; GFX908-NEXT:    v_accvgpr_read_b32 v35, a6
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a22, v35
+; GFX908-NEXT:    v_accvgpr_write_b32 a23, v35
 ; GFX908-NEXT:    v_accvgpr_read_b32 v35, a5
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a21, v35
+; GFX908-NEXT:    v_accvgpr_write_b32 a22, v35
 ; GFX908-NEXT:    v_accvgpr_read_b32 v35, a4
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a20, v35
+; GFX908-NEXT:    v_accvgpr_write_b32 a21, v35
 ; GFX908-NEXT:    v_accvgpr_read_b32 v35, a3
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a19, v35
+; GFX908-NEXT:    v_accvgpr_write_b32 a20, v35
 ; GFX908-NEXT:    v_accvgpr_read_b32 v35, a2
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a18, v35
+; GFX908-NEXT:    v_accvgpr_write_b32 a19, v35
 ; GFX908-NEXT:    v_accvgpr_read_b32 v35, a1
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a17, v35
+; GFX908-NEXT:    v_accvgpr_write_b32 a18, v35
 ; GFX908-NEXT:    v_accvgpr_read_b32 v35, a0
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a16, v35
+; GFX908-NEXT:    v_accvgpr_write_b32 a17, v35
 ; GFX908-NEXT:    ;;#ASMSTART
 ; GFX908-NEXT:    ; copy
 ; GFX908-NEXT:    ;;#ASMEND
 ; GFX908-NEXT:    v_accvgpr_read_b32 v35, a1
-; GFX908-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v34, v33, a[16:31]
+; GFX908-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v34, v33, a[17:32]
 ; GFX908-NEXT:    s_nop 0
-; GFX908-NEXT:    v_accvgpr_write_b32 a32, v35
+; GFX908-NEXT:    v_accvgpr_write_b32 a16, v35
 ; GFX908-NEXT:    ;;#ASMSTART
 ; GFX908-NEXT:    ; copy
 ; GFX908-NEXT:    ;;#ASMEND
@@ -452,40 +456,44 @@ define void @v32_asm_def_use(float %v0, float %v1) #4 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v34, v0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v35, a32 ; Reload Reuse
+; GFX90A-NEXT:    v_accvgpr_read_b32 v36, a33 ; Reload Reuse
 ; GFX90A-NEXT:    v_mov_b32_e32 v33, v1
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[0:31] a[0:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a31, a15
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a30, a14
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a29, a13
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a28, a12
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a27, a11
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a26, a10
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a25, a9
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a24, a8
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a23, a7
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a22, a6
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a21, a5
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a20, a4
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a19, a3
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a18, a2
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a17, a1
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a16, a0
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a33, a15
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a32, a14
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a31, a13
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a30, a12
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a29, a11
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a28, a10
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a27, a9
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a26, a8
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a25, a7
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a24, a6
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a23, a5
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a22, a4
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a21, a3
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a20, a2
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a19, a1
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a18, a0
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v32
 ; GFX90A-NEXT:    ;;#ASMEND
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; copy
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_accvgpr_read_b32 v35, a32 ; Reload Reuse
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a32, a1
-; GFX90A-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v34, v33, a[16:31]
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a16, a1
+; GFX90A-NEXT:    s_nop 0
+; GFX90A-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v34, v33, a[18:33]
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; copy
 ; GFX90A-NEXT:    ;;#ASMEND
+; GFX90A-NEXT:    s_nop 6
+; GFX90A-NEXT:    v_accvgpr_write_b32 a33, v36 ; Reload Reuse
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a32, v35 ; Reload Reuse
-; GFX90A-NEXT:    s_nop 9
+; GFX90A-NEXT:    s_nop 1
 ; GFX90A-NEXT:    v_accvgpr_mov_b32 a3, a2
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use a3 v[0:31]
@@ -912,54 +920,54 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
 ; GFX908-NEXT:    ;;#ASMEND
 ; GFX908-NEXT:    v_mov_b32_e32 v39, s15
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a31, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a32, v39
 ; GFX908-NEXT:    v_mov_b32_e32 v39, s14
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a30, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a31, v39
 ; GFX908-NEXT:    v_mov_b32_e32 v39, s13
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a29, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a30, v39
 ; GFX908-NEXT:    v_mov_b32_e32 v39, s12
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a28, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a29, v39
 ; GFX908-NEXT:    v_mov_b32_e32 v39, s11
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a27, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a28, v39
 ; GFX908-NEXT:    v_mov_b32_e32 v39, s10
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a26, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a27, v39
 ; GFX908-NEXT:    v_mov_b32_e32 v39, s9
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a25, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a26, v39
 ; GFX908-NEXT:    v_mov_b32_e32 v39, s8
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a24, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a25, v39
 ; GFX908-NEXT:    v_mov_b32_e32 v39, s7
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a23, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a24, v39
 ; GFX908-NEXT:    v_mov_b32_e32 v39, s6
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a22, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a23, v39
 ; GFX908-NEXT:    v_mov_b32_e32 v39, s5
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a21, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a22, v39
 ; GFX908-NEXT:    v_mov_b32_e32 v39, s4
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a20, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a21, v39
 ; GFX908-NEXT:    v_mov_b32_e32 v39, s3
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a19, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a20, v39
 ; GFX908-NEXT:    v_mov_b32_e32 v39, s2
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a18, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a19, v39
 ; GFX908-NEXT:    v_mov_b32_e32 v39, s1
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a17, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a18, v39
 ; GFX908-NEXT:    v_mov_b32_e32 v39, s0
 ; GFX908-NEXT:    s_nop 1
-; GFX908-NEXT:    v_accvgpr_write_b32 a16, v39
+; GFX908-NEXT:    v_accvgpr_write_b32 a17, v39
 ; GFX908-NEXT:    s_nop 0
-; GFX908-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
+; GFX908-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[17:32]
 ; GFX908-NEXT:    s_nop 9
 ; GFX908-NEXT:    v_accvgpr_read_b32 v39, a0 ; Reload Reuse
 ; GFX908-NEXT:    v_accvgpr_read_b32 v38, a11 ; Reload Reuse
@@ -1001,11 +1009,11 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
 ; GFX908-NEXT:    ; copy
 ; GFX908-NEXT:    ;;#ASMEND
 ; GFX908-NEXT:    buffer_load_dword v39, off, s[0:3], s32 ; 4-byte Folded Reload
-; GFX908-NEXT:    v_accvgpr_read_b32 v33, a1
+; GFX908-NEXT:    v_accvgpr_read_b32 v32, a1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_accvgpr_write_b32 a0, v39 ; Reload Reuse
 ; GFX908-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX908-NEXT:    v_accvgpr_write_b32 a32, v33
+; GFX908-NEXT:    v_accvgpr_write_b32 a16, v32
 ; GFX908-NEXT:    v_accvgpr_write_b32 a11, v38 ; Reload Reuse
 ; GFX908-NEXT:    v_accvgpr_write_b32 a12, v37 ; Reload Reuse
 ; GFX908-NEXT:    v_accvgpr_write_b32 a13, v36 ; Reload Reuse
@@ -1055,29 +1063,30 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v33, v0
+; GFX90A-NEXT:    v_accvgpr_read_b32 v34, a32 ; Reload Reuse
+; GFX90A-NEXT:    v_accvgpr_read_b32 v35, a33 ; Reload Reuse
 ; GFX90A-NEXT:    v_mov_b32_e32 v32, v1
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; def v[0:31] s[0:15]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_accvgpr_write_b32 a31, s15
-; GFX90A-NEXT:    v_accvgpr_write_b32 a30, s14
-; GFX90A-NEXT:    v_accvgpr_write_b32 a29, s13
-; GFX90A-NEXT:    v_accvgpr_write_b32 a28, s12
-; GFX90A-NEXT:    v_accvgpr_write_b32 a27, s11
-; GFX90A-NEXT:    v_accvgpr_write_b32 a26, s10
-; GFX90A-NEXT:    v_accvgpr_write_b32 a25, s9
-; GFX90A-NEXT:    v_accvgpr_write_b32 a24, s8
-; GFX90A-NEXT:    v_accvgpr_write_b32 a23, s7
-; GFX90A-NEXT:    v_accvgpr_write_b32 a22, s6
-; GFX90A-NEXT:    v_accvgpr_write_b32 a21, s5
-; GFX90A-NEXT:    v_accvgpr_write_b32 a20, s4
-; GFX90A-NEXT:    v_accvgpr_write_b32 a19, s3
-; GFX90A-NEXT:    v_accvgpr_write_b32 a18, s2
-; GFX90A-NEXT:    v_accvgpr_write_b32 a17, s1
-; GFX90A-NEXT:    v_accvgpr_write_b32 a16, s0
-; GFX90A-NEXT:    v_accvgpr_read_b32 v34, a32 ; Reload Reuse
-; GFX90A-NEXT:    s_nop 0
-; GFX90A-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[16:31]
+; GFX90A-NEXT:    v_accvgpr_write_b32 a33, s15
+; GFX90A-NEXT:    v_accvgpr_write_b32 a32, s14
+; GFX90A-NEXT:    v_accvgpr_write_b32 a31, s13
+; GFX90A-NEXT:    v_accvgpr_write_b32 a30, s12
+; GFX90A-NEXT:    v_accvgpr_write_b32 a29, s11
+; GFX90A-NEXT:    v_accvgpr_write_b32 a28, s10
+; GFX90A-NEXT:    v_accvgpr_write_b32 a27, s9
+; GFX90A-NEXT:    v_accvgpr_write_b32 a26, s8
+; GFX90A-NEXT:    v_accvgpr_write_b32 a25, s7
+; GFX90A-NEXT:    v_accvgpr_write_b32 a24, s6
+; GFX90A-NEXT:    v_accvgpr_write_b32 a23, s5
+; GFX90A-NEXT:    v_accvgpr_write_b32 a22, s4
+; GFX90A-NEXT:    v_accvgpr_write_b32 a21, s3
+; GFX90A-NEXT:    v_accvgpr_write_b32 a20, s2
+; GFX90A-NEXT:    v_accvgpr_write_b32 a19, s1
+; GFX90A-NEXT:    v_accvgpr_write_b32 a18, s0
+; GFX90A-NEXT:    s_nop 1
+; GFX90A-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v33, v32, a[18:33]
 ; GFX90A-NEXT:    s_nop 10
 ; GFX90A-NEXT:    buffer_store_dword a0, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GFX90A-NEXT:    s_nop 0
@@ -1091,15 +1100,15 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
 ; GFX90A-NEXT:    buffer_store_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX90A-NEXT:    buffer_store_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
 ; GFX90A-NEXT:    buffer_store_dword a10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX90A-NEXT:    v_accvgpr_read_b32 v39, a11 ; Reload Reuse
-; GFX90A-NEXT:    v_accvgpr_read_b32 v38, a12 ; Reload Reuse
-; GFX90A-NEXT:    v_accvgpr_read_b32 v37, a13 ; Reload Reuse
-; GFX90A-NEXT:    v_accvgpr_read_b32 v36, a14 ; Reload Reuse
-; GFX90A-NEXT:    v_accvgpr_read_b32 v35, a15 ; Reload Reuse
+; GFX90A-NEXT:    buffer_store_dword a11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX90A-NEXT:    v_accvgpr_read_b32 v39, a12 ; Reload Reuse
+; GFX90A-NEXT:    v_accvgpr_read_b32 v38, a13 ; Reload Reuse
+; GFX90A-NEXT:    v_accvgpr_read_b32 v37, a14 ; Reload Reuse
+; GFX90A-NEXT:    v_accvgpr_read_b32 v36, a15 ; Reload Reuse
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; copy
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_accvgpr_mov_b32 a32, a1
+; GFX90A-NEXT:    v_accvgpr_mov_b32 a16, a1
 ; GFX90A-NEXT:    buffer_load_dword a0, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    buffer_load_dword a1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    buffer_load_dword a2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -1111,12 +1120,14 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
 ; GFX90A-NEXT:    buffer_load_dword a8, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    buffer_load_dword a9, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
 ; GFX90A-NEXT:    buffer_load_dword a10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v39 ; Reload Reuse
-; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v38 ; Reload Reuse
-; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v37 ; Reload Reuse
-; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v36 ; Reload Reuse
+; GFX90A-NEXT:    buffer_load_dword a11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v39 ; Reload Reuse
+; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v38 ; Reload Reuse
+; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v37 ; Reload Reuse
+; GFX90A-NEXT:    v_accvgpr_write_b32 a33, v35 ; Reload Reuse
+; GFX90A-NEXT:    v_accvgpr_write_b32 a32, v34 ; Reload Reuse
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v35 ; Reload Reuse
+; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v36 ; Reload Reuse
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; copy
 ; GFX90A-NEXT:    ;;#ASMEND
@@ -1124,7 +1135,6 @@ define void @no_free_vgprs_at_sgpr_to_agpr_copy(float %v0, float %v1) #0 {
 ; GFX90A-NEXT:    ;;#ASMSTART
 ; GFX90A-NEXT:    ; use a3 v[0:31]
 ; GFX90A-NEXT:    ;;#ASMEND
-; GFX90A-NEXT:    v_accvgpr_write_b32 a32, v34 ; Reload Reuse
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
   %asm = call { <32 x i32>, <16 x float> } asm sideeffect "; def $0 $1","=${v[0:31]},=${s[0:15]}"()
   %vgpr0 = extractvalue { <32 x i32>, <16 x float> } %asm, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll
index cc4cc8efd66f4..4d9846f6bd3a1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll
@@ -24,7 +24,12 @@ define <4 x float> @default(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg
 ; VGPRRC-LABEL: default:
 ; VGPRRC:       ; %bb.0:
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VGPRRC-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT:    v_mfma_f32_16x16x32_f16 v[8:11], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT:    s_nop 7
+; VGPRRC-NEXT:    v_mov_b32_e32 v0, v8
+; VGPRRC-NEXT:    v_mov_b32_e32 v1, v9
+; VGPRRC-NEXT:    v_mov_b32_e32 v2, v10
+; VGPRRC-NEXT:    v_mov_b32_e32 v3, v11
 ; VGPRRC-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
   ret <4 x float> %result
@@ -50,7 +55,12 @@ define <4 x float> @request_agpr(<8 x half> %arg0, <8 x half> %arg1, <4 x float>
 ; VGPRRC-LABEL: request_agpr:
 ; VGPRRC:       ; %bb.0:
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VGPRRC-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT:    v_mfma_f32_16x16x32_f16 v[8:11], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT:    s_nop 7
+; VGPRRC-NEXT:    v_mov_b32_e32 v0, v8
+; VGPRRC-NEXT:    v_mov_b32_e32 v1, v9
+; VGPRRC-NEXT:    v_mov_b32_e32 v2, v10
+; VGPRRC-NEXT:    v_mov_b32_e32 v3, v11
 ; VGPRRC-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
   ret <4 x float> %result
@@ -60,13 +70,23 @@ define <4 x float> @request_no_agpr(<8 x half> %arg0, <8 x half> %arg1, <4 x flo
 ; HEURRC-LABEL: request_no_agpr:
 ; HEURRC:       ; %bb.0:
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; HEURRC-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; HEURRC-NEXT:    v_mfma_f32_16x16x32_f16 v[8:11], v[0:3], v[4:7], v[8:11]
+; HEURRC-NEXT:    s_nop 7
+; HEURRC-NEXT:    v_mov_b32_e32 v0, v8
+; HEURRC-NEXT:    v_mov_b32_e32 v1, v9
+; HEURRC-NEXT:    v_mov_b32_e32 v2, v10
+; HEURRC-NEXT:    v_mov_b32_e32 v3, v11
 ; HEURRC-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VGPRRC-LABEL: request_no_agpr:
 ; VGPRRC:       ; %bb.0:
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VGPRRC-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT:    v_mfma_f32_16x16x32_f16 v[8:11], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT:    s_nop 7
+; VGPRRC-NEXT:    v_mov_b32_e32 v0, v8
+; VGPRRC-NEXT:    v_mov_b32_e32 v1, v9
+; VGPRRC-NEXT:    v_mov_b32_e32 v2, v10
+; VGPRRC-NEXT:    v_mov_b32_e32 v3, v11
 ; VGPRRC-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
   ret <4 x float> %result
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index a1fe463de1c54..9309d5beca2c6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -15,7 +15,12 @@ define <4 x float> @test_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg
 ; GCN-LABEL: test_mfma_f32_16x16x32_f16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; GCN-NEXT:    v_mfma_f32_16x16x32_f16 v[8:11], v[0:3], v[4:7], v[8:11]
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v8
+; GCN-NEXT:    v_mov_b32_e32 v1, v9
+; GCN-NEXT:    v_mov_b32_e32 v2, v10
+; GCN-NEXT:    v_mov_b32_e32 v3, v11
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; HEURRC-LABEL: test_mfma_f32_16x16x32_f16:
@@ -37,7 +42,12 @@ define <4 x float> @test_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg
 ; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16:
 ; VGPRRC:       ; %bb.0:
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VGPRRC-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT:    v_mfma_f32_16x16x32_f16 v[8:11], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT:    s_nop 7
+; VGPRRC-NEXT:    v_mov_b32_e32 v0, v8
+; VGPRRC-NEXT:    v_mov_b32_e32 v1, v9
+; VGPRRC-NEXT:    v_mov_b32_e32 v2, v10
+; VGPRRC-NEXT:    v_mov_b32_e32 v3, v11
 ; VGPRRC-NEXT:    s_setpc_b64 s[30:31]
 ; AGPR-LABEL: test_mfma_f32_16x16x32_f16:
 ; AGPR:       ; %bb.0:
@@ -67,7 +77,12 @@ define <4 x float> @test_mfma_f32_16x16x32_f16__flags(<8 x half> %arg0, <8 x hal
 ; GCN-LABEL: test_mfma_f32_16x16x32_f16__flags:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1
+; GCN-NEXT:    v_mfma_f32_16x16x32_f16 v[8:11], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v8
+; GCN-NEXT:    v_mov_b32_e32 v1, v9
+; GCN-NEXT:    v_mov_b32_e32 v2, v10
+; GCN-NEXT:    v_mov_b32_e32 v3, v11
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; HEURRC-LABEL: test_mfma_f32_16x16x32_f16__flags:
@@ -89,7 +104,12 @@ define <4 x float> @test_mfma_f32_16x16x32_f16__flags(<8 x half> %arg0, <8 x hal
 ; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16__flags:
 ; VGPRRC:       ; %bb.0:
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VGPRRC-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1
+; VGPRRC-NEXT:    v_mfma_f32_16x16x32_f16 v[8:11], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1
+; VGPRRC-NEXT:    s_nop 7
+; VGPRRC-NEXT:    v_mov_b32_e32 v0, v8
+; VGPRRC-NEXT:    v_mov_b32_e32 v1, v9
+; VGPRRC-NEXT:    v_mov_b32_e32 v2, v10
+; VGPRRC-NEXT:    v_mov_b32_e32 v3, v11
 ; VGPRRC-NEXT:    s_setpc_b64 s[30:31]
 ; AGPR-LABEL: test_mfma_f32_16x16x32_f16__flags:
 ; AGPR:       ; %bb.0:
@@ -148,10 +168,10 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
 ; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
 ; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GISEL-NEXT:    v_mfma_f32_16x16x32_f16 v[8:11], v[0:3], v[4:7], v[8:11]
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    s_nop 6
-; GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[6:7]
 ; GISEL-NEXT:    s_endpgm
 ;
 ; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
@@ -267,10 +287,10 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
 ; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
 ; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GISEL-NEXT:    v_mfma_f32_16x16x32_f16 v[8:11], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    s_nop 6
-; GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[6:7]
 ; GISEL-NEXT:    s_endpgm
 ;
 ; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
@@ -2451,7 +2471,12 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4
 ; GCN-LABEL: test_mfma_i32_16x16x64_i8:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
+; GCN-NEXT:    v_mfma_i32_16x16x64_i8 v[8:11], v[0:3], v[4:7], v[8:11]
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v8
+; GCN-NEXT:    v_mov_b32_e32 v1, v9
+; GCN-NEXT:    v_mov_b32_e32 v2, v10
+; GCN-NEXT:    v_mov_b32_e32 v3, v11
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; HEURRC-LABEL: test_mfma_i32_16x16x64_i8:
@@ -2473,7 +2498,12 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4
 ; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8:
 ; VGPRRC:       ; %bb.0:
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VGPRRC-NEXT:    v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT:    v_mfma_i32_16x16x64_i8 v[8:11], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT:    s_nop 7
+; VGPRRC-NEXT:    v_mov_b32_e32 v0, v8
+; VGPRRC-NEXT:    v_mov_b32_e32 v1, v9
+; VGPRRC-NEXT:    v_mov_b32_e32 v2, v10
+; VGPRRC-NEXT:    v_mov_b32_e32 v3, v11
 ; VGPRRC-NEXT:    s_setpc_b64 s[30:31]
 ; AGPR-LABEL: test_mfma_i32_16x16x64_i8:
 ; AGPR:       ; %bb.0:
@@ -2503,7 +2533,12 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8__flags(<4 x i32> %arg0, <4 x i32> %a
 ; GCN-LABEL: test_mfma_i32_16x16x64_i8__flags:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1
+; GCN-NEXT:    v_mfma_i32_16x16x64_i8 v[8:11], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v8
+; GCN-NEXT:    v_mov_b32_e32 v1, v9
+; GCN-NEXT:    v_mov_b32_e32 v2, v10
+; GCN-NEXT:    v_mov_b32_e32 v3, v11
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; HEURRC-LABEL: test_mfma_i32_16x16x64_i8__flags:
@@ -2525,7 +2560,12 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8__flags(<4 x i32> %arg0, <4 x i32> %a
 ; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8__flags:
 ; VGPRRC:       ; %bb.0:
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VGPRRC-NEXT:    v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1
+; VGPRRC-NEXT:    v_mfma_i32_16x16x64_i8 v[8:11], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1
+; VGPRRC-NEXT:    s_nop 7
+; VGPRRC-NEXT:    v_mov_b32_e32 v0, v8
+; VGPRRC-NEXT:    v_mov_b32_e32 v1, v9
+; VGPRRC-NEXT:    v_mov_b32_e32 v2, v10
+; VGPRRC-NEXT:    v_mov_b32_e32 v3, v11
 ; VGPRRC-NEXT:    s_setpc_b64 s[30:31]
 ; AGPR-LABEL: test_mfma_i32_16x16x64_i8__flags:
 ; AGPR:       ; %bb.0:
@@ -2572,9 +2612,9 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
 ; SDAG-NEXT:    v_mov_b32_e32 v10, s2
 ; SDAG-NEXT:    v_mov_b32_e32 v11, s3
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
+; SDAG-NEXT:    v_mfma_i32_16x16x64_i8 v[8:11], v[0:3], v[4:7], v[8:11]
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    global_store_dwordx4 v12, v[0:3], s[6:7]
+; SDAG-NEXT:    global_store_dwordx4 v12, v[8:11], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
@@ -2590,10 +2630,10 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
 ; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
 ; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GISEL-NEXT:    v_mfma_i32_16x16x64_i8 v[8:11], v[0:3], v[4:7], v[8:11]
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    s_nop 6
-; GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[6:7]
 ; GISEL-NEXT:    s_endpgm
 ;
 ; HEURRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
@@ -2616,9 +2656,9 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
 ; HEURRC-NEXT:    v_mov_b32_e32 v10, s2
 ; HEURRC-NEXT:    v_mov_b32_e32 v11, s3
 ; HEURRC-NEXT:    s_nop 1
-; HEURRC-NEXT:    v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
+; HEURRC-NEXT:    v_mfma_i32_16x16x64_i8 v[8:11], v[0:3], v[4:7], v[8:11]
 ; HEURRC-NEXT:    s_nop 7
-; HEURRC-NEXT:    global_store_dwordx4 v12, v[0:3], s[6:7]
+; HEURRC-NEXT:    global_store_dwordx4 v12, v[8:11], s[6:7]
 ; HEURRC-NEXT:    s_endpgm
 ;
 ; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
@@ -2641,9 +2681,9 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
 ; VGPRRC-NEXT:    v_mov_b32_e32 v10, s2
 ; VGPRRC-NEXT:    v_mov_b32_e32 v11, s3
 ; VGPRRC-NEXT:    s_nop 1
-; VGPRRC-NEXT:    v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT:    v_mfma_i32_16x16x64_i8 v[8:11], v[0:3], v[4:7], v[8:11]
 ; VGPRRC-NEXT:    s_nop 7
-; VGPRRC-NEXT:    global_store_dwordx4 v12, v[0:3], s[6:7]
+; VGPRRC-NEXT:    global_store_dwordx4 v12, v[8:11], s[6:7]
 ; VGPRRC-NEXT:    s_endpgm
 ; AGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
 ; AGPR:       ; %bb.0:
@@ -2719,9 +2759,9 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
 ; SDAG-NEXT:    v_mov_b32_e32 v10, s2
 ; SDAG-NEXT:    v_mov_b32_e32 v11, s3
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT:    v_mfma_i32_16x16x64_i8 v[8:11], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    global_store_dwordx4 v12, v[0:3], s[6:7]
+; SDAG-NEXT:    global_store_dwordx4 v12, v[8:11], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
@@ -2737,10 +2777,10 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
 ; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
 ; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GISEL-NEXT:    v_mfma_i32_16x16x64_i8 v[8:11], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    s_nop 6
-; GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GISEL-NEXT:    global_store_dwordx4 v0, v[8:11], s[6:7]
 ; GISEL-NEXT:    s_endpgm
 ;
 ; HEURRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
@@ -2763,9 +2803,9 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
 ; HEURRC-NEXT:    v_mov_b32_e32 v10, s2
 ; HEURRC-NEXT:    v_mov_b32_e32 v11, s3
 ; HEURRC-NEXT:    s_nop 1
-; HEURRC-NEXT:    v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT:    v_mfma_i32_16x16x64_i8 v[8:11], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
 ; HEURRC-NEXT:    s_nop 7
-; HEURRC-NEXT:    global_store_dwordx4 v12, v[0:3], s[6:7]
+; HEURRC-NEXT:    global_store_dwordx4 v12, v[8:11], s[6:7]
 ; HEURRC-NEXT:    s_endpgm
 ;
 ; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
@@ -2788,9 +2828,9 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
 ; VGPRRC-NEXT:    v_mov_b32_e32 v10, s2
 ; VGPRRC-NEXT:    v_mov_b32_e32 v11, s3
 ; VGPRRC-NEXT:    s_nop 1
-; VGPRRC-NEXT:    v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; VGPRRC-NEXT:    v_mfma_i32_16x16x64_i8 v[8:11], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
 ; VGPRRC-NEXT:    s_nop 7
-; VGPRRC-NEXT:    global_store_dwordx4 v12, v[0:3], s[6:7]
+; VGPRRC-NEXT:    global_store_dwordx4 v12, v[8:11], s[6:7]
 ; VGPRRC-NEXT:    s_endpgm
 ; AGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
 ; AGPR:       ; %bb.0:
@@ -5135,7 +5175,12 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat>
 ; GCN-LABEL: test_mfma_f32_16x16x32_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
+; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v8
+; GCN-NEXT:    v_mov_b32_e32 v1, v9
+; GCN-NEXT:    v_mov_b32_e32 v2, v10
+; GCN-NEXT:    v_mov_b32_e32 v3, v11
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16:
@@ -5157,7 +5202,12 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat>
 ; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16:
 ; VGPRRC:       ; %bb.0:
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VGPRRC-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT:    v_mfma_f32_16x16x32_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT:    s_nop 7
+; VGPRRC-NEXT:    v_mov_b32_e32 v0, v8
+; VGPRRC-NEXT:    v_mov_b32_e32 v1, v9
+; VGPRRC-NEXT:    v_mov_b32_e32 v2, v10
+; VGPRRC-NEXT:    v_mov_b32_e32 v3, v11
 ; VGPRRC-NEXT:    s_setpc_b64 s[30:31]
 ; AGPR-LABEL: test_mfma_f32_16x16x32_bf16:
 ; AGPR:       ; %bb.0:
@@ -5187,7 +5237,12 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x
 ; GCN-LABEL: test_mfma_f32_16x16x32_bf16__flags:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1
+; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 v[8:11], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v8
+; GCN-NEXT:    v_mov_b32_e32 v1, v9
+; GCN-NEXT:    v_mov_b32_e32 v2, v10
+; GCN-NEXT:    v_mov_b32_e32 v3, v11
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16__flags:
@@ -5209,7 +5264,12 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x
 ; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16__flags:
 ; VGPRRC:       ; %bb.0:
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VGPRRC-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1
+; VGPRRC-NEXT:    v_mfma_f32_16x16x32_bf16 v[8:11], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1
+; VGPRRC-NEXT:    s_nop 7
+; VGPRRC-NEXT:    v_mov_b32_e32 v0, v8
+; VGPRRC-NEXT:    v_mov_b32_e32 v1, v9
+; VGPRRC-NEXT:    v_mov_b32_e32 v2, v10
+; VGPRRC-NEXT:    v_mov_b32_e32 v3, v11
 ; VGPRRC-NEXT:    s_setpc_b64 s[30:31]
 ; AGPR-LABEL: test_mfma_f32_16x16x32_bf16__flags:
 ; AGPR:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
index 97a89ec819bae..a73d7e97f7961 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
@@ -17,7 +17,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0]
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0]
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
@@ -30,7 +35,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0]
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0]
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
@@ -43,7 +53,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[1,1,0]
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[1,1,0]
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
@@ -56,7 +71,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
@@ -69,7 +89,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0]
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0]
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
@@ -82,7 +107,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
@@ -95,7 +125,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0]
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0]
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
@@ -108,7 +143,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0]
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0]
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
@@ -122,7 +162,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__cons
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19]
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19]
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
@@ -136,7 +181,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] blgp:1
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] blgp:1
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
@@ -150,7 +200,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__cons
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19] blgp:1
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19] blgp:1
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
@@ -164,7 +219,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] blgp:2
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[14:17], v[0:7], v[8:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] blgp:2
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v14
+; GCN-NEXT:    v_mov_b32_e32 v1, v15
+; GCN-NEXT:    v_mov_b32_e32 v2, v16
+; GCN-NEXT:    v_mov_b32_e32 v3, v17
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
@@ -178,7 +238,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__cons
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:13], v[14:17] blgp:2
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[14:17], v[0:7], v[8:13], v[14:17] blgp:2
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v14
+; GCN-NEXT:    v_mov_b32_e32 v1, v15
+; GCN-NEXT:    v_mov_b32_e32 v2, v16
+; GCN-NEXT:    v_mov_b32_e32 v3, v17
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
@@ -192,7 +257,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] blgp:3
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[14:17], v[0:7], v[8:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] blgp:3
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v14
+; GCN-NEXT:    v_mov_b32_e32 v1, v15
+; GCN-NEXT:    v_mov_b32_e32 v2, v16
+; GCN-NEXT:    v_mov_b32_e32 v3, v17
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
@@ -206,7 +276,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__cons
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:13], v[14:17] blgp:3
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[14:17], v[0:7], v[8:13], v[14:17] blgp:3
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v14
+; GCN-NEXT:    v_mov_b32_e32 v1, v15
+; GCN-NEXT:    v_mov_b32_e32 v2, v16
+; GCN-NEXT:    v_mov_b32_e32 v3, v17
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
@@ -220,7 +295,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] blgp:4
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[12:15], v[0:7], v[8:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] blgp:4
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
@@ -234,7 +314,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__cons
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:11], v[12:15] blgp:4
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[12:15], v[0:7], v[8:11], v[12:15] blgp:4
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
@@ -248,7 +333,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] cbsz:1
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] cbsz:1
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 1, ; cbsz
@@ -262,7 +352,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__cons
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19] cbsz:1
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19] cbsz:1
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 1, ; cbsz
@@ -276,7 +371,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 1, ; cbsz
@@ -291,7 +391,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__cons
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19] cbsz:1 blgp:1
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19] cbsz:1 blgp:1
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 1, ; cbsz
@@ -305,7 +410,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[14:17], v[0:7], v[8:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v14
+; GCN-NEXT:    v_mov_b32_e32 v1, v15
+; GCN-NEXT:    v_mov_b32_e32 v2, v16
+; GCN-NEXT:    v_mov_b32_e32 v3, v17
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 1, ; cbsz
@@ -318,7 +428,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__cons
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:13], v[14:17] cbsz:1 blgp:2
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[14:17], v[0:7], v[8:13], v[14:17] cbsz:1 blgp:2
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v14
+; GCN-NEXT:    v_mov_b32_e32 v1, v15
+; GCN-NEXT:    v_mov_b32_e32 v2, v16
+; GCN-NEXT:    v_mov_b32_e32 v3, v17
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 1, ; cbsz
@@ -332,7 +447,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[14:17], v[0:7], v[8:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v14
+; GCN-NEXT:    v_mov_b32_e32 v1, v15
+; GCN-NEXT:    v_mov_b32_e32 v2, v16
+; GCN-NEXT:    v_mov_b32_e32 v3, v17
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 1, ; cbsz
@@ -346,7 +466,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__cons
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:13], v[14:17] cbsz:1 blgp:3
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[14:17], v[0:7], v[8:13], v[14:17] cbsz:1 blgp:3
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v14
+; GCN-NEXT:    v_mov_b32_e32 v1, v15
+; GCN-NEXT:    v_mov_b32_e32 v2, v16
+; GCN-NEXT:    v_mov_b32_e32 v3, v17
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 1, ; cbsz
@@ -360,7 +485,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[12:15], v[0:7], v[8:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 1, ; cbsz
@@ -374,7 +504,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__cons
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:11], v[12:15] cbsz:1 blgp:4
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[12:15], v[0:7], v[8:11], v[12:15] cbsz:1 blgp:4
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 1, ; cbsz
@@ -388,7 +523,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] cbsz:2
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[14:17], v[0:5], v[6:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] cbsz:2
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v14
+; GCN-NEXT:    v_mov_b32_e32 v1, v15
+; GCN-NEXT:    v_mov_b32_e32 v2, v16
+; GCN-NEXT:    v_mov_b32_e32 v3, v17
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 2, ; cbsz
@@ -402,7 +542,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__cons
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:13], v[14:17] cbsz:2
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[14:17], v[0:5], v[6:13], v[14:17] cbsz:2
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v14
+; GCN-NEXT:    v_mov_b32_e32 v1, v15
+; GCN-NEXT:    v_mov_b32_e32 v2, v16
+; GCN-NEXT:    v_mov_b32_e32 v3, v17
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 2, ; cbsz
@@ -416,7 +561,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[14:17], v[0:5], v[6:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v14
+; GCN-NEXT:    v_mov_b32_e32 v1, v15
+; GCN-NEXT:    v_mov_b32_e32 v2, v16
+; GCN-NEXT:    v_mov_b32_e32 v3, v17
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 2, ; cbsz
@@ -430,7 +580,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__cons
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:13], v[14:17] cbsz:2 blgp:1
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[14:17], v[0:5], v[6:13], v[14:17] cbsz:2 blgp:1
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v14
+; GCN-NEXT:    v_mov_b32_e32 v1, v15
+; GCN-NEXT:    v_mov_b32_e32 v2, v16
+; GCN-NEXT:    v_mov_b32_e32 v3, v17
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 2, ; cbsz
@@ -444,7 +599,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[12:15], v[0:5], v[6:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 2, ; cbsz
@@ -458,7 +618,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__cons
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:11], v[12:15] cbsz:2 blgp:2
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[12:15], v[0:5], v[6:11], v[12:15] cbsz:2 blgp:2
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 2, ; cbsz
@@ -472,7 +637,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3(<6 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[12:15], v[0:5], v[6:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 2, ; cbsz
@@ -486,7 +656,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__cons
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:11], v[12:15] cbsz:2 blgp:3
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[12:15], v[0:5], v[6:11], v[12:15] cbsz:2 blgp:3
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 2, ; cbsz
@@ -501,7 +676,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] cbsz:3
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[14:17], v[0:5], v[6:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] cbsz:3
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v14
+; GCN-NEXT:    v_mov_b32_e32 v1, v15
+; GCN-NEXT:    v_mov_b32_e32 v2, v16
+; GCN-NEXT:    v_mov_b32_e32 v3, v17
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 3, ; cbsz
@@ -515,7 +695,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__cons
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:13], v[14:17] cbsz:3
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[14:17], v[0:5], v[6:13], v[14:17] cbsz:3
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v14
+; GCN-NEXT:    v_mov_b32_e32 v1, v15
+; GCN-NEXT:    v_mov_b32_e32 v2, v16
+; GCN-NEXT:    v_mov_b32_e32 v3, v17
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 3, ; cbsz
@@ -529,7 +714,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[14:17], v[0:5], v[6:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v14
+; GCN-NEXT:    v_mov_b32_e32 v1, v15
+; GCN-NEXT:    v_mov_b32_e32 v2, v16
+; GCN-NEXT:    v_mov_b32_e32 v3, v17
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 3, ; cbsz
@@ -543,7 +733,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__cons
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:13], v[14:17] cbsz:3 blgp:1
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[14:17], v[0:5], v[6:13], v[14:17] cbsz:3 blgp:1
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v14
+; GCN-NEXT:    v_mov_b32_e32 v1, v15
+; GCN-NEXT:    v_mov_b32_e32 v2, v16
+; GCN-NEXT:    v_mov_b32_e32 v3, v17
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 3, ; cbsz
@@ -557,7 +752,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2(<6 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[12:15], v[0:5], v[6:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 3, ; cbsz
@@ -571,7 +771,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__cons
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:11], v[12:15] cbsz:3 blgp:2
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[12:15], v[0:5], v[6:11], v[12:15] cbsz:3 blgp:2
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 3, ; cbsz
@@ -585,7 +790,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4(<6 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:9], v[10:13], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[10:13], v[0:5], v[6:9], v[10:13], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v10
+; GCN-NEXT:    v_mov_b32_e32 v1, v11
+; GCN-NEXT:    v_mov_b32_e32 v2, v12
+; GCN-NEXT:    v_mov_b32_e32 v3, v13
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 3, ; cbsz
@@ -599,7 +809,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__cons
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:9], v[10:13] cbsz:3 blgp:4
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[10:13], v[0:5], v[6:9], v[10:13] cbsz:3 blgp:4
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v10
+; GCN-NEXT:    v_mov_b32_e32 v1, v11
+; GCN-NEXT:    v_mov_b32_e32 v2, v12
+; GCN-NEXT:    v_mov_b32_e32 v3, v13
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 3, ; cbsz
@@ -613,7 +828,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3(<6 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[12:15], v[0:5], v[6:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 3, ; cbsz
@@ -627,7 +847,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__cons
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:11], v[12:15] cbsz:3 blgp:3
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[12:15], v[0:5], v[6:11], v[12:15] cbsz:3 blgp:3
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 3, ; cbsz
@@ -641,7 +866,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:9], v[10:13], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[10:13], v[0:5], v[6:9], v[10:13], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v10
+; GCN-NEXT:    v_mov_b32_e32 v1, v11
+; GCN-NEXT:    v_mov_b32_e32 v2, v12
+; GCN-NEXT:    v_mov_b32_e32 v3, v13
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 2, ; cbsz
@@ -655,7 +885,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__cons
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:9], v[10:13] cbsz:2 blgp:4
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[10:13], v[0:5], v[6:9], v[10:13] cbsz:2 blgp:4
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v10
+; GCN-NEXT:    v_mov_b32_e32 v1, v11
+; GCN-NEXT:    v_mov_b32_e32 v2, v12
+; GCN-NEXT:    v_mov_b32_e32 v3, v13
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 2, ; cbsz
@@ -669,7 +904,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] cbsz:4
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[12:15], v[0:3], v[4:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] cbsz:4
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 4, ; cbsz
@@ -683,7 +923,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__cons
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:11], v[12:15] cbsz:4
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[12:15], v[0:3], v[4:11], v[12:15] cbsz:4
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 4, ; cbsz
@@ -697,7 +942,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[12:15], v[0:3], v[4:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 4, ; cbsz
@@ -711,7 +961,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__cons
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:11], v[12:15] cbsz:4 blgp:1
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[12:15], v[0:3], v[4:11], v[12:15] cbsz:4 blgp:1
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 4, ; cbsz
@@ -725,7 +980,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2(<4 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:9], v[10:13], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[10:13], v[0:3], v[4:9], v[10:13], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v10
+; GCN-NEXT:    v_mov_b32_e32 v1, v11
+; GCN-NEXT:    v_mov_b32_e32 v2, v12
+; GCN-NEXT:    v_mov_b32_e32 v3, v13
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 4, ; cbsz
@@ -739,7 +999,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__cons
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:9], v[10:13] cbsz:4 blgp:2
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[10:13], v[0:3], v[4:9], v[10:13] cbsz:4 blgp:2
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v10
+; GCN-NEXT:    v_mov_b32_e32 v1, v11
+; GCN-NEXT:    v_mov_b32_e32 v2, v12
+; GCN-NEXT:    v_mov_b32_e32 v3, v13
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 4, ; cbsz
@@ -753,7 +1018,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3(<4 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:9], v[10:13], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[10:13], v[0:3], v[4:9], v[10:13], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v10
+; GCN-NEXT:    v_mov_b32_e32 v1, v11
+; GCN-NEXT:    v_mov_b32_e32 v2, v12
+; GCN-NEXT:    v_mov_b32_e32 v3, v13
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 4, ; cbsz
@@ -767,7 +1037,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__cons
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:9], v[10:13] cbsz:4 blgp:3
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[10:13], v[0:3], v[4:9], v[10:13] cbsz:4 blgp:3
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v10
+; GCN-NEXT:    v_mov_b32_e32 v1, v11
+; GCN-NEXT:    v_mov_b32_e32 v2, v12
+; GCN-NEXT:    v_mov_b32_e32 v3, v13
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 4, ; cbsz
@@ -781,7 +1056,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:7], v[8:11], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[8:11], v[0:3], v[4:7], v[8:11], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v8
+; GCN-NEXT:    v_mov_b32_e32 v1, v9
+; GCN-NEXT:    v_mov_b32_e32 v2, v10
+; GCN-NEXT:    v_mov_b32_e32 v3, v11
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 4, ; cbsz
@@ -795,7 +1075,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__cons
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:4 blgp:4
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[8:11], v[0:3], v[4:7], v[8:11] cbsz:4 blgp:4
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v8
+; GCN-NEXT:    v_mov_b32_e32 v1, v9
+; GCN-NEXT:    v_mov_b32_e32 v2, v10
+; GCN-NEXT:    v_mov_b32_e32 v3, v11
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 4, ; cbsz
@@ -815,7 +1100,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_
 ; GCN-NEXT:    v_mov_b32_e32 v20, s0
 ; GCN-NEXT:    v_mov_b32_e32 v21, s1
 ; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0]
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0]
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
   ret <4 x float> %result
@@ -827,7 +1117,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v21, s0
 ; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v20 op_sel_hi:[0,0,0]
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v21, v20 op_sel_hi:[0,0,0]
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
   ret <4 x float> %result
@@ -839,7 +1134,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v21, s0
 ; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0]
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0]
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
   ret <4 x float> %result
@@ -872,7 +1172,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
 ; SDAG-NEXT:    v_mov_b32_e32 v6, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v7, s0
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[16:23], v[8:15], v[4:7], v2, v3 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[4:7], v[16:23], v[8:15], v[4:7], v2, v3 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    s_nop 11
+; SDAG-NEXT:    v_mov_b32_e32 v0, v4
+; SDAG-NEXT:    v_mov_b32_e32 v1, v5
+; SDAG-NEXT:    v_mov_b32_e32 v2, v6
+; SDAG-NEXT:    v_mov_b32_e32 v3, v7
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs:
@@ -901,7 +1206,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
 ; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[30:31]
 ; GISEL-NEXT:    v_readlane_b32 s31, v24, 1
 ; GISEL-NEXT:    v_readlane_b32 s30, v24, 0
-; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, v3 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[20:23], v[4:11], v[12:19], v[20:23], v2, v3 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    s_nop 11
+; GISEL-NEXT:    v_mov_b32_e32 v0, v20
+; GISEL-NEXT:    v_mov_b32_e32 v1, v21
+; GISEL-NEXT:    v_mov_b32_e32 v2, v22
+; GISEL-NEXT:    v_mov_b32_e32 v3, v23
 ; GISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
 ; GISEL-NEXT:    scratch_load_dword v24, off, s32 ; 4-byte Folded Reload
 ; GISEL-NEXT:    s_mov_b64 exec, s[0:1]
@@ -925,7 +1235,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
 ; SDAG-NEXT:    v_mov_b32_e32 v21, s19
 ; SDAG-NEXT:    v_mov_b32_e32 v13, s20
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[14:21], v[0:7], v[8:11], v13, v12 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[8:11], v[14:21], v[0:7], v[8:11], v13, v12 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    s_nop 11
+; SDAG-NEXT:    v_mov_b32_e32 v0, v8
+; SDAG-NEXT:    v_mov_b32_e32 v1, v9
+; SDAG-NEXT:    v_mov_b32_e32 v2, v10
+; SDAG-NEXT:    v_mov_b32_e32 v3, v11
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr:
@@ -941,7 +1256,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
 ; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[12:13]
 ; GISEL-NEXT:    v_mov_b32_e32 v13, s20
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[14:21], v[0:7], v[8:11], v13, v12 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[8:11], v[14:21], v[0:7], v[8:11], v13, v12 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    s_nop 11
+; GISEL-NEXT:    v_mov_b32_e32 v0, v8
+; GISEL-NEXT:    v_mov_b32_e32 v1, v9
+; GISEL-NEXT:    v_mov_b32_e32 v2, v10
+; GISEL-NEXT:    v_mov_b32_e32 v3, v11
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
   ret <4 x float> %result
@@ -961,7 +1281,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
 ; SDAG-NEXT:    v_mov_b32_e32 v21, s19
 ; SDAG-NEXT:    v_mov_b32_e32 v13, s20
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[14:21], v[0:7], v[8:11], v12, v13 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[8:11], v[14:21], v[0:7], v[8:11], v12, v13 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    s_nop 11
+; SDAG-NEXT:    v_mov_b32_e32 v0, v8
+; SDAG-NEXT:    v_mov_b32_e32 v1, v9
+; SDAG-NEXT:    v_mov_b32_e32 v2, v10
+; SDAG-NEXT:    v_mov_b32_e32 v3, v11
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr:
@@ -977,7 +1302,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
 ; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[12:13]
 ; GISEL-NEXT:    v_mov_b32_e32 v13, s20
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[14:21], v[0:7], v[8:11], v12, v13 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[8:11], v[14:21], v[0:7], v[8:11], v12, v13 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    s_nop 11
+; GISEL-NEXT:    v_mov_b32_e32 v0, v8
+; GISEL-NEXT:    v_mov_b32_e32 v1, v9
+; GISEL-NEXT:    v_mov_b32_e32 v2, v10
+; GISEL-NEXT:    v_mov_b32_e32 v3, v11
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
   ret <4 x float> %result
@@ -997,7 +1327,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
 ; SDAG-NEXT:    v_mov_b32_e32 v21, s19
 ; SDAG-NEXT:    v_mov_b32_e32 v13, s20
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[14:21], v[8:11], v12, v13 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[8:11], v[0:7], v[14:21], v[8:11], v12, v13 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    s_nop 11
+; SDAG-NEXT:    v_mov_b32_e32 v0, v8
+; SDAG-NEXT:    v_mov_b32_e32 v1, v9
+; SDAG-NEXT:    v_mov_b32_e32 v2, v10
+; SDAG-NEXT:    v_mov_b32_e32 v3, v11
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
@@ -1013,7 +1348,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
 ; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[12:13]
 ; GISEL-NEXT:    v_mov_b32_e32 v13, s20
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[14:21], v[8:11], v12, v13 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[8:11], v[0:7], v[14:21], v[8:11], v12, v13 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    s_nop 11
+; GISEL-NEXT:    v_mov_b32_e32 v0, v8
+; GISEL-NEXT:    v_mov_b32_e32 v1, v9
+; GISEL-NEXT:    v_mov_b32_e32 v2, v10
+; GISEL-NEXT:    v_mov_b32_e32 v3, v11
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
   ret <4 x float> %result
@@ -1029,7 +1369,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp
 ; SDAG-NEXT:    v_mov_b32_e32 v21, s3
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s16
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[18:21], v16, v17 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[18:21], v[0:7], v[8:15], v[18:21], v16, v17 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    s_nop 11
+; SDAG-NEXT:    v_mov_b32_e32 v0, v18
+; SDAG-NEXT:    v_mov_b32_e32 v1, v19
+; SDAG-NEXT:    v_mov_b32_e32 v2, v20
+; SDAG-NEXT:    v_mov_b32_e32 v3, v21
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr:
@@ -1039,7 +1384,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgp
 ; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[0:1]
 ; GISEL-NEXT:    v_mov_b32_e32 v17, s16
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[18:21], v16, v17 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[18:21], v[0:7], v[8:15], v[18:21], v16, v17 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    s_nop 11
+; GISEL-NEXT:    v_mov_b32_e32 v0, v18
+; GISEL-NEXT:    v_mov_b32_e32 v1, v19
+; GISEL-NEXT:    v_mov_b32_e32 v2, v20
+; GISEL-NEXT:    v_mov_b32_e32 v3, v21
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
   ret <4 x float> %result
@@ -1063,7 +1413,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
 ; SDAG-NEXT:    v_mov_b32_e32 v13, s23
 ; SDAG-NEXT:    v_mov_b32_e32 v9, s24
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[14:21], v[0:7], v[10:13], v8, v9 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[10:13], v[14:21], v[0:7], v[10:13], v8, v9 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    s_nop 11
+; SDAG-NEXT:    v_mov_b32_e32 v0, v10
+; SDAG-NEXT:    v_mov_b32_e32 v1, v11
+; SDAG-NEXT:    v_mov_b32_e32 v2, v12
+; SDAG-NEXT:    v_mov_b32_e32 v3, v13
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
@@ -1081,7 +1436,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
 ; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[22:23]
 ; GISEL-NEXT:    v_mov_b32_e32 v9, s24
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[10:17], v[0:7], v[18:21], v8, v9 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[18:21], v[10:17], v[0:7], v[18:21], v8, v9 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    s_nop 11
+; GISEL-NEXT:    v_mov_b32_e32 v0, v18
+; GISEL-NEXT:    v_mov_b32_e32 v1, v19
+; GISEL-NEXT:    v_mov_b32_e32 v2, v20
+; GISEL-NEXT:    v_mov_b32_e32 v3, v21
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
   ret <4 x float> %result
@@ -1094,7 +1454,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__
 ; SDAG-NEXT:    v_mov_b32_e32 v20, -2
 ; SDAG-NEXT:    v_mov_b32_e32 v21, 33
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v20 op_sel_hi:[1,1,0]
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v21, v20 op_sel_hi:[1,1,0]
+; SDAG-NEXT:    s_nop 11
+; SDAG-NEXT:    v_mov_b32_e32 v0, v16
+; SDAG-NEXT:    v_mov_b32_e32 v1, v17
+; SDAG-NEXT:    v_mov_b32_e32 v2, v18
+; SDAG-NEXT:    v_mov_b32_e32 v3, v19
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
@@ -1103,7 +1468,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__
 ; GISEL-NEXT:    v_mov_b32_e32 v20, 33
 ; GISEL-NEXT:    v_mov_b32_e32 v21, -2
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[1,1,0]
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[1,1,0]
+; GISEL-NEXT:    s_nop 11
+; GISEL-NEXT:    v_mov_b32_e32 v0, v16
+; GISEL-NEXT:    v_mov_b32_e32 v1, v17
+; GISEL-NEXT:    v_mov_b32_e32 v2, v18
+; GISEL-NEXT:    v_mov_b32_e32 v3, v19
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2)
   ret <4 x float> %result
@@ -1116,7 +1486,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
 ; SDAG-NEXT:    v_mov_b32_e32 v20, -2
 ; SDAG-NEXT:    v_mov_b32_e32 v21, 0x41
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v20 op_sel_hi:[1,1,0]
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v21, v20 op_sel_hi:[1,1,0]
+; SDAG-NEXT:    s_nop 11
+; SDAG-NEXT:    v_mov_b32_e32 v0, v16
+; SDAG-NEXT:    v_mov_b32_e32 v1, v17
+; SDAG-NEXT:    v_mov_b32_e32 v2, v18
+; SDAG-NEXT:    v_mov_b32_e32 v3, v19
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm:
@@ -1125,7 +1500,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
 ; GISEL-NEXT:    v_mov_b32_e32 v20, 0x41
 ; GISEL-NEXT:    v_mov_b32_e32 v21, -2
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[1,1,0]
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[1,1,0]
+; GISEL-NEXT:    s_nop 11
+; GISEL-NEXT:    v_mov_b32_e32 v0, v16
+; GISEL-NEXT:    v_mov_b32_e32 v1, v17
+; GISEL-NEXT:    v_mov_b32_e32 v2, v18
+; GISEL-NEXT:    v_mov_b32_e32 v3, v19
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 -2)
   ret <4 x float> %result
@@ -1138,7 +1518,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
 ; SDAG-NEXT:    v_mov_b32_e32 v20, 0x4d
 ; SDAG-NEXT:    v_mov_b32_e32 v21, 0x41
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v20 op_sel_hi:[1,1,0]
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v21, v20 op_sel_hi:[1,1,0]
+; SDAG-NEXT:    s_nop 11
+; SDAG-NEXT:    v_mov_b32_e32 v0, v16
+; SDAG-NEXT:    v_mov_b32_e32 v1, v17
+; SDAG-NEXT:    v_mov_b32_e32 v2, v18
+; SDAG-NEXT:    v_mov_b32_e32 v3, v19
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
@@ -1147,7 +1532,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
 ; GISEL-NEXT:    v_mov_b32_e32 v20, 0x41
 ; GISEL-NEXT:    v_mov_b32_e32 v21, 0x4d
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[1,1,0]
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[1,1,0]
+; GISEL-NEXT:    s_nop 11
+; GISEL-NEXT:    v_mov_b32_e32 v0, v16
+; GISEL-NEXT:    v_mov_b32_e32 v1, v17
+; GISEL-NEXT:    v_mov_b32_e32 v2, v18
+; GISEL-NEXT:    v_mov_b32_e32 v3, v19
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77)
   ret <4 x float> %result
@@ -1184,9 +1574,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
 ; SDAG-NEXT:    v_mov_b32_e32 v21, s12
 ; SDAG-NEXT:    v_mov_b32_e32 v22, s13
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v22 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v21, v22 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    global_store_dwordx4 v20, v[0:3], s[14:15]
+; SDAG-NEXT:    global_store_dwordx4 v20, v[16:19], s[14:15]
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd:
@@ -1207,10 +1597,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
 ; GISEL-NEXT:    v_mov_b32_e32 v20, s28
 ; GISEL-NEXT:    v_mov_b32_e32 v21, s29
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    s_nop 10
-; GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[30:31]
+; GISEL-NEXT:    global_store_dwordx4 v0, v[16:19], s[30:31]
 ; GISEL-NEXT:    s_endpgm
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 2, i32 3, i32 %scale0, i32 1, i32 %scale1)
   store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -1246,9 +1636,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
 ; SDAG-NEXT:    v_mov_b32_e32 v15, s23
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[0:1]
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    global_store_dwordx4 v20, v[0:3], s[6:7]
+; SDAG-NEXT:    global_store_dwordx4 v20, v[16:19], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
@@ -1270,10 +1660,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
 ; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[22:23]
 ; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[0:1]
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    s_nop 10
-; GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GISEL-NEXT:    global_store_dwordx4 v0, v[16:19], s[6:7]
 ; GISEL-NEXT:    s_endpgm
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 -2)
   store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -1309,9 +1699,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
 ; SDAG-NEXT:    v_mov_b32_e32 v15, s23
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[0:1]
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    global_store_dwordx4 v20, v[0:3], s[6:7]
+; SDAG-NEXT:    global_store_dwordx4 v20, v[16:19], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_kimm__scaleB__FP_literal:
@@ -1333,10 +1723,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
 ; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[22:23]
 ; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[0:1]
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    s_nop 10
-; GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GISEL-NEXT:    global_store_dwordx4 v0, v[16:19], s[6:7]
 ; GISEL-NEXT:    s_endpgm
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 65, i32 1, i32 1065353216)
   store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -1372,9 +1762,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
 ; SDAG-NEXT:    v_mov_b32_e32 v15, s23
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[0:1]
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    global_store_dwordx4 v20, v[0:3], s[6:7]
+; SDAG-NEXT:    global_store_dwordx4 v20, v[16:19], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__inline_imm:
@@ -1396,10 +1786,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
 ; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[22:23]
 ; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[0:1]
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    s_nop 10
-; GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GISEL-NEXT:    global_store_dwordx4 v0, v[16:19], s[6:7]
 ; GISEL-NEXT:    s_endpgm
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 -2)
   store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -1435,9 +1825,9 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
 ; SDAG-NEXT:    v_mov_b32_e32 v15, s23
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[0:1]
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    global_store_dwordx4 v20, v[0:3], s[6:7]
+; SDAG-NEXT:    global_store_dwordx4 v20, v[16:19], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA_FP_literal__scaleB__FP_literal:
@@ -1459,10 +1849,10 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
 ; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[22:23]
 ; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[0:1]
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GISEL-NEXT:    s_nop 10
-; GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GISEL-NEXT:    global_store_dwordx4 v0, v[16:19], s[6:7]
 ; GISEL-NEXT:    s_endpgm
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 1065353216, i32 1, i32 1042479491)
   store <4 x float> %result, ptr addrspace(1) %ptr, align 16
@@ -1474,7 +1864,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a(
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19]
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19]
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
   ret <4 x float> %result
@@ -1485,7 +1880,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b(
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19]
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19]
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0)
   ret <4 x float> %result
@@ -1498,7 +1898,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8
 ; SDAG-NEXT:    v_mov_b32_e32 v20, 1
 ; SDAG-NEXT:    v_mov_b32_e32 v21, 0
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v20 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v21, v20 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    s_nop 11
+; SDAG-NEXT:    v_mov_b32_e32 v0, v16
+; SDAG-NEXT:    v_mov_b32_e32 v1, v17
+; SDAG-NEXT:    v_mov_b32_e32 v2, v18
+; SDAG-NEXT:    v_mov_b32_e32 v3, v19
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1:
@@ -1507,7 +1912,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8
 ; GISEL-NEXT:    v_mov_b32_e32 v20, 0
 ; GISEL-NEXT:    v_mov_b32_e32 v21, 1
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    s_nop 11
+; GISEL-NEXT:    v_mov_b32_e32 v0, v16
+; GISEL-NEXT:    v_mov_b32_e32 v1, v17
+; GISEL-NEXT:    v_mov_b32_e32 v2, v18
+; GISEL-NEXT:    v_mov_b32_e32 v3, v19
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
   ret <4 x float> %result
@@ -1520,7 +1930,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(
 ; SDAG-NEXT:    v_mov_b32_e32 v20, 0
 ; SDAG-NEXT:    v_mov_b32_e32 v21, 1
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v20 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v21, v20 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    s_nop 11
+; SDAG-NEXT:    v_mov_b32_e32 v0, v16
+; SDAG-NEXT:    v_mov_b32_e32 v1, v17
+; SDAG-NEXT:    v_mov_b32_e32 v2, v18
+; SDAG-NEXT:    v_mov_b32_e32 v3, v19
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a:
@@ -1529,7 +1944,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(
 ; GISEL-NEXT:    v_mov_b32_e32 v20, 1
 ; GISEL-NEXT:    v_mov_b32_e32 v21, 0
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    s_nop 11
+; GISEL-NEXT:    v_mov_b32_e32 v0, v16
+; GISEL-NEXT:    v_mov_b32_e32 v1, v17
+; GISEL-NEXT:    v_mov_b32_e32 v2, v18
+; GISEL-NEXT:    v_mov_b32_e32 v3, v19
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0)
   ret <4 x float> %result
@@ -1543,7 +1963,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6(
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] blgp:2
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] blgp:2
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
@@ -1556,7 +1981,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8(
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] cbsz:2
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] cbsz:2
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 2, ; cbsz
@@ -1569,7 +1999,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6(
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 2, ; cbsz
@@ -1582,7 +2017,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6_
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19] cbsz:2 blgp:2
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19] cbsz:2 blgp:2
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 2, ; cbsz
@@ -1595,7 +2035,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4(
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] blgp:4
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] blgp:4
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
@@ -1608,7 +2053,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8(
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] cbsz:4
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] cbsz:4
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 4, ; cbsz
@@ -1621,7 +2071,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4(
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] blgp:4
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[14:17], v[0:7], v[8:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] blgp:4
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v14
+; GCN-NEXT:    v_mov_b32_e32 v1, v15
+; GCN-NEXT:    v_mov_b32_e32 v2, v16
+; GCN-NEXT:    v_mov_b32_e32 v3, v17
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
@@ -1634,7 +2089,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8(
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] cbsz:4
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[14:17], v[0:5], v[6:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] cbsz:4
+; GCN-NEXT:    s_nop 11
+; GCN-NEXT:    v_mov_b32_e32 v0, v14
+; GCN-NEXT:    v_mov_b32_e32 v1, v15
+; GCN-NEXT:    v_mov_b32_e32 v2, v16
+; GCN-NEXT:    v_mov_b32_e32 v3, v17
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 4, ; cbsz
@@ -1647,7 +2107,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4(
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 4, ; cbsz
@@ -1660,7 +2125,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4_
 ; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19] cbsz:4 blgp:4
+; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 v[16:19], v[0:7], v[8:15], v[16:19] cbsz:4 blgp:4
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v16
+; GCN-NEXT:    v_mov_b32_e32 v1, v17
+; GCN-NEXT:    v_mov_b32_e32 v2, v18
+; GCN-NEXT:    v_mov_b32_e32 v3, v19
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 4, ; cbsz
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
index 47ebd072c4cc7..3f96143231801 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
@@ -397,28 +397,29 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 {
 ; CHECK-NEXT:    v_mov_b32_e32 v9, v8
 ; CHECK-NEXT:    v_mov_b32_e32 v10, v8
 ; CHECK-NEXT:    v_mov_b32_e32 v11, v8
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[20:23], v[26:27], v[26:27], v[16:19]
 ; CHECK-NEXT:    v_cvt_f16_f32_e32 v2, v6
 ; CHECK-NEXT:    v_mov_b64_e32 v[0:1], 0
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[8:11], v[26:27], v[26:27], v[8:11]
 ; CHECK-NEXT:    global_store_short v[0:1], v2, off
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[8:11], v[26:27], v[26:27], v[8:11]
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[28:29], v[16:19]
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[8:11]
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[20:23], v[26:27], v[26:27], v[16:19]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[16:19], v[26:27], v[28:29], v[16:19]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[8:11], v[26:27], v[26:27], v[8:11]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[16:19], v[26:27], v[26:27], v[16:19]
 ; CHECK-NEXT:    s_nop 5
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v10, v6
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v10, v8
 ; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[6:9], v[26:27], v[26:27], v[12:15]
 ; CHECK-NEXT:    global_store_short v[0:1], v10, off
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[26:27], v[2:5]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[2:5], v[26:27], v[26:27], v[20:23]
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
 ; CHECK-NEXT:    s_nop 1
 ; CHECK-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; CHECK-NEXT:    global_store_short v[0:1], v6, off
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[16:19], v[26:27], v[26:27], v[20:23]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[26:27], v[16:19]
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
@@ -426,16 +427,14 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 {
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[2:5], v[28:29], v[26:27], v[2:5]
-; CHECK-NEXT:    s_nop 6
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v6, v2
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[2:5], v[30:31], v[26:27], v[16:19]
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v6, v16
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[16:19], v[30:31], v[26:27], v[2:5]
 ; CHECK-NEXT:    global_store_short v[0:1], v6, off
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
 ; CHECK-NEXT:    s_nop 2
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v2, v16
 ; CHECK-NEXT:    global_store_short v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
 entry:
@@ -506,10 +505,10 @@ define void @test_rewrite_mfma_subreg_insert1(float %arg0, float %arg1, ptr addr
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[2:5]
+; CHECK-NEXT:    v_mfma_f32_4x4x1_16b_f32 v[2:5], v0, v1, v[2:5]
 ; CHECK-NEXT:    s_nop 3
-; CHECK-NEXT:    v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0]
-; CHECK-NEXT:    v_accvgpr_write_b32 a2, v3
+; CHECK-NEXT:    v_pk_mov_b32 v[0:1], v[2:3], v[4:5] op_sel:[1,0]
+; CHECK-NEXT:    v_accvgpr_write_b32 a2, v5
 ; CHECK-NEXT:    v_accvgpr_write_b32 a0, v0
 ; CHECK-NEXT:    v_accvgpr_write_b32 a1, v1
 ; CHECK-NEXT:    ;;#ASMSTART
@@ -688,11 +687,11 @@ define void @test_rewrite_mfma_copy_from_agpr_class_f64_4x4x4f64(double %arg0, d
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_and_b32_e32 v8, 0x3ff, v31
 ; CHECK-NEXT:    v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], v[2:3], a[0:1]
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 3, v8
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0
-; CHECK-NEXT:    v_lshl_add_u64 v[2:3], v[4:5], 0, v[2:3]
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 3, v8
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_lshl_add_u64 v[0:1], v[4:5], 0, v[0:1]
 ; CHECK-NEXT:    s_nop 5
-; CHECK-NEXT:    global_store_dwordx2 v[2:3], a[0:1], off
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], a[0:1], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %src2 = call double asm sideeffect "; def $0", "=a"()
@@ -712,13 +711,13 @@ define void @test_rewrite_mfma_copy_from_agpr_class_f64_4x4x4f64_chain(double %a
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    s_nop 0
 ; CHECK-NEXT:    v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], v[2:3], a[0:1]
-; CHECK-NEXT:    v_and_b32_e32 v2, 0x3ff, v31
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; CHECK-NEXT:    v_mov_b32_e32 v3, 0
-; CHECK-NEXT:    v_lshl_add_u64 v[2:3], v[8:9], 0, v[2:3]
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_lshl_add_u64 v[0:1], v[8:9], 0, v[0:1]
 ; CHECK-NEXT:    v_mfma_f64_4x4x4_4b_f64 a[0:1], v[4:5], v[6:7], a[0:1]
 ; CHECK-NEXT:    s_nop 8
-; CHECK-NEXT:    global_store_dwordx2 v[2:3], a[0:1], off
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], a[0:1], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %src2 = call double asm sideeffect "; def $0", "=a"()
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index 20789232b1f25..504386bdb05a9 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -882,9 +882,9 @@ define amdgpu_kernel void @v8i8_mfma_i8(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX942-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
 ; GFX942-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mfma_i32_16x16x32_i8 v[2:5], v[2:3], v[2:3], v[4:7] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT:    v_mfma_i32_16x16x32_i8 v[4:7], v[2:3], v[2:3], v[4:7] cbsz:1 abid:2 blgp:3
 ; GFX942-NEXT:    s_nop 6
-; GFX942-NEXT:    global_store_dwordx4 v1, v[2:5], s[12:13]
+; GFX942-NEXT:    global_store_dwordx4 v1, v[4:7], s[12:13]
 ; GFX942-NEXT:    s_endpgm
 entry:
   %idx = call i32 @llvm.amdgcn.workitem.id.x()



More information about the llvm-commits mailing list