[llvm-branch-commits] [llvm] AMDGPU: Remove unnecessary AGPR operand legalization (PR #162093)

Matt Arsenault via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Mon Oct 6 07:25:59 PDT 2025


https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/162093

The operands need to be correct to begin with, this doesn't
depend on the context of other operands. AV registers are not used
for the vdst/src2 registers.

>From c94ce69dd57855ac42b9881db9e7be1a3f5096ec Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sun, 5 Oct 2025 23:54:58 +0900
Subject: [PATCH] AMDGPU: Remove unnecessary AGPR operand legalization

The operands need to be correct to begin with, this doesn't
depend on the context of other operands. AV registers are not used
for the vdst/src2 registers.
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  77 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll |  80 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll | 112 +--
 .../AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll    | 158 ++--
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 672 +++++++------
 .../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll       | 895 ++++++++----------
 .../AMDGPU/rewrite-vgpr-mfma-to-agpr.ll       |  77 +-
 7 files changed, 930 insertions(+), 1141 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 24a54e7c6e960..1a686a902727c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17346,75 +17346,24 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
 
   MachineFunction *MF = MI.getParent()->getParent();
   MachineRegisterInfo &MRI = MF->getRegInfo();
-  SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
 
   if (TII->isVOP3(MI.getOpcode())) {
     // Make sure constant bus requirements are respected.
     TII->legalizeOperandsVOP3(MRI, MI);
 
-    // Prefer VGPRs over AGPRs in mAI instructions where possible.
-    // This saves a chain-copy of registers and better balance register
-    // use between vgpr and agpr as agpr tuples tend to be big.
-    if (!MI.getDesc().operands().empty()) {
-      unsigned Opc = MI.getOpcode();
-      bool HasAGPRs =
-          !Subtarget->hasGFX90AInsts() || Info->getMinNumAGPRs() != 0;
-      const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
-      int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
-      for (auto I :
-           {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
-            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
-        if (I == -1)
-          break;
-        if ((I == Src2Idx) && (HasAGPRs))
-          break;
-        MachineOperand &Op = MI.getOperand(I);
-        if (!Op.isReg() || !Op.getReg().isVirtual())
-          continue;
-        auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
-        if (!TRI->hasAGPRs(RC))
-          continue;
-        auto *Src = MRI.getUniqueVRegDef(Op.getReg());
-        if (!Src || !Src->isCopy() ||
-            !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
-          continue;
-        auto *NewRC = TRI->getEquivalentVGPRClass(RC);
-        // All uses of agpr64 and agpr32 can also accept vgpr except for
-        // v_accvgpr_read, but we do not produce agpr reads during selection,
-        // so no use checks are needed.
-        MRI.setRegClass(Op.getReg(), NewRC);
-      }
-
-      if (TII->isMAI(MI)) {
-        // The ordinary src0, src1, src2 were legalized above.
-        //
-        // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
-        // as a separate instruction.
-        int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
-                                                 AMDGPU::OpName::scale_src0);
-        if (Src0Idx != -1) {
-          int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
-                                                   AMDGPU::OpName::scale_src1);
-          if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
-              TII->usesConstantBus(MRI, MI, Src1Idx))
-            TII->legalizeOpWithMove(MI, Src1Idx);
-        }
-      }
-
-      if (!HasAGPRs)
-        return;
-
-      // Resolve the rest of AV operands to AGPRs.
-      if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
-        if (Src2->isReg() && Src2->getReg().isVirtual()) {
-          auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
-          if (TRI->isVectorSuperClass(RC)) {
-            auto *NewRC = TRI->getEquivalentAGPRClass(RC);
-            MRI.setRegClass(Src2->getReg(), NewRC);
-            if (Src2->isTied())
-              MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
-          }
-        }
+    if (TII->isMAI(MI)) {
+      // The ordinary src0, src1, src2 were legalized above.
+      //
+      // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
+      // as a separate instruction.
+      int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+                                               AMDGPU::OpName::scale_src0);
+      if (Src0Idx != -1) {
+        int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+                                                 AMDGPU::OpName::scale_src1);
+        if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
+            TII->usesConstantBus(MRI, MI, Src1Idx))
+          TII->legalizeOpWithMove(MI, Src1Idx);
       }
     }
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index 5ab8706f28f5f..22bc62acce15d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -726,12 +726,12 @@ define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double
 ; GFX90A-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[4:5], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    v_mfma_f64_4x4x4f64 v[4:5], v[0:1], v[2:3], 0
+; GFX90A-VGPR-NEXT:    v_mfma_f64_4x4x4f64 v[0:1], v[2:3], v[4:5], 0
 ; GFX90A-VGPR-NEXT:    s_nop 3
-; GFX90A-VGPR-NEXT:    v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT:    v_mfma_f64_4x4x4f64 v[0:1], v[2:3], v[4:5], v[0:1] cbsz:1 abid:2 blgp:3
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-VGPR-NEXT:    s_nop 7
 ; GFX90A-VGPR-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -742,12 +742,12 @@ define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], s[6:7]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f64_4x4x4_4b_f64 v[4:5], v[0:1], v[2:3], 0
+; GFX942-VGPR-NEXT:    v_mfma_f64_4x4x4_4b_f64 v[0:1], v[2:3], v[4:5], 0
 ; GFX942-VGPR-NEXT:    s_nop 3
-; GFX942-VGPR-NEXT:    v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT:    v_mfma_f64_4x4x4_4b_f64 v[0:1], v[2:3], v[4:5], v[0:1] cbsz:1 abid:2 neg:[1,1,0]
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX942-VGPR-NEXT:    s_nop 7
 ; GFX942-VGPR-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -765,10 +765,10 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
 ; GFX90A-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
 ; GFX90A-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x34
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v2, s10
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s10
 ; GFX90A-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
-; GFX90A-NEXT:    v_mov_b32_e32 v3, s11
-; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s11
+; GFX90A-NEXT:    v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a0, s0
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a1, s1
@@ -779,7 +779,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a6, s6
 ; GFX90A-NEXT:    v_accvgpr_write_b32 a7, s7
 ; GFX90A-NEXT:    s_nop 1
-; GFX90A-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_nop 15
 ; GFX90A-NEXT:    s_nop 0
@@ -792,10 +792,10 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
 ; GFX942-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
 ; GFX942-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x34
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-NEXT:    v_mov_b32_e32 v2, s10
+; GFX942-NEXT:    v_mov_b32_e32 v0, s10
 ; GFX942-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
-; GFX942-NEXT:    v_mov_b32_e32 v3, s11
-; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-NEXT:    v_mov_b32_e32 v1, s11
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    v_accvgpr_write_b32 a0, s0
 ; GFX942-NEXT:    v_accvgpr_write_b32 a1, s1
@@ -806,7 +806,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
 ; GFX942-NEXT:    v_accvgpr_write_b32 a6, s6
 ; GFX942-NEXT:    v_accvgpr_write_b32 a7, s7
 ; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT:    v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
 ; GFX942-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-NEXT:    s_nop 15
 ; GFX942-NEXT:    s_nop 0
@@ -819,17 +819,17 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
 ; GFX90A-VGPR-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
 ; GFX90A-VGPR-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x34
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v10, s10
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v8, s10
 ; GFX90A-VGPR-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v11, s11
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v9, s11
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 0
@@ -842,17 +842,17 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
 ; GFX942-VGPR-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
 ; GFX942-VGPR-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x34
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v10, s10
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, s10
 ; GFX942-VGPR-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v11, s11
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], s[12:13]
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v9, s11
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], s[12:13]
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 0
@@ -1629,20 +1629,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, 0x3ff00000
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v12, s2
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v13, s3
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v10, s2
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v11, s3
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
 ; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 1
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
@@ -1657,20 +1657,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, 0x3ff00000
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v12, s2
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v13, s3
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v10, s2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v11, s3
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], s[6:7]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
@@ -1743,20 +1743,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v1, 0x405ec000
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX90A-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v12, s2
-; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v13, s3
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v10, s2
+; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v11, s3
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX90A-VGPR-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
 ; GFX90A-VGPR-NEXT:    s_nop 1
-; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX90A-VGPR-NEXT:    v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
 ; GFX90A-VGPR-NEXT:    s_nop 15
 ; GFX90A-VGPR-NEXT:    s_nop 1
 ; GFX90A-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
@@ -1771,20 +1771,20 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v1, 0x405ec000
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX942-VGPR-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v12, s2
-; GFX942-VGPR-NEXT:    v_mov_b32_e32 v13, s3
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v10, s2
+; GFX942-VGPR-NEXT:    v_mov_b32_e32 v11, s3
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v4, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v5, v1
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v6, v0
 ; GFX942-VGPR-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[8:9], v[6:7]
-; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[12:13], s[6:7]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[4:5], v[2:3]
 ; GFX942-VGPR-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
 ; GFX942-VGPR-NEXT:    s_nop 1
-; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
+; GFX942-VGPR-NEXT:    v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
 ; GFX942-VGPR-NEXT:    s_nop 15
 ; GFX942-VGPR-NEXT:    s_nop 1
 ; GFX942-VGPR-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
index dc4c929124fec..2fb677eccc4b3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
@@ -1445,20 +1445,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <
 ; GFX942-SDAG:       ; %bb.0: ; %bb
 ; GFX942-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX942-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x44
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
-; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
-; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, s6
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[14:15]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
 ; GFX942-SDAG-NEXT:    s_nop 1
-; GFX942-SDAG-NEXT:    v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    v_smfmac_f32_16x16x32_f16 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
 ; GFX942-SDAG-NEXT:    s_nop 6
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v6, v[8:11], s[8:9]
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[8:9]
 ; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_smfmac_f32_16x16x32_f16:
@@ -1485,20 +1485,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <
 ; GFX950-SDAG:       ; %bb.0: ; %bb
 ; GFX950-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX950-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x44
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v6, 0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
-; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
-; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v7, s6
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[14:15]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
 ; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    v_smfmac_f32_16x16x32_f16 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
 ; GFX950-SDAG-NEXT:    s_nop 7
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v6, v[8:11], s[8:9]
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[8:9]
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_smfmac_f32_16x16x32_f16:
@@ -1577,11 +1577,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
 ; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
 ; GFX942-SDAG-NEXT:    s_load_dword s24, s[4:5], 0x44
 ; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[22:23], s[18:19]
 ; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s24
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[20:21]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[22:23]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, s24
 ; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
@@ -1592,7 +1592,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX942-SDAG-NEXT:    s_nop 1
-; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x16_f16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
 ; GFX942-SDAG-NEXT:    s_nop 9
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
@@ -1635,11 +1635,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
 ; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
 ; GFX950-SDAG-NEXT:    s_load_dword s24, s[4:5], 0x44
 ; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[22:23], s[18:19]
 ; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s24
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[20:21]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[22:23]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s24
 ; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
@@ -1650,7 +1650,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_f16(ptr addrspace(1) %arg, <
 ; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
 ; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x16_f16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x16_f16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
 ; GFX950-SDAG-NEXT:    s_nop 10
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
@@ -1847,20 +1847,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg,
 ; GFX942-SDAG:       ; %bb.0: ; %bb
 ; GFX942-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX942-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x44
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
-; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
-; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, s6
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[14:15]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
 ; GFX942-SDAG-NEXT:    s_nop 1
-; GFX942-SDAG-NEXT:    v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    v_smfmac_f32_16x16x32_bf16 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
 ; GFX942-SDAG-NEXT:    s_nop 6
-; GFX942-SDAG-NEXT:    global_store_dwordx4 v6, v[8:11], s[8:9]
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[8:9]
 ; GFX942-SDAG-NEXT:    s_endpgm
 ;
 ; GFX942-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16:
@@ -1887,20 +1887,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg,
 ; GFX950-SDAG:       ; %bb.0: ; %bb
 ; GFX950-SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
 ; GFX950-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x44
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v6, 0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
-; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
-; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v7, s6
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[12:13]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[14:15]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, s6
 ; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
 ; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    v_smfmac_f32_16x16x32_bf16 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
 ; GFX950-SDAG-NEXT:    s_nop 7
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v6, v[8:11], s[8:9]
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v0, v[6:9], s[8:9]
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16:
@@ -1979,11 +1979,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
 ; GFX942-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
 ; GFX942-SDAG-NEXT:    s_load_dword s24, s[4:5], 0x44
 ; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[22:23], s[18:19]
 ; GFX942-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, s24
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[20:21]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[22:23]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, s24
 ; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
@@ -1994,7 +1994,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
 ; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX942-SDAG-NEXT:    s_nop 1
-; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-SDAG-NEXT:    v_smfmac_f32_32x32x16_bf16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
 ; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, 0
 ; GFX942-SDAG-NEXT:    s_nop 9
 ; GFX942-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
@@ -2037,11 +2037,11 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
 ; GFX950-SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
 ; GFX950-SDAG-NEXT:    s_load_dword s24, s[4:5], 0x44
 ; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[22:23], s[18:19]
 ; GFX950-SDAG-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
-; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
-; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s24
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[20:21]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[22:23]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s24
 ; GFX950-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
@@ -2052,7 +2052,7 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x16_bf16(ptr addrspace(1) %arg,
 ; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
 ; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
 ; GFX950-SDAG-NEXT:    s_nop 1
-; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x16_bf16 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-SDAG-NEXT:    v_smfmac_f32_32x32x16_bf16 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, 0
 ; GFX950-SDAG-NEXT:    s_nop 10
 ; GFX950-SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
index 033a35f69a0bd..13a96cfa6e650 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
@@ -15,15 +15,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
 ; GCN-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GCN-NEXT:    v_mov_b64_e32 v[8:9], 48
-; GCN-NEXT:    v_mov_b64_e32 v[10:11], 32
-; GCN-NEXT:    v_mov_b64_e32 v[12:13], 16
+; GCN-NEXT:    v_mov_b64_e32 v[0:1], 48
+; GCN-NEXT:    v_mov_b64_e32 v[2:3], 32
+; GCN-NEXT:    v_mov_b64_e32 v[4:5], 16
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
-; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
-; GCN-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
+; GCN-NEXT:    v_mov_b64_e32 v[8:9], s[24:25]
+; GCN-NEXT:    v_mov_b64_e32 v[10:11], s[26:27]
+; GCN-NEXT:    v_mov_b64_e32 v[12:13], s[28:29]
 ; GCN-NEXT:    v_accvgpr_write_b32 a0, s8
-; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
+; GCN-NEXT:    v_mov_b64_e32 v[14:15], s[30:31]
 ; GCN-NEXT:    v_accvgpr_write_b32 a1, s9
 ; GCN-NEXT:    v_accvgpr_write_b32 a2, s10
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, s11
@@ -41,40 +41,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, s23
 ; GCN-NEXT:    v_mov_b32_e32 v16, s16
 ; GCN-NEXT:    v_mov_b32_e32 v17, s17
-; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15]
+; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[16:31], v[8:11], v[12:15], a[0:15]
 ; GCN-NEXT:    v_mov_b32_e32 v18, s18
 ; GCN-NEXT:    v_mov_b32_e32 v19, s19
-; GCN-NEXT:    v_mov_b32_e32 v0, s20
-; GCN-NEXT:    v_mov_b32_e32 v1, s21
-; GCN-NEXT:    v_mov_b32_e32 v2, s22
-; GCN-NEXT:    v_mov_b32_e32 v3, s23
-; GCN-NEXT:    v_mov_b64_e32 v[14:15], 0
+; GCN-NEXT:    v_mov_b32_e32 v8, s20
+; GCN-NEXT:    v_mov_b32_e32 v9, s21
+; GCN-NEXT:    v_mov_b32_e32 v10, s22
+; GCN-NEXT:    v_mov_b32_e32 v11, s23
+; GCN-NEXT:    v_mov_b64_e32 v[6:7], 0
 ; GCN-NEXT:    s_nop 4
-; GCN-NEXT:    global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NEXT:    v_mov_b32_e32 v1, s9
 ; GCN-NEXT:    v_mov_b32_e32 v2, s10
 ; GCN-NEXT:    v_mov_b32_e32 v3, s11
-; GCN-NEXT:    global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s12
 ; GCN-NEXT:    v_mov_b32_e32 v1, s13
 ; GCN-NEXT:    v_mov_b32_e32 v2, s14
 ; GCN-NEXT:    v_mov_b32_e32 v3, s15
-; GCN-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_endpgm
   %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
@@ -88,15 +87,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
 ; GCN-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GCN-NEXT:    v_mov_b64_e32 v[8:9], 48
-; GCN-NEXT:    v_mov_b64_e32 v[10:11], 32
-; GCN-NEXT:    v_mov_b64_e32 v[12:13], 16
+; GCN-NEXT:    v_mov_b64_e32 v[0:1], 48
+; GCN-NEXT:    v_mov_b64_e32 v[2:3], 32
+; GCN-NEXT:    v_mov_b64_e32 v[4:5], 16
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
-; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
-; GCN-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
+; GCN-NEXT:    v_mov_b64_e32 v[8:9], s[24:25]
+; GCN-NEXT:    v_mov_b64_e32 v[10:11], s[26:27]
+; GCN-NEXT:    v_mov_b64_e32 v[12:13], s[28:29]
 ; GCN-NEXT:    v_accvgpr_write_b32 a0, s8
-; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
+; GCN-NEXT:    v_mov_b64_e32 v[14:15], s[30:31]
 ; GCN-NEXT:    v_accvgpr_write_b32 a1, s9
 ; GCN-NEXT:    v_accvgpr_write_b32 a2, s10
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, s11
@@ -114,40 +113,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, s23
 ; GCN-NEXT:    v_mov_b32_e32 v16, s16
 ; GCN-NEXT:    v_mov_b32_e32 v17, s17
-; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
+; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1
 ; GCN-NEXT:    v_mov_b32_e32 v18, s18
 ; GCN-NEXT:    v_mov_b32_e32 v19, s19
-; GCN-NEXT:    v_mov_b32_e32 v0, s20
-; GCN-NEXT:    v_mov_b32_e32 v1, s21
-; GCN-NEXT:    v_mov_b32_e32 v2, s22
-; GCN-NEXT:    v_mov_b32_e32 v3, s23
-; GCN-NEXT:    v_mov_b64_e32 v[14:15], 0
+; GCN-NEXT:    v_mov_b32_e32 v8, s20
+; GCN-NEXT:    v_mov_b32_e32 v9, s21
+; GCN-NEXT:    v_mov_b32_e32 v10, s22
+; GCN-NEXT:    v_mov_b32_e32 v11, s23
+; GCN-NEXT:    v_mov_b64_e32 v[6:7], 0
 ; GCN-NEXT:    s_nop 4
-; GCN-NEXT:    global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s8
 ; GCN-NEXT:    v_mov_b32_e32 v1, s9
 ; GCN-NEXT:    v_mov_b32_e32 v2, s10
 ; GCN-NEXT:    v_mov_b32_e32 v3, s11
-; GCN-NEXT:    global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s12
 ; GCN-NEXT:    v_mov_b32_e32 v1, s13
 ; GCN-NEXT:    v_mov_b32_e32 v2, s14
 ; GCN-NEXT:    v_mov_b32_e32 v3, s15
-; GCN-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_endpgm
   %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 2, i32 3, i32 1)
@@ -250,13 +248,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg
 ; GCN-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
 ; GCN-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GCN-NEXT:    v_mov_b32_e32 v44, 0
+; GCN-NEXT:    v_mov_b32_e32 v36, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b64_e32 v[34:35], s[26:27]
-; GCN-NEXT:    v_mov_b64_e32 v[32:33], s[24:25]
-; GCN-NEXT:    v_mov_b64_e32 v[38:39], s[30:31]
+; GCN-NEXT:    v_mov_b64_e32 v[40:41], s[26:27]
+; GCN-NEXT:    v_mov_b64_e32 v[38:39], s[24:25]
+; GCN-NEXT:    v_mov_b64_e32 v[44:45], s[30:31]
 ; GCN-NEXT:    v_mov_b64_e32 v[30:31], s[22:23]
-; GCN-NEXT:    v_mov_b64_e32 v[36:37], s[28:29]
+; GCN-NEXT:    v_mov_b64_e32 v[42:43], s[28:29]
 ; GCN-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
 ; GCN-NEXT:    v_mov_b64_e32 v[26:27], s[18:19]
 ; GCN-NEXT:    v_mov_b64_e32 v[24:25], s[16:17]
@@ -264,41 +262,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg
 ; GCN-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
 ; GCN-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
 ; GCN-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
-; GCN-NEXT:    v_mov_b32_e32 v40, s20
-; GCN-NEXT:    v_mov_b32_e32 v41, s21
-; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31]
-; GCN-NEXT:    v_mov_b32_e32 v42, s22
-; GCN-NEXT:    v_mov_b32_e32 v43, s23
-; GCN-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT:    v_mov_b32_e32 v32, s20
+; GCN-NEXT:    v_mov_b32_e32 v33, s21
+; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 v[0:15], v[38:41], v[42:45], v[16:31]
+; GCN-NEXT:    v_mov_b32_e32 v34, s22
+; GCN-NEXT:    v_mov_b32_e32 v35, s23
+; GCN-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 2
 ; GCN-NEXT:    v_mov_b32_e32 v16, s16
 ; GCN-NEXT:    v_mov_b32_e32 v17, s17
 ; GCN-NEXT:    v_mov_b32_e32 v18, s18
 ; GCN-NEXT:    v_mov_b32_e32 v19, s19
-; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mov_b32_e32 v16, s12
 ; GCN-NEXT:    v_mov_b32_e32 v17, s13
 ; GCN-NEXT:    v_mov_b32_e32 v18, s14
 ; GCN-NEXT:    v_mov_b32_e32 v19, s15
-; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mov_b32_e32 v16, s8
 ; GCN-NEXT:    v_mov_b32_e32 v17, s9
 ; GCN-NEXT:    v_mov_b32_e32 v18, s10
 ; GCN-NEXT:    v_mov_b32_e32 v19, s11
-; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_endpgm
   %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
@@ -313,13 +311,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa
 ; GCN-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
 ; GCN-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GCN-NEXT:    v_mov_b32_e32 v44, 0
+; GCN-NEXT:    v_mov_b32_e32 v36, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b64_e32 v[34:35], s[26:27]
-; GCN-NEXT:    v_mov_b64_e32 v[32:33], s[24:25]
-; GCN-NEXT:    v_mov_b64_e32 v[38:39], s[30:31]
+; GCN-NEXT:    v_mov_b64_e32 v[40:41], s[26:27]
+; GCN-NEXT:    v_mov_b64_e32 v[38:39], s[24:25]
+; GCN-NEXT:    v_mov_b64_e32 v[44:45], s[30:31]
 ; GCN-NEXT:    v_mov_b64_e32 v[30:31], s[22:23]
-; GCN-NEXT:    v_mov_b64_e32 v[36:37], s[28:29]
+; GCN-NEXT:    v_mov_b64_e32 v[42:43], s[28:29]
 ; GCN-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
 ; GCN-NEXT:    v_mov_b64_e32 v[26:27], s[18:19]
 ; GCN-NEXT:    v_mov_b64_e32 v[24:25], s[16:17]
@@ -327,41 +325,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa
 ; GCN-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
 ; GCN-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
 ; GCN-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
-; GCN-NEXT:    v_mov_b32_e32 v40, s20
-; GCN-NEXT:    v_mov_b32_e32 v41, s21
-; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; GCN-NEXT:    v_mov_b32_e32 v42, s22
-; GCN-NEXT:    v_mov_b32_e32 v43, s23
-; GCN-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT:    v_mov_b32_e32 v32, s20
+; GCN-NEXT:    v_mov_b32_e32 v33, s21
+; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 v[0:15], v[38:41], v[42:45], v[16:31] cbsz:1 abid:2 blgp:3
+; GCN-NEXT:    v_mov_b32_e32 v34, s22
+; GCN-NEXT:    v_mov_b32_e32 v35, s23
+; GCN-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 2
 ; GCN-NEXT:    v_mov_b32_e32 v16, s16
 ; GCN-NEXT:    v_mov_b32_e32 v17, s17
 ; GCN-NEXT:    v_mov_b32_e32 v18, s18
 ; GCN-NEXT:    v_mov_b32_e32 v19, s19
-; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mov_b32_e32 v16, s12
 ; GCN-NEXT:    v_mov_b32_e32 v17, s13
 ; GCN-NEXT:    v_mov_b32_e32 v18, s14
 ; GCN-NEXT:    v_mov_b32_e32 v19, s15
-; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    v_mov_b32_e32 v16, s8
 ; GCN-NEXT:    v_mov_b32_e32 v17, s9
 ; GCN-NEXT:    v_mov_b32_e32 v18, s10
 ; GCN-NEXT:    v_mov_b32_e32 v19, s11
-; GCN-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
+; GCN-NEXT:    global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_endpgm
   %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 1, i32 2, i32 3)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 753206206180a..ab0000f6831b6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -141,18 +141,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
 ; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; SDAG-NEXT:    v_mov_b32_e32 v12, 0
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[8:9]
+; SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[10:11]
+; SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[12:13]
+; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[14:15]
+; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; SDAG-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3]
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    global_store_dwordx4 v12, v[0:3], s[6:7]
+; SDAG-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
@@ -179,18 +179,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
 ; HEURRC-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; HEURRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; HEURRC-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; HEURRC-NEXT:    v_mov_b32_e32 v12, 0
+; HEURRC-NEXT:    v_mov_b32_e32 v4, 0
 ; HEURRC-NEXT:    s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; HEURRC-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; HEURRC-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; HEURRC-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; HEURRC-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], s[8:9]
+; HEURRC-NEXT:    v_mov_b64_e32 v[8:9], s[10:11]
+; HEURRC-NEXT:    v_mov_b64_e32 v[10:11], s[12:13]
+; HEURRC-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; HEURRC-NEXT:    v_mov_b64_e32 v[12:13], s[14:15]
+; HEURRC-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; HEURRC-NEXT:    s_nop 1
-; HEURRC-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; HEURRC-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3]
 ; HEURRC-NEXT:    s_nop 7
-; HEURRC-NEXT:    global_store_dwordx4 v12, v[0:3], s[6:7]
+; HEURRC-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; HEURRC-NEXT:    s_endpgm
 ;
 ; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
@@ -198,18 +198,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
 ; VGPRRC-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; VGPRRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; VGPRRC-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; VGPRRC-NEXT:    v_mov_b32_e32 v12, 0
+; VGPRRC-NEXT:    v_mov_b32_e32 v4, 0
 ; VGPRRC-NEXT:    s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[6:7], s[8:9]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], s[10:11]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[10:11], s[12:13]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[12:13], s[14:15]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; VGPRRC-NEXT:    s_nop 1
-; VGPRRC-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3]
 ; VGPRRC-NEXT:    s_nop 7
-; VGPRRC-NEXT:    global_store_dwordx4 v12, v[0:3], s[6:7]
+; VGPRRC-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; VGPRRC-NEXT:    s_endpgm
 ; AGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
 ; AGPR:       ; %bb.0:
@@ -260,18 +260,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
 ; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; SDAG-NEXT:    v_mov_b32_e32 v12, 0
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[8:9]
+; SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[10:11]
+; SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[12:13]
+; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[14:15]
+; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    global_store_dwordx4 v12, v[0:3], s[6:7]
+; SDAG-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
@@ -298,18 +298,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
 ; HEURRC-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; HEURRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; HEURRC-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; HEURRC-NEXT:    v_mov_b32_e32 v12, 0
+; HEURRC-NEXT:    v_mov_b32_e32 v4, 0
 ; HEURRC-NEXT:    s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; HEURRC-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; HEURRC-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; HEURRC-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; HEURRC-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], s[8:9]
+; HEURRC-NEXT:    v_mov_b64_e32 v[8:9], s[10:11]
+; HEURRC-NEXT:    v_mov_b64_e32 v[10:11], s[12:13]
+; HEURRC-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; HEURRC-NEXT:    v_mov_b64_e32 v[12:13], s[14:15]
+; HEURRC-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; HEURRC-NEXT:    s_nop 1
-; HEURRC-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
 ; HEURRC-NEXT:    s_nop 7
-; HEURRC-NEXT:    global_store_dwordx4 v12, v[0:3], s[6:7]
+; HEURRC-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; HEURRC-NEXT:    s_endpgm
 ;
 ; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
@@ -317,18 +317,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
 ; VGPRRC-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; VGPRRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; VGPRRC-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; VGPRRC-NEXT:    v_mov_b32_e32 v12, 0
+; VGPRRC-NEXT:    v_mov_b32_e32 v4, 0
 ; VGPRRC-NEXT:    s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[6:7], s[8:9]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], s[10:11]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[10:11], s[12:13]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[12:13], s[14:15]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; VGPRRC-NEXT:    s_nop 1
-; VGPRRC-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; VGPRRC-NEXT:    v_mfma_f32_16x16x32_f16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
 ; VGPRRC-NEXT:    s_nop 7
-; VGPRRC-NEXT:    global_store_dwordx4 v12, v[0:3], s[6:7]
+; VGPRRC-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; VGPRRC-NEXT:    s_endpgm
 ; AGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
 ; AGPR:       ; %bb.0:
@@ -382,15 +382,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
 ; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT:    v_mov_b64_e32 v[8:9], 48
-; SDAG-NEXT:    v_mov_b64_e32 v[10:11], 32
-; SDAG-NEXT:    v_mov_b64_e32 v[12:13], 16
+; SDAG-NEXT:    v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT:    v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT:    v_mov_b64_e32 v[4:5], 16
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
-; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
-; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
+; SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[24:25]
+; SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[26:27]
+; SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[28:29]
 ; SDAG-NEXT:    v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
+; SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[30:31]
 ; SDAG-NEXT:    v_accvgpr_write_b32 a1, s9
 ; SDAG-NEXT:    v_accvgpr_write_b32 a2, s10
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, s11
@@ -408,40 +408,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
 ; SDAG-NEXT:    v_accvgpr_write_b32 a15, s23
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s16
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s17
-; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
+; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15]
 ; SDAG-NEXT:    v_mov_b32_e32 v18, s18
 ; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    v_mov_b32_e32 v0, s20
-; SDAG-NEXT:    v_mov_b32_e32 v1, s21
-; SDAG-NEXT:    v_mov_b32_e32 v2, s22
-; SDAG-NEXT:    v_mov_b32_e32 v3, s23
-; SDAG-NEXT:    v_mov_b64_e32 v[14:15], 0
+; SDAG-NEXT:    v_mov_b32_e32 v8, s20
+; SDAG-NEXT:    v_mov_b32_e32 v9, s21
+; SDAG-NEXT:    v_mov_b32_e32 v10, s22
+; SDAG-NEXT:    v_mov_b32_e32 v11, s23
+; SDAG-NEXT:    v_mov_b64_e32 v[6:7], 0
 ; SDAG-NEXT:    s_nop 4
-; SDAG-NEXT:    global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mov_b32_e32 v0, s8
 ; SDAG-NEXT:    v_mov_b32_e32 v1, s9
 ; SDAG-NEXT:    v_mov_b32_e32 v2, s10
 ; SDAG-NEXT:    v_mov_b32_e32 v3, s11
-; SDAG-NEXT:    global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mov_b32_e32 v0, s12
 ; SDAG-NEXT:    v_mov_b32_e32 v1, s13
 ; SDAG-NEXT:    v_mov_b32_e32 v2, s14
 ; SDAG-NEXT:    v_mov_b32_e32 v3, s15
-; SDAG-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -508,15 +507,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
 ; HEURRC:       ; %bb.0:
 ; HEURRC-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
 ; HEURRC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT:    v_mov_b64_e32 v[8:9], 48
-; HEURRC-NEXT:    v_mov_b64_e32 v[10:11], 32
-; HEURRC-NEXT:    v_mov_b64_e32 v[12:13], 16
+; HEURRC-NEXT:    v_mov_b64_e32 v[0:1], 48
+; HEURRC-NEXT:    v_mov_b64_e32 v[2:3], 32
+; HEURRC-NEXT:    v_mov_b64_e32 v[4:5], 16
 ; HEURRC-NEXT:    s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
-; HEURRC-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
-; HEURRC-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
+; HEURRC-NEXT:    v_mov_b64_e32 v[8:9], s[24:25]
+; HEURRC-NEXT:    v_mov_b64_e32 v[10:11], s[26:27]
+; HEURRC-NEXT:    v_mov_b64_e32 v[12:13], s[28:29]
 ; HEURRC-NEXT:    v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
+; HEURRC-NEXT:    v_mov_b64_e32 v[14:15], s[30:31]
 ; HEURRC-NEXT:    v_accvgpr_write_b32 a1, s9
 ; HEURRC-NEXT:    v_accvgpr_write_b32 a2, s10
 ; HEURRC-NEXT:    v_accvgpr_write_b32 a3, s11
@@ -534,40 +533,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
 ; HEURRC-NEXT:    v_accvgpr_write_b32 a15, s23
 ; HEURRC-NEXT:    v_mov_b32_e32 v16, s16
 ; HEURRC-NEXT:    v_mov_b32_e32 v17, s17
-; HEURRC-NEXT:    v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
+; HEURRC-NEXT:    v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15]
 ; HEURRC-NEXT:    v_mov_b32_e32 v18, s18
 ; HEURRC-NEXT:    v_mov_b32_e32 v19, s19
-; HEURRC-NEXT:    v_mov_b32_e32 v0, s20
-; HEURRC-NEXT:    v_mov_b32_e32 v1, s21
-; HEURRC-NEXT:    v_mov_b32_e32 v2, s22
-; HEURRC-NEXT:    v_mov_b32_e32 v3, s23
-; HEURRC-NEXT:    v_mov_b64_e32 v[14:15], 0
+; HEURRC-NEXT:    v_mov_b32_e32 v8, s20
+; HEURRC-NEXT:    v_mov_b32_e32 v9, s21
+; HEURRC-NEXT:    v_mov_b32_e32 v10, s22
+; HEURRC-NEXT:    v_mov_b32_e32 v11, s23
+; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], 0
 ; HEURRC-NEXT:    s_nop 4
-; HEURRC-NEXT:    global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    s_nop 0
 ; HEURRC-NEXT:    v_mov_b32_e32 v0, s8
 ; HEURRC-NEXT:    v_mov_b32_e32 v1, s9
 ; HEURRC-NEXT:    v_mov_b32_e32 v2, s10
 ; HEURRC-NEXT:    v_mov_b32_e32 v3, s11
-; HEURRC-NEXT:    global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
 ; HEURRC-NEXT:    v_mov_b32_e32 v0, s12
 ; HEURRC-NEXT:    v_mov_b32_e32 v1, s13
 ; HEURRC-NEXT:    v_mov_b32_e32 v2, s14
 ; HEURRC-NEXT:    v_mov_b32_e32 v3, s15
-; HEURRC-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_endpgm
 ;
@@ -575,15 +573,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
 ; VGPRRC:       ; %bb.0:
 ; VGPRRC-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
 ; VGPRRC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT:    v_mov_b64_e32 v[40:41], 48
-; VGPRRC-NEXT:    v_mov_b64_e32 v[42:43], 32
-; VGPRRC-NEXT:    v_mov_b64_e32 v[44:45], 16
+; VGPRRC-NEXT:    v_mov_b64_e32 v[32:33], 48
+; VGPRRC-NEXT:    v_mov_b64_e32 v[34:35], 32
+; VGPRRC-NEXT:    v_mov_b64_e32 v[36:37], 16
 ; VGPRRC-NEXT:    s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT:    v_mov_b64_e32 v[34:35], s[26:27]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[32:33], s[24:25]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[38:39], s[30:31]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[42:43], s[26:27]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[40:41], s[24:25]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[46:47], s[30:31]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[36:37], s[28:29]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[44:45], s[28:29]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
@@ -593,40 +591,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[14:15], s[22:23]
 ; VGPRRC-NEXT:    v_mov_b32_e32 v48, s16
 ; VGPRRC-NEXT:    v_mov_b32_e32 v49, s17
-; VGPRRC-NEXT:    v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15]
+; VGPRRC-NEXT:    v_mfma_f32_32x32x16_f16 v[16:31], v[40:43], v[44:47], v[0:15]
 ; VGPRRC-NEXT:    v_mov_b32_e32 v50, s18
 ; VGPRRC-NEXT:    v_mov_b32_e32 v51, s19
-; VGPRRC-NEXT:    v_mov_b64_e32 v[46:47], 0
+; VGPRRC-NEXT:    v_mov_b64_e32 v[38:39], 0
 ; VGPRRC-NEXT:    s_nop 8
-; VGPRRC-NEXT:    global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[32:33], v[28:31], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[34:35], v[24:27], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[20:23], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[16:19], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s20
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s21
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s22
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s23
-; VGPRRC-NEXT:    global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[34:35], v[48:51], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s8
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s9
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s10
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s12
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s13
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s14
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_endpgm
 ; AGPR-LABEL: test_mfma_f32_32x32x16_f16:
@@ -765,15 +763,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
 ; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT:    v_mov_b64_e32 v[8:9], 48
-; SDAG-NEXT:    v_mov_b64_e32 v[10:11], 32
-; SDAG-NEXT:    v_mov_b64_e32 v[12:13], 16
+; SDAG-NEXT:    v_mov_b64_e32 v[0:1], 48
+; SDAG-NEXT:    v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT:    v_mov_b64_e32 v[4:5], 16
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
-; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
-; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
+; SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[24:25]
+; SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[26:27]
+; SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[28:29]
 ; SDAG-NEXT:    v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
+; SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[30:31]
 ; SDAG-NEXT:    v_accvgpr_write_b32 a1, s9
 ; SDAG-NEXT:    v_accvgpr_write_b32 a2, s10
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, s11
@@ -791,40 +789,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
 ; SDAG-NEXT:    v_accvgpr_write_b32 a15, s23
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s16
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s17
-; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
+; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1
 ; SDAG-NEXT:    v_mov_b32_e32 v18, s18
 ; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    v_mov_b32_e32 v0, s20
-; SDAG-NEXT:    v_mov_b32_e32 v1, s21
-; SDAG-NEXT:    v_mov_b32_e32 v2, s22
-; SDAG-NEXT:    v_mov_b32_e32 v3, s23
-; SDAG-NEXT:    v_mov_b64_e32 v[14:15], 0
+; SDAG-NEXT:    v_mov_b32_e32 v8, s20
+; SDAG-NEXT:    v_mov_b32_e32 v9, s21
+; SDAG-NEXT:    v_mov_b32_e32 v10, s22
+; SDAG-NEXT:    v_mov_b32_e32 v11, s23
+; SDAG-NEXT:    v_mov_b64_e32 v[6:7], 0
 ; SDAG-NEXT:    s_nop 4
-; SDAG-NEXT:    global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mov_b32_e32 v0, s8
 ; SDAG-NEXT:    v_mov_b32_e32 v1, s9
 ; SDAG-NEXT:    v_mov_b32_e32 v2, s10
 ; SDAG-NEXT:    v_mov_b32_e32 v3, s11
-; SDAG-NEXT:    global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mov_b32_e32 v0, s12
 ; SDAG-NEXT:    v_mov_b32_e32 v1, s13
 ; SDAG-NEXT:    v_mov_b32_e32 v2, s14
 ; SDAG-NEXT:    v_mov_b32_e32 v3, s15
-; SDAG-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -891,15 +888,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
 ; HEURRC:       ; %bb.0:
 ; HEURRC-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
 ; HEURRC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT:    v_mov_b64_e32 v[8:9], 48
-; HEURRC-NEXT:    v_mov_b64_e32 v[10:11], 32
-; HEURRC-NEXT:    v_mov_b64_e32 v[12:13], 16
+; HEURRC-NEXT:    v_mov_b64_e32 v[0:1], 48
+; HEURRC-NEXT:    v_mov_b64_e32 v[2:3], 32
+; HEURRC-NEXT:    v_mov_b64_e32 v[4:5], 16
 ; HEURRC-NEXT:    s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
-; HEURRC-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
-; HEURRC-NEXT:    v_mov_b64_e32 v[4:5], s[28:29]
+; HEURRC-NEXT:    v_mov_b64_e32 v[8:9], s[24:25]
+; HEURRC-NEXT:    v_mov_b64_e32 v[10:11], s[26:27]
+; HEURRC-NEXT:    v_mov_b64_e32 v[12:13], s[28:29]
 ; HEURRC-NEXT:    v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], s[30:31]
+; HEURRC-NEXT:    v_mov_b64_e32 v[14:15], s[30:31]
 ; HEURRC-NEXT:    v_accvgpr_write_b32 a1, s9
 ; HEURRC-NEXT:    v_accvgpr_write_b32 a2, s10
 ; HEURRC-NEXT:    v_accvgpr_write_b32 a3, s11
@@ -917,40 +914,39 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
 ; HEURRC-NEXT:    v_accvgpr_write_b32 a15, s23
 ; HEURRC-NEXT:    v_mov_b32_e32 v16, s16
 ; HEURRC-NEXT:    v_mov_b32_e32 v17, s17
-; HEURRC-NEXT:    v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
+; HEURRC-NEXT:    v_mfma_f32_32x32x16_f16 a[16:31], v[8:11], v[12:15], a[0:15] cbsz:2 abid:3 blgp:1
 ; HEURRC-NEXT:    v_mov_b32_e32 v18, s18
 ; HEURRC-NEXT:    v_mov_b32_e32 v19, s19
-; HEURRC-NEXT:    v_mov_b32_e32 v0, s20
-; HEURRC-NEXT:    v_mov_b32_e32 v1, s21
-; HEURRC-NEXT:    v_mov_b32_e32 v2, s22
-; HEURRC-NEXT:    v_mov_b32_e32 v3, s23
-; HEURRC-NEXT:    v_mov_b64_e32 v[14:15], 0
+; HEURRC-NEXT:    v_mov_b32_e32 v8, s20
+; HEURRC-NEXT:    v_mov_b32_e32 v9, s21
+; HEURRC-NEXT:    v_mov_b32_e32 v10, s22
+; HEURRC-NEXT:    v_mov_b32_e32 v11, s23
+; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], 0
 ; HEURRC-NEXT:    s_nop 4
-; HEURRC-NEXT:    global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v[0:1], a[28:31], off sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v[2:3], a[24:27], off sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v[4:5], a[20:23], off sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v[6:7], a[16:19], off sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v[2:3], v[16:19], off sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    s_nop 0
 ; HEURRC-NEXT:    v_mov_b32_e32 v0, s8
 ; HEURRC-NEXT:    v_mov_b32_e32 v1, s9
 ; HEURRC-NEXT:    v_mov_b32_e32 v2, s10
 ; HEURRC-NEXT:    v_mov_b32_e32 v3, s11
-; HEURRC-NEXT:    global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
 ; HEURRC-NEXT:    v_mov_b32_e32 v0, s12
 ; HEURRC-NEXT:    v_mov_b32_e32 v1, s13
 ; HEURRC-NEXT:    v_mov_b32_e32 v2, s14
 ; HEURRC-NEXT:    v_mov_b32_e32 v3, s15
-; HEURRC-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_endpgm
 ;
@@ -958,15 +954,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
 ; VGPRRC:       ; %bb.0:
 ; VGPRRC-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
 ; VGPRRC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT:    v_mov_b64_e32 v[40:41], 48
-; VGPRRC-NEXT:    v_mov_b64_e32 v[42:43], 32
-; VGPRRC-NEXT:    v_mov_b64_e32 v[44:45], 16
+; VGPRRC-NEXT:    v_mov_b64_e32 v[32:33], 48
+; VGPRRC-NEXT:    v_mov_b64_e32 v[34:35], 32
+; VGPRRC-NEXT:    v_mov_b64_e32 v[36:37], 16
 ; VGPRRC-NEXT:    s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT:    v_mov_b64_e32 v[34:35], s[26:27]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[32:33], s[24:25]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[38:39], s[30:31]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[42:43], s[26:27]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[40:41], s[24:25]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[46:47], s[30:31]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[36:37], s[28:29]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[44:45], s[28:29]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
@@ -976,40 +972,40 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[14:15], s[22:23]
 ; VGPRRC-NEXT:    v_mov_b32_e32 v48, s16
 ; VGPRRC-NEXT:    v_mov_b32_e32 v49, s17
-; VGPRRC-NEXT:    v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1
+; VGPRRC-NEXT:    v_mfma_f32_32x32x16_f16 v[16:31], v[40:43], v[44:47], v[0:15] cbsz:2 abid:3 blgp:1
 ; VGPRRC-NEXT:    v_mov_b32_e32 v50, s18
 ; VGPRRC-NEXT:    v_mov_b32_e32 v51, s19
-; VGPRRC-NEXT:    v_mov_b64_e32 v[46:47], 0
+; VGPRRC-NEXT:    v_mov_b64_e32 v[38:39], 0
 ; VGPRRC-NEXT:    s_nop 8
-; VGPRRC-NEXT:    global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[32:33], v[28:31], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[34:35], v[24:27], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[20:23], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[16:19], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s20
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s21
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s22
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s23
-; VGPRRC-NEXT:    global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[34:35], v[48:51], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s8
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s9
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s10
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[38:39], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v0, s12
 ; VGPRRC-NEXT:    v_mov_b32_e32 v1, s13
 ; VGPRRC-NEXT:    v_mov_b32_e32 v2, s14
 ; VGPRRC-NEXT:    v_mov_b32_e32 v3, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v[36:37], v[0:3], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_endpgm
 ; AGPR-LABEL: test_mfma_f32_32x32x16_f16__flags:
@@ -1489,13 +1485,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
 ; SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
 ; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT:    v_mov_b32_e32 v44, 0
+; SDAG-NEXT:    v_mov_b32_e32 v36, 0
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[34:35], s[26:27]
-; SDAG-NEXT:    v_mov_b64_e32 v[32:33], s[24:25]
-; SDAG-NEXT:    v_mov_b64_e32 v[38:39], s[30:31]
+; SDAG-NEXT:    v_mov_b64_e32 v[40:41], s[26:27]
+; SDAG-NEXT:    v_mov_b64_e32 v[38:39], s[24:25]
+; SDAG-NEXT:    v_mov_b64_e32 v[44:45], s[30:31]
 ; SDAG-NEXT:    v_mov_b64_e32 v[30:31], s[22:23]
-; SDAG-NEXT:    v_mov_b64_e32 v[36:37], s[28:29]
+; SDAG-NEXT:    v_mov_b64_e32 v[42:43], s[28:29]
 ; SDAG-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
 ; SDAG-NEXT:    v_mov_b64_e32 v[26:27], s[18:19]
 ; SDAG-NEXT:    v_mov_b64_e32 v[24:25], s[16:17]
@@ -1503,41 +1499,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
 ; SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
 ; SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
-; SDAG-NEXT:    v_mov_b32_e32 v40, s20
-; SDAG-NEXT:    v_mov_b32_e32 v41, s21
-; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
-; SDAG-NEXT:    v_mov_b32_e32 v42, s22
-; SDAG-NEXT:    v_mov_b32_e32 v43, s23
-; SDAG-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s20
+; SDAG-NEXT:    v_mov_b32_e32 v33, s21
+; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31]
+; SDAG-NEXT:    v_mov_b32_e32 v34, s22
+; SDAG-NEXT:    v_mov_b32_e32 v35, s23
+; SDAG-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 2
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s16
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v18, s18
 ; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s12
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s13
 ; SDAG-NEXT:    v_mov_b32_e32 v18, s14
 ; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s8
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s9
 ; SDAG-NEXT:    v_mov_b32_e32 v18, s10
 ; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -1592,13 +1588,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
 ; HEURRC-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
 ; HEURRC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; HEURRC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT:    v_mov_b32_e32 v44, 0
+; HEURRC-NEXT:    v_mov_b32_e32 v36, 0
 ; HEURRC-NEXT:    s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT:    v_mov_b64_e32 v[34:35], s[26:27]
-; HEURRC-NEXT:    v_mov_b64_e32 v[32:33], s[24:25]
-; HEURRC-NEXT:    v_mov_b64_e32 v[38:39], s[30:31]
+; HEURRC-NEXT:    v_mov_b64_e32 v[40:41], s[26:27]
+; HEURRC-NEXT:    v_mov_b64_e32 v[38:39], s[24:25]
+; HEURRC-NEXT:    v_mov_b64_e32 v[44:45], s[30:31]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[30:31], s[22:23]
-; HEURRC-NEXT:    v_mov_b64_e32 v[36:37], s[28:29]
+; HEURRC-NEXT:    v_mov_b64_e32 v[42:43], s[28:29]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[26:27], s[18:19]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[24:25], s[16:17]
@@ -1606,41 +1602,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
 ; HEURRC-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
-; HEURRC-NEXT:    v_mov_b32_e32 v40, s20
-; HEURRC-NEXT:    v_mov_b32_e32 v41, s21
-; HEURRC-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
-; HEURRC-NEXT:    v_mov_b32_e32 v42, s22
-; HEURRC-NEXT:    v_mov_b32_e32 v43, s23
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s20
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s21
+; HEURRC-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31]
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s22
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s23
+; HEURRC-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 2
 ; HEURRC-NEXT:    v_mov_b32_e32 v16, s16
 ; HEURRC-NEXT:    v_mov_b32_e32 v17, s17
 ; HEURRC-NEXT:    v_mov_b32_e32 v18, s18
 ; HEURRC-NEXT:    v_mov_b32_e32 v19, s19
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
 ; HEURRC-NEXT:    v_mov_b32_e32 v16, s12
 ; HEURRC-NEXT:    v_mov_b32_e32 v17, s13
 ; HEURRC-NEXT:    v_mov_b32_e32 v18, s14
 ; HEURRC-NEXT:    v_mov_b32_e32 v19, s15
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
 ; HEURRC-NEXT:    v_mov_b32_e32 v16, s8
 ; HEURRC-NEXT:    v_mov_b32_e32 v17, s9
 ; HEURRC-NEXT:    v_mov_b32_e32 v18, s10
 ; HEURRC-NEXT:    v_mov_b32_e32 v19, s11
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_endpgm
 ;
@@ -1649,13 +1645,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
 ; VGPRRC-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
 ; VGPRRC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; VGPRRC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; VGPRRC-NEXT:    v_mov_b32_e32 v44, 0
+; VGPRRC-NEXT:    v_mov_b32_e32 v36, 0
 ; VGPRRC-NEXT:    s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT:    v_mov_b64_e32 v[34:35], s[26:27]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[32:33], s[24:25]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[38:39], s[30:31]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[40:41], s[26:27]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[38:39], s[24:25]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[44:45], s[30:31]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[30:31], s[22:23]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[36:37], s[28:29]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[42:43], s[28:29]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[26:27], s[18:19]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[24:25], s[16:17]
@@ -1663,41 +1659,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
-; VGPRRC-NEXT:    v_mov_b32_e32 v40, s20
-; VGPRRC-NEXT:    v_mov_b32_e32 v41, s21
-; VGPRRC-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
-; VGPRRC-NEXT:    v_mov_b32_e32 v42, s22
-; VGPRRC-NEXT:    v_mov_b32_e32 v43, s23
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s20
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s21
+; VGPRRC-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31]
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s22
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s23
+; VGPRRC-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 2
 ; VGPRRC-NEXT:    v_mov_b32_e32 v16, s16
 ; VGPRRC-NEXT:    v_mov_b32_e32 v17, s17
 ; VGPRRC-NEXT:    v_mov_b32_e32 v18, s18
 ; VGPRRC-NEXT:    v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v16, s12
 ; VGPRRC-NEXT:    v_mov_b32_e32 v17, s13
 ; VGPRRC-NEXT:    v_mov_b32_e32 v18, s14
 ; VGPRRC-NEXT:    v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v16, s8
 ; VGPRRC-NEXT:    v_mov_b32_e32 v17, s9
 ; VGPRRC-NEXT:    v_mov_b32_e32 v18, s10
 ; VGPRRC-NEXT:    v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_endpgm
 ; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd:
@@ -1831,13 +1827,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
 ; SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
 ; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT:    v_mov_b32_e32 v44, 0
+; SDAG-NEXT:    v_mov_b32_e32 v36, 0
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[34:35], s[26:27]
-; SDAG-NEXT:    v_mov_b64_e32 v[32:33], s[24:25]
-; SDAG-NEXT:    v_mov_b64_e32 v[38:39], s[30:31]
+; SDAG-NEXT:    v_mov_b64_e32 v[40:41], s[26:27]
+; SDAG-NEXT:    v_mov_b64_e32 v[38:39], s[24:25]
+; SDAG-NEXT:    v_mov_b64_e32 v[44:45], s[30:31]
 ; SDAG-NEXT:    v_mov_b64_e32 v[30:31], s[22:23]
-; SDAG-NEXT:    v_mov_b64_e32 v[36:37], s[28:29]
+; SDAG-NEXT:    v_mov_b64_e32 v[42:43], s[28:29]
 ; SDAG-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
 ; SDAG-NEXT:    v_mov_b64_e32 v[26:27], s[18:19]
 ; SDAG-NEXT:    v_mov_b64_e32 v[24:25], s[16:17]
@@ -1845,41 +1841,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
 ; SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
 ; SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
-; SDAG-NEXT:    v_mov_b32_e32 v40, s20
-; SDAG-NEXT:    v_mov_b32_e32 v41, s21
-; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; SDAG-NEXT:    v_mov_b32_e32 v42, s22
-; SDAG-NEXT:    v_mov_b32_e32 v43, s23
-; SDAG-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT:    v_mov_b32_e32 v32, s20
+; SDAG-NEXT:    v_mov_b32_e32 v33, s21
+; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31] cbsz:1 abid:2 blgp:3
+; SDAG-NEXT:    v_mov_b32_e32 v34, s22
+; SDAG-NEXT:    v_mov_b32_e32 v35, s23
+; SDAG-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 2
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s16
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v18, s18
 ; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s12
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s13
 ; SDAG-NEXT:    v_mov_b32_e32 v18, s14
 ; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s8
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s9
 ; SDAG-NEXT:    v_mov_b32_e32 v18, s10
 ; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
-; SDAG-NEXT:    global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT:    global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_endpgm
 ;
@@ -1934,13 +1930,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
 ; HEURRC-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
 ; HEURRC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; HEURRC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT:    v_mov_b32_e32 v44, 0
+; HEURRC-NEXT:    v_mov_b32_e32 v36, 0
 ; HEURRC-NEXT:    s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT:    v_mov_b64_e32 v[34:35], s[26:27]
-; HEURRC-NEXT:    v_mov_b64_e32 v[32:33], s[24:25]
-; HEURRC-NEXT:    v_mov_b64_e32 v[38:39], s[30:31]
+; HEURRC-NEXT:    v_mov_b64_e32 v[40:41], s[26:27]
+; HEURRC-NEXT:    v_mov_b64_e32 v[38:39], s[24:25]
+; HEURRC-NEXT:    v_mov_b64_e32 v[44:45], s[30:31]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[30:31], s[22:23]
-; HEURRC-NEXT:    v_mov_b64_e32 v[36:37], s[28:29]
+; HEURRC-NEXT:    v_mov_b64_e32 v[42:43], s[28:29]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[26:27], s[18:19]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[24:25], s[16:17]
@@ -1948,41 +1944,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
 ; HEURRC-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
-; HEURRC-NEXT:    v_mov_b32_e32 v40, s20
-; HEURRC-NEXT:    v_mov_b32_e32 v41, s21
-; HEURRC-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; HEURRC-NEXT:    v_mov_b32_e32 v42, s22
-; HEURRC-NEXT:    v_mov_b32_e32 v43, s23
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s20
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s21
+; HEURRC-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31] cbsz:1 abid:2 blgp:3
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s22
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s23
+; HEURRC-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 2
 ; HEURRC-NEXT:    v_mov_b32_e32 v16, s16
 ; HEURRC-NEXT:    v_mov_b32_e32 v17, s17
 ; HEURRC-NEXT:    v_mov_b32_e32 v18, s18
 ; HEURRC-NEXT:    v_mov_b32_e32 v19, s19
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
 ; HEURRC-NEXT:    v_mov_b32_e32 v16, s12
 ; HEURRC-NEXT:    v_mov_b32_e32 v17, s13
 ; HEURRC-NEXT:    v_mov_b32_e32 v18, s14
 ; HEURRC-NEXT:    v_mov_b32_e32 v19, s15
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_nop 0
 ; HEURRC-NEXT:    v_mov_b32_e32 v16, s8
 ; HEURRC-NEXT:    v_mov_b32_e32 v17, s9
 ; HEURRC-NEXT:    v_mov_b32_e32 v18, s10
 ; HEURRC-NEXT:    v_mov_b32_e32 v19, s11
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
-; HEURRC-NEXT:    global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT:    global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1
 ; HEURRC-NEXT:    s_waitcnt vmcnt(0)
 ; HEURRC-NEXT:    s_endpgm
 ;
@@ -1991,13 +1987,13 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
 ; VGPRRC-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
 ; VGPRRC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; VGPRRC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; VGPRRC-NEXT:    v_mov_b32_e32 v44, 0
+; VGPRRC-NEXT:    v_mov_b32_e32 v36, 0
 ; VGPRRC-NEXT:    s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT:    v_mov_b64_e32 v[34:35], s[26:27]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[32:33], s[24:25]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[38:39], s[30:31]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[40:41], s[26:27]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[38:39], s[24:25]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[44:45], s[30:31]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[30:31], s[22:23]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[36:37], s[28:29]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[42:43], s[28:29]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[26:27], s[18:19]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[24:25], s[16:17]
@@ -2005,41 +2001,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
-; VGPRRC-NEXT:    v_mov_b32_e32 v40, s20
-; VGPRRC-NEXT:    v_mov_b32_e32 v41, s21
-; VGPRRC-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; VGPRRC-NEXT:    v_mov_b32_e32 v42, s22
-; VGPRRC-NEXT:    v_mov_b32_e32 v43, s23
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s20
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s21
+; VGPRRC-NEXT:    v_mfma_f32_32x32x16_f16 v[0:15], v[38:41], v[42:45], v[16:31] cbsz:1 abid:2 blgp:3
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s22
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s23
+; VGPRRC-NEXT:    global_store_dwordx4 v36, v[32:35], s[0:1] offset:48 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 2
 ; VGPRRC-NEXT:    v_mov_b32_e32 v16, s16
 ; VGPRRC-NEXT:    v_mov_b32_e32 v17, s17
 ; VGPRRC-NEXT:    v_mov_b32_e32 v18, s18
 ; VGPRRC-NEXT:    v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v16, s12
 ; VGPRRC-NEXT:    v_mov_b32_e32 v17, s13
 ; VGPRRC-NEXT:    v_mov_b32_e32 v18, s14
 ; VGPRRC-NEXT:    v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] offset:16 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_nop 0
 ; VGPRRC-NEXT:    v_mov_b32_e32 v16, s8
 ; VGPRRC-NEXT:    v_mov_b32_e32 v17, s9
 ; VGPRRC-NEXT:    v_mov_b32_e32 v18, s10
 ; VGPRRC-NEXT:    v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v36, v[16:19], s[0:1] sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v36, v[8:11], s[0:1] offset:32 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v36, v[12:15], s[0:1] offset:48 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v36, v[0:3], s[0:1] sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
-; VGPRRC-NEXT:    global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT:    global_store_dwordx4 v36, v[4:7], s[0:1] offset:16 sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
 ; VGPRRC-NEXT:    s_endpgm
 ; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd__flags:
@@ -5425,18 +5421,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
 ; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v12, 0
+; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; GCN-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; GCN-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; GCN-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[8:9]
+; GCN-NEXT:    v_mov_b64_e32 v[8:9], s[10:11]
+; GCN-NEXT:    v_mov_b64_e32 v[10:11], s[12:13]
+; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GCN-NEXT:    v_mov_b64_e32 v[12:13], s[14:15]
+; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
+; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3]
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    global_store_dwordx4 v12, v[0:3], s[6:7]
+; GCN-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GCN-NEXT:    s_endpgm
 ;
 ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
@@ -5444,18 +5440,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
 ; HEURRC-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; HEURRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; HEURRC-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; HEURRC-NEXT:    v_mov_b32_e32 v12, 0
+; HEURRC-NEXT:    v_mov_b32_e32 v4, 0
 ; HEURRC-NEXT:    s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; HEURRC-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; HEURRC-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; HEURRC-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; HEURRC-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], s[8:9]
+; HEURRC-NEXT:    v_mov_b64_e32 v[8:9], s[10:11]
+; HEURRC-NEXT:    v_mov_b64_e32 v[10:11], s[12:13]
+; HEURRC-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; HEURRC-NEXT:    v_mov_b64_e32 v[12:13], s[14:15]
+; HEURRC-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; HEURRC-NEXT:    s_nop 1
-; HEURRC-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
+; HEURRC-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3]
 ; HEURRC-NEXT:    s_nop 7
-; HEURRC-NEXT:    global_store_dwordx4 v12, v[0:3], s[6:7]
+; HEURRC-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; HEURRC-NEXT:    s_endpgm
 ;
 ; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
@@ -5463,18 +5459,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
 ; VGPRRC-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; VGPRRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; VGPRRC-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; VGPRRC-NEXT:    v_mov_b32_e32 v12, 0
+; VGPRRC-NEXT:    v_mov_b32_e32 v4, 0
 ; VGPRRC-NEXT:    s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[6:7], s[8:9]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], s[10:11]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[10:11], s[12:13]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[12:13], s[14:15]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; VGPRRC-NEXT:    s_nop 1
-; VGPRRC-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
+; VGPRRC-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3]
 ; VGPRRC-NEXT:    s_nop 7
-; VGPRRC-NEXT:    global_store_dwordx4 v12, v[0:3], s[6:7]
+; VGPRRC-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; VGPRRC-NEXT:    s_endpgm
 ; AGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
 ; AGPR:       ; %bb.0:
@@ -5525,18 +5521,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
 ; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v12, 0
+; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; GCN-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; GCN-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; GCN-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[8:9]
+; GCN-NEXT:    v_mov_b64_e32 v[8:9], s[10:11]
+; GCN-NEXT:    v_mov_b64_e32 v[10:11], s[12:13]
+; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GCN-NEXT:    v_mov_b64_e32 v[12:13], s[14:15]
+; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    global_store_dwordx4 v12, v[0:3], s[6:7]
+; GCN-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GCN-NEXT:    s_endpgm
 ;
 ; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
@@ -5544,18 +5540,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
 ; HEURRC-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; HEURRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; HEURRC-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; HEURRC-NEXT:    v_mov_b32_e32 v12, 0
+; HEURRC-NEXT:    v_mov_b32_e32 v4, 0
 ; HEURRC-NEXT:    s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; HEURRC-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; HEURRC-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; HEURRC-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; HEURRC-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], s[8:9]
+; HEURRC-NEXT:    v_mov_b64_e32 v[8:9], s[10:11]
+; HEURRC-NEXT:    v_mov_b64_e32 v[10:11], s[12:13]
+; HEURRC-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; HEURRC-NEXT:    v_mov_b64_e32 v[12:13], s[14:15]
+; HEURRC-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; HEURRC-NEXT:    s_nop 1
-; HEURRC-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
 ; HEURRC-NEXT:    s_nop 7
-; HEURRC-NEXT:    global_store_dwordx4 v12, v[0:3], s[6:7]
+; HEURRC-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; HEURRC-NEXT:    s_endpgm
 ;
 ; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
@@ -5563,18 +5559,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
 ; VGPRRC-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
 ; VGPRRC-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
 ; VGPRRC-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; VGPRRC-NEXT:    v_mov_b32_e32 v12, 0
+; VGPRRC-NEXT:    v_mov_b32_e32 v4, 0
 ; VGPRRC-NEXT:    s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[10:11], s[2:3]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[6:7], s[8:9]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[8:9], s[10:11]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[10:11], s[12:13]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[12:13], s[14:15]
+; VGPRRC-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; VGPRRC-NEXT:    s_nop 1
-; VGPRRC-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; VGPRRC-NEXT:    v_mfma_f32_16x16x32_bf16 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
 ; VGPRRC-NEXT:    s_nop 7
-; VGPRRC-NEXT:    global_store_dwordx4 v12, v[0:3], s[6:7]
+; VGPRRC-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; VGPRRC-NEXT:    s_endpgm
 ; AGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
 ; AGPR:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 6eb9449069a52..ee11b9295a24a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -17,24 +17,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
 ; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
 ; SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; SDAG-NEXT:    v_mov_b32_e32 v16, 0
+; SDAG-NEXT:    v_mov_b32_e32 v4, 0
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
 ; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x44
 ; SDAG-NEXT:    s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[2:3]
-; SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
+; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[2:3]
+; SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[0:1]
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; SDAG-NEXT:    v_mov_b32_e32 v17, s16
+; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[8:9]
+; SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[10:11]
+; SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[12:13]
+; SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[14:15]
+; SDAG-NEXT:    v_mov_b32_e32 v5, s16
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; SDAG-NEXT:    v_smfmac_f32_16x16x64_f16 v[0:3], v[14:17], v[6:13], v5 cbsz:1 abid:2
 ; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7]
+; SDAG-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__vgpr:
@@ -120,30 +120,25 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__sgpr(<8 x half> inreg %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v10, s0
-; SDAG-NEXT:    v_mov_b32_e32 v11, s1
-; SDAG-NEXT:    v_mov_b32_e32 v12, s2
-; SDAG-NEXT:    v_mov_b32_e32 v13, s3
-; SDAG-NEXT:    v_mov_b32_e32 v2, s16
-; SDAG-NEXT:    v_mov_b32_e32 v3, s17
-; SDAG-NEXT:    v_mov_b32_e32 v4, s18
-; SDAG-NEXT:    v_mov_b32_e32 v5, s19
-; SDAG-NEXT:    v_mov_b32_e32 v6, s20
-; SDAG-NEXT:    v_mov_b32_e32 v7, s21
-; SDAG-NEXT:    v_mov_b32_e32 v8, s22
-; SDAG-NEXT:    v_mov_b32_e32 v9, s23
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, s24
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, s25
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, s26
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT:    v_mov_b32_e32 v0, s28
+; SDAG-NEXT:    v_mov_b32_e32 v14, s0
+; SDAG-NEXT:    v_mov_b32_e32 v15, s1
+; SDAG-NEXT:    v_mov_b32_e32 v16, s2
+; SDAG-NEXT:    v_mov_b32_e32 v17, s3
+; SDAG-NEXT:    v_mov_b32_e32 v6, s16
+; SDAG-NEXT:    v_mov_b32_e32 v7, s17
+; SDAG-NEXT:    v_mov_b32_e32 v8, s18
+; SDAG-NEXT:    v_mov_b32_e32 v9, s19
+; SDAG-NEXT:    v_mov_b32_e32 v10, s20
+; SDAG-NEXT:    v_mov_b32_e32 v11, s21
+; SDAG-NEXT:    v_mov_b32_e32 v12, s22
+; SDAG-NEXT:    v_mov_b32_e32 v13, s23
+; SDAG-NEXT:    v_mov_b32_e32 v0, s24
+; SDAG-NEXT:    v_mov_b32_e32 v1, s25
+; SDAG-NEXT:    v_mov_b32_e32 v2, s26
+; SDAG-NEXT:    v_mov_b32_e32 v3, s27
+; SDAG-NEXT:    v_mov_b32_e32 v4, s28
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x64_f16 a[0:3], v[10:13], v[2:9], v0
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_smfmac_f32_16x16x64_f16 v[0:3], v[14:17], v[6:13], v4
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__sgpr:
@@ -187,17 +182,17 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
 ; SDAG-NEXT:    global_load_dwordx4 v[0:3], v16, s[6:7]
 ; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x44
 ; SDAG-NEXT:    s_load_dword s16, s[4:5], 0x64
-; SDAG-NEXT:    v_mov_b64_e32 v[26:27], s[2:3]
-; SDAG-NEXT:    v_mov_b64_e32 v[24:25], s[0:1]
+; SDAG-NEXT:    v_mov_b64_e32 v[28:29], s[2:3]
+; SDAG-NEXT:    v_mov_b64_e32 v[26:27], s[0:1]
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b64_e32 v[22:23], s[14:15]
-; SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
-; SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
-; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
-; SDAG-NEXT:    v_mov_b32_e32 v28, s16
+; SDAG-NEXT:    v_mov_b64_e32 v[24:25], s[14:15]
+; SDAG-NEXT:    v_mov_b64_e32 v[22:23], s[12:13]
+; SDAG-NEXT:    v_mov_b64_e32 v[20:21], s[10:11]
+; SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[8:9]
+; SDAG-NEXT:    v_mov_b32_e32 v16, s16
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
 ; SDAG-NEXT:    v_mov_b32_e32 v16, 0
 ; SDAG-NEXT:    s_nop 10
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
@@ -436,53 +431,37 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0,
 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v36, s0
-; SDAG-NEXT:    v_mov_b32_e32 v37, s1
-; SDAG-NEXT:    v_mov_b32_e32 v38, s2
-; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v13, s25
-; SDAG-NEXT:    v_mov_b32_e32 v14, s26
-; SDAG-NEXT:    v_mov_b32_e32 v15, s27
-; SDAG-NEXT:    v_mov_b32_e32 v16, s28
-; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_mov_b32_e32 v28, s16
-; SDAG-NEXT:    v_mov_b32_e32 v29, s17
-; SDAG-NEXT:    v_mov_b32_e32 v30, s18
-; SDAG-NEXT:    v_mov_b32_e32 v31, s19
-; SDAG-NEXT:    v_mov_b32_e32 v32, s20
-; SDAG-NEXT:    v_mov_b32_e32 v33, s21
-; SDAG-NEXT:    v_mov_b32_e32 v34, s22
-; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v26, s0
+; SDAG-NEXT:    v_mov_b32_e32 v27, s1
+; SDAG-NEXT:    v_mov_b32_e32 v28, s2
+; SDAG-NEXT:    v_mov_b32_e32 v29, s3
+; SDAG-NEXT:    v_mov_b32_e32 v16, v10
+; SDAG-NEXT:    v_mov_b32_e32 v15, v9
+; SDAG-NEXT:    v_mov_b32_e32 v14, v8
+; SDAG-NEXT:    v_mov_b32_e32 v13, v7
+; SDAG-NEXT:    v_mov_b32_e32 v12, v6
+; SDAG-NEXT:    v_mov_b32_e32 v11, v5
+; SDAG-NEXT:    v_mov_b32_e32 v10, v4
+; SDAG-NEXT:    v_mov_b32_e32 v9, v3
+; SDAG-NEXT:    v_mov_b32_e32 v8, v2
+; SDAG-NEXT:    v_mov_b32_e32 v7, v1
+; SDAG-NEXT:    v_mov_b32_e32 v6, v0
+; SDAG-NEXT:    v_mov_b32_e32 v0, s24
+; SDAG-NEXT:    v_mov_b32_e32 v1, s25
+; SDAG-NEXT:    v_mov_b32_e32 v2, s26
+; SDAG-NEXT:    v_mov_b32_e32 v3, s27
+; SDAG-NEXT:    v_mov_b32_e32 v4, s28
+; SDAG-NEXT:    v_mov_b32_e32 v5, s29
+; SDAG-NEXT:    v_mov_b32_e32 v18, s16
+; SDAG-NEXT:    v_mov_b32_e32 v19, s17
+; SDAG-NEXT:    v_mov_b32_e32 v20, s18
+; SDAG-NEXT:    v_mov_b32_e32 v21, s19
+; SDAG-NEXT:    v_mov_b32_e32 v22, s20
+; SDAG-NEXT:    v_mov_b32_e32 v23, s21
+; SDAG-NEXT:    v_mov_b32_e32 v24, s22
+; SDAG-NEXT:    v_mov_b32_e32 v25, s23
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[36:39], v[28:35], v10
-; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[0:15], v[26:29], v[18:25], v16
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
@@ -541,24 +520,24 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1)
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GCN-NEXT:    v_mov_b32_e32 v16, 0
+; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    global_load_dwordx4 v[8:11], v0, s[6:7]
+; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[6:7]
 ; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x44
 ; GCN-NEXT:    s_load_dword s16, s[4:5], 0x64
-; GCN-NEXT:    v_mov_b64_e32 v[14:15], s[2:3]
-; GCN-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
+; GCN-NEXT:    v_mov_b64_e32 v[16:17], s[2:3]
+; GCN-NEXT:    v_mov_b64_e32 v[14:15], s[0:1]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
-; GCN-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
-; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
-; GCN-NEXT:    v_mov_b32_e32 v17, s16
+; GCN-NEXT:    v_mov_b64_e32 v[6:7], s[8:9]
+; GCN-NEXT:    v_mov_b64_e32 v[8:9], s[10:11]
+; GCN-NEXT:    v_mov_b64_e32 v[10:11], s[12:13]
+; GCN-NEXT:    v_mov_b64_e32 v[12:13], s[14:15]
+; GCN-NEXT:    v_mov_b32_e32 v5, s16
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 v[0:3], v[14:17], v[6:13], v5 cbsz:1 abid:2
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7]
+; GCN-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GCN-NEXT:    s_endpgm
 bb:
   %id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -618,30 +597,25 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__sgpr(<8 x bfloat> inreg %arg0
 ; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__sgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v10, s0
-; GCN-NEXT:    v_mov_b32_e32 v11, s1
-; GCN-NEXT:    v_mov_b32_e32 v12, s2
-; GCN-NEXT:    v_mov_b32_e32 v13, s3
-; GCN-NEXT:    v_mov_b32_e32 v2, s16
-; GCN-NEXT:    v_mov_b32_e32 v3, s17
-; GCN-NEXT:    v_mov_b32_e32 v4, s18
-; GCN-NEXT:    v_mov_b32_e32 v5, s19
-; GCN-NEXT:    v_mov_b32_e32 v6, s20
-; GCN-NEXT:    v_mov_b32_e32 v7, s21
-; GCN-NEXT:    v_mov_b32_e32 v8, s22
-; GCN-NEXT:    v_mov_b32_e32 v9, s23
-; GCN-NEXT:    v_accvgpr_write_b32 a0, s24
-; GCN-NEXT:    v_accvgpr_write_b32 a1, s25
-; GCN-NEXT:    v_accvgpr_write_b32 a2, s26
-; GCN-NEXT:    v_accvgpr_write_b32 a3, s27
-; GCN-NEXT:    v_mov_b32_e32 v0, s28
+; GCN-NEXT:    v_mov_b32_e32 v14, s0
+; GCN-NEXT:    v_mov_b32_e32 v15, s1
+; GCN-NEXT:    v_mov_b32_e32 v16, s2
+; GCN-NEXT:    v_mov_b32_e32 v17, s3
+; GCN-NEXT:    v_mov_b32_e32 v6, s16
+; GCN-NEXT:    v_mov_b32_e32 v7, s17
+; GCN-NEXT:    v_mov_b32_e32 v8, s18
+; GCN-NEXT:    v_mov_b32_e32 v9, s19
+; GCN-NEXT:    v_mov_b32_e32 v10, s20
+; GCN-NEXT:    v_mov_b32_e32 v11, s21
+; GCN-NEXT:    v_mov_b32_e32 v12, s22
+; GCN-NEXT:    v_mov_b32_e32 v13, s23
+; GCN-NEXT:    v_mov_b32_e32 v0, s24
+; GCN-NEXT:    v_mov_b32_e32 v1, s25
+; GCN-NEXT:    v_mov_b32_e32 v2, s26
+; GCN-NEXT:    v_mov_b32_e32 v3, s27
+; GCN-NEXT:    v_mov_b32_e32 v4, s28
 ; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 a[0:3], v[10:13], v[2:9], v0
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 v[0:3], v[14:17], v[6:13], v4
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <4 x float> %result
@@ -667,17 +641,17 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1)
 ; GCN-NEXT:    global_load_dwordx4 v[0:3], v16, s[6:7]
 ; GCN-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x44
 ; GCN-NEXT:    s_load_dword s16, s[4:5], 0x64
-; GCN-NEXT:    v_mov_b64_e32 v[26:27], s[2:3]
-; GCN-NEXT:    v_mov_b64_e32 v[24:25], s[0:1]
+; GCN-NEXT:    v_mov_b64_e32 v[28:29], s[2:3]
+; GCN-NEXT:    v_mov_b64_e32 v[26:27], s[0:1]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b64_e32 v[22:23], s[14:15]
-; GCN-NEXT:    v_mov_b64_e32 v[20:21], s[12:13]
-; GCN-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
-; GCN-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
-; GCN-NEXT:    v_mov_b32_e32 v28, s16
+; GCN-NEXT:    v_mov_b64_e32 v[24:25], s[14:15]
+; GCN-NEXT:    v_mov_b64_e32 v[22:23], s[12:13]
+; GCN-NEXT:    v_mov_b64_e32 v[20:21], s[10:11]
+; GCN-NEXT:    v_mov_b64_e32 v[18:19], s[8:9]
+; GCN-NEXT:    v_mov_b32_e32 v16, s16
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[0:15], v[26:29], v[18:25], v16 cbsz:1 abid:2
 ; GCN-NEXT:    v_mov_b32_e32 v16, 0
 ; GCN-NEXT:    s_nop 10
 ; GCN-NEXT:    global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
@@ -779,53 +753,37 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg
 ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__sgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v36, s0
-; GCN-NEXT:    v_mov_b32_e32 v37, s1
-; GCN-NEXT:    v_mov_b32_e32 v38, s2
-; GCN-NEXT:    v_mov_b32_e32 v39, s3
-; GCN-NEXT:    v_mov_b32_e32 v13, s25
-; GCN-NEXT:    v_mov_b32_e32 v14, s26
-; GCN-NEXT:    v_mov_b32_e32 v15, s27
-; GCN-NEXT:    v_mov_b32_e32 v16, s28
-; GCN-NEXT:    v_mov_b32_e32 v17, s29
-; GCN-NEXT:    v_mov_b32_e32 v28, s16
-; GCN-NEXT:    v_mov_b32_e32 v29, s17
-; GCN-NEXT:    v_mov_b32_e32 v30, s18
-; GCN-NEXT:    v_mov_b32_e32 v31, s19
-; GCN-NEXT:    v_mov_b32_e32 v32, s20
-; GCN-NEXT:    v_mov_b32_e32 v33, s21
-; GCN-NEXT:    v_mov_b32_e32 v34, s22
-; GCN-NEXT:    v_mov_b32_e32 v35, s23
-; GCN-NEXT:    v_mov_b32_e32 v12, s24
-; GCN-NEXT:    v_mov_b32_e32 v18, v0
-; GCN-NEXT:    v_mov_b32_e32 v19, v1
-; GCN-NEXT:    v_mov_b32_e32 v20, v2
-; GCN-NEXT:    v_mov_b32_e32 v21, v3
-; GCN-NEXT:    v_mov_b32_e32 v22, v4
-; GCN-NEXT:    v_mov_b32_e32 v23, v5
-; GCN-NEXT:    v_mov_b32_e32 v24, v6
-; GCN-NEXT:    v_mov_b32_e32 v25, v7
-; GCN-NEXT:    v_mov_b32_e32 v26, v8
-; GCN-NEXT:    v_mov_b32_e32 v27, v9
+; GCN-NEXT:    v_mov_b32_e32 v26, s0
+; GCN-NEXT:    v_mov_b32_e32 v27, s1
+; GCN-NEXT:    v_mov_b32_e32 v28, s2
+; GCN-NEXT:    v_mov_b32_e32 v29, s3
+; GCN-NEXT:    v_mov_b32_e32 v16, v10
+; GCN-NEXT:    v_mov_b32_e32 v15, v9
+; GCN-NEXT:    v_mov_b32_e32 v14, v8
+; GCN-NEXT:    v_mov_b32_e32 v13, v7
+; GCN-NEXT:    v_mov_b32_e32 v12, v6
+; GCN-NEXT:    v_mov_b32_e32 v11, v5
+; GCN-NEXT:    v_mov_b32_e32 v10, v4
+; GCN-NEXT:    v_mov_b32_e32 v9, v3
+; GCN-NEXT:    v_mov_b32_e32 v8, v2
+; GCN-NEXT:    v_mov_b32_e32 v7, v1
+; GCN-NEXT:    v_mov_b32_e32 v6, v0
+; GCN-NEXT:    v_mov_b32_e32 v0, s24
+; GCN-NEXT:    v_mov_b32_e32 v1, s25
+; GCN-NEXT:    v_mov_b32_e32 v2, s26
+; GCN-NEXT:    v_mov_b32_e32 v3, s27
+; GCN-NEXT:    v_mov_b32_e32 v4, s28
+; GCN-NEXT:    v_mov_b32_e32 v5, s29
+; GCN-NEXT:    v_mov_b32_e32 v18, s16
+; GCN-NEXT:    v_mov_b32_e32 v19, s17
+; GCN-NEXT:    v_mov_b32_e32 v20, s18
+; GCN-NEXT:    v_mov_b32_e32 v21, s19
+; GCN-NEXT:    v_mov_b32_e32 v22, s20
+; GCN-NEXT:    v_mov_b32_e32 v23, s21
+; GCN-NEXT:    v_mov_b32_e32 v24, s22
+; GCN-NEXT:    v_mov_b32_e32 v25, s23
 ; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[36:39], v[28:35], v10
-; GCN-NEXT:    s_nop 11
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    v_mov_b32_e32 v4, v16
-; GCN-NEXT:    v_mov_b32_e32 v5, v17
-; GCN-NEXT:    v_mov_b32_e32 v6, v18
-; GCN-NEXT:    v_mov_b32_e32 v7, v19
-; GCN-NEXT:    v_mov_b32_e32 v8, v20
-; GCN-NEXT:    v_mov_b32_e32 v9, v21
-; GCN-NEXT:    v_mov_b32_e32 v10, v22
-; GCN-NEXT:    v_mov_b32_e32 v11, v23
-; GCN-NEXT:    v_mov_b32_e32 v12, v24
-; GCN-NEXT:    v_mov_b32_e32 v13, v25
-; GCN-NEXT:    v_mov_b32_e32 v14, v26
-; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[0:15], v[26:29], v[18:25], v16
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
@@ -953,30 +911,25 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x
 ; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v10, s0
-; SDAG-NEXT:    v_mov_b32_e32 v11, s1
-; SDAG-NEXT:    v_mov_b32_e32 v12, s2
-; SDAG-NEXT:    v_mov_b32_e32 v13, s3
-; SDAG-NEXT:    v_mov_b32_e32 v2, s16
-; SDAG-NEXT:    v_mov_b32_e32 v3, s17
-; SDAG-NEXT:    v_mov_b32_e32 v4, s18
-; SDAG-NEXT:    v_mov_b32_e32 v5, s19
-; SDAG-NEXT:    v_mov_b32_e32 v6, s20
-; SDAG-NEXT:    v_mov_b32_e32 v7, s21
-; SDAG-NEXT:    v_mov_b32_e32 v8, s22
-; SDAG-NEXT:    v_mov_b32_e32 v9, s23
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, s24
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, s25
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, s26
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT:    v_mov_b32_e32 v0, s28
+; SDAG-NEXT:    v_mov_b32_e32 v14, s0
+; SDAG-NEXT:    v_mov_b32_e32 v15, s1
+; SDAG-NEXT:    v_mov_b32_e32 v16, s2
+; SDAG-NEXT:    v_mov_b32_e32 v17, s3
+; SDAG-NEXT:    v_mov_b32_e32 v6, s16
+; SDAG-NEXT:    v_mov_b32_e32 v7, s17
+; SDAG-NEXT:    v_mov_b32_e32 v8, s18
+; SDAG-NEXT:    v_mov_b32_e32 v9, s19
+; SDAG-NEXT:    v_mov_b32_e32 v10, s20
+; SDAG-NEXT:    v_mov_b32_e32 v11, s21
+; SDAG-NEXT:    v_mov_b32_e32 v12, s22
+; SDAG-NEXT:    v_mov_b32_e32 v13, s23
+; SDAG-NEXT:    v_mov_b32_e32 v0, s24
+; SDAG-NEXT:    v_mov_b32_e32 v1, s25
+; SDAG-NEXT:    v_mov_b32_e32 v2, s26
+; SDAG-NEXT:    v_mov_b32_e32 v3, s27
+; SDAG-NEXT:    v_mov_b32_e32 v4, s28
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_i32_16x16x128_i8 a[0:3], v[10:13], v[2:9], v0
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_smfmac_i32_16x16x128_i8 v[0:3], v[14:17], v[6:13], v4
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
@@ -1275,53 +1228,37 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x
 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v36, s0
-; SDAG-NEXT:    v_mov_b32_e32 v37, s1
-; SDAG-NEXT:    v_mov_b32_e32 v38, s2
-; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v13, s25
-; SDAG-NEXT:    v_mov_b32_e32 v14, s26
-; SDAG-NEXT:    v_mov_b32_e32 v15, s27
-; SDAG-NEXT:    v_mov_b32_e32 v16, s28
-; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_mov_b32_e32 v28, s16
-; SDAG-NEXT:    v_mov_b32_e32 v29, s17
-; SDAG-NEXT:    v_mov_b32_e32 v30, s18
-; SDAG-NEXT:    v_mov_b32_e32 v31, s19
-; SDAG-NEXT:    v_mov_b32_e32 v32, s20
-; SDAG-NEXT:    v_mov_b32_e32 v33, s21
-; SDAG-NEXT:    v_mov_b32_e32 v34, s22
-; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v26, s0
+; SDAG-NEXT:    v_mov_b32_e32 v27, s1
+; SDAG-NEXT:    v_mov_b32_e32 v28, s2
+; SDAG-NEXT:    v_mov_b32_e32 v29, s3
+; SDAG-NEXT:    v_mov_b32_e32 v16, v10
+; SDAG-NEXT:    v_mov_b32_e32 v15, v9
+; SDAG-NEXT:    v_mov_b32_e32 v14, v8
+; SDAG-NEXT:    v_mov_b32_e32 v13, v7
+; SDAG-NEXT:    v_mov_b32_e32 v12, v6
+; SDAG-NEXT:    v_mov_b32_e32 v11, v5
+; SDAG-NEXT:    v_mov_b32_e32 v10, v4
+; SDAG-NEXT:    v_mov_b32_e32 v9, v3
+; SDAG-NEXT:    v_mov_b32_e32 v8, v2
+; SDAG-NEXT:    v_mov_b32_e32 v7, v1
+; SDAG-NEXT:    v_mov_b32_e32 v6, v0
+; SDAG-NEXT:    v_mov_b32_e32 v0, s24
+; SDAG-NEXT:    v_mov_b32_e32 v1, s25
+; SDAG-NEXT:    v_mov_b32_e32 v2, s26
+; SDAG-NEXT:    v_mov_b32_e32 v3, s27
+; SDAG-NEXT:    v_mov_b32_e32 v4, s28
+; SDAG-NEXT:    v_mov_b32_e32 v5, s29
+; SDAG-NEXT:    v_mov_b32_e32 v18, s16
+; SDAG-NEXT:    v_mov_b32_e32 v19, s17
+; SDAG-NEXT:    v_mov_b32_e32 v20, s18
+; SDAG-NEXT:    v_mov_b32_e32 v21, s19
+; SDAG-NEXT:    v_mov_b32_e32 v22, s20
+; SDAG-NEXT:    v_mov_b32_e32 v23, s21
+; SDAG-NEXT:    v_mov_b32_e32 v24, s22
+; SDAG-NEXT:    v_mov_b32_e32 v25, s23
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[36:39], v[28:35], v10
-; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[0:15], v[26:29], v[18:25], v16
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
@@ -1489,30 +1426,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v10, s0
-; SDAG-NEXT:    v_mov_b32_e32 v11, s1
-; SDAG-NEXT:    v_mov_b32_e32 v12, s2
-; SDAG-NEXT:    v_mov_b32_e32 v13, s3
-; SDAG-NEXT:    v_mov_b32_e32 v2, s16
-; SDAG-NEXT:    v_mov_b32_e32 v3, s17
-; SDAG-NEXT:    v_mov_b32_e32 v4, s18
-; SDAG-NEXT:    v_mov_b32_e32 v5, s19
-; SDAG-NEXT:    v_mov_b32_e32 v6, s20
-; SDAG-NEXT:    v_mov_b32_e32 v7, s21
-; SDAG-NEXT:    v_mov_b32_e32 v8, s22
-; SDAG-NEXT:    v_mov_b32_e32 v9, s23
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, s24
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, s25
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, s26
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT:    v_mov_b32_e32 v0, s28
+; SDAG-NEXT:    v_mov_b32_e32 v14, s0
+; SDAG-NEXT:    v_mov_b32_e32 v15, s1
+; SDAG-NEXT:    v_mov_b32_e32 v16, s2
+; SDAG-NEXT:    v_mov_b32_e32 v17, s3
+; SDAG-NEXT:    v_mov_b32_e32 v6, s16
+; SDAG-NEXT:    v_mov_b32_e32 v7, s17
+; SDAG-NEXT:    v_mov_b32_e32 v8, s18
+; SDAG-NEXT:    v_mov_b32_e32 v9, s19
+; SDAG-NEXT:    v_mov_b32_e32 v10, s20
+; SDAG-NEXT:    v_mov_b32_e32 v11, s21
+; SDAG-NEXT:    v_mov_b32_e32 v12, s22
+; SDAG-NEXT:    v_mov_b32_e32 v13, s23
+; SDAG-NEXT:    v_mov_b32_e32 v0, s24
+; SDAG-NEXT:    v_mov_b32_e32 v1, s25
+; SDAG-NEXT:    v_mov_b32_e32 v2, s26
+; SDAG-NEXT:    v_mov_b32_e32 v3, s27
+; SDAG-NEXT:    v_mov_b32_e32 v4, s28
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[10:13], v[2:9], v0
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[0:3], v[14:17], v[6:13], v4
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr:
@@ -1658,30 +1590,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v10, s0
-; SDAG-NEXT:    v_mov_b32_e32 v11, s1
-; SDAG-NEXT:    v_mov_b32_e32 v12, s2
-; SDAG-NEXT:    v_mov_b32_e32 v13, s3
-; SDAG-NEXT:    v_mov_b32_e32 v2, s16
-; SDAG-NEXT:    v_mov_b32_e32 v3, s17
-; SDAG-NEXT:    v_mov_b32_e32 v4, s18
-; SDAG-NEXT:    v_mov_b32_e32 v5, s19
-; SDAG-NEXT:    v_mov_b32_e32 v6, s20
-; SDAG-NEXT:    v_mov_b32_e32 v7, s21
-; SDAG-NEXT:    v_mov_b32_e32 v8, s22
-; SDAG-NEXT:    v_mov_b32_e32 v9, s23
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, s24
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, s25
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, s26
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT:    v_mov_b32_e32 v0, s28
+; SDAG-NEXT:    v_mov_b32_e32 v14, s0
+; SDAG-NEXT:    v_mov_b32_e32 v15, s1
+; SDAG-NEXT:    v_mov_b32_e32 v16, s2
+; SDAG-NEXT:    v_mov_b32_e32 v17, s3
+; SDAG-NEXT:    v_mov_b32_e32 v6, s16
+; SDAG-NEXT:    v_mov_b32_e32 v7, s17
+; SDAG-NEXT:    v_mov_b32_e32 v8, s18
+; SDAG-NEXT:    v_mov_b32_e32 v9, s19
+; SDAG-NEXT:    v_mov_b32_e32 v10, s20
+; SDAG-NEXT:    v_mov_b32_e32 v11, s21
+; SDAG-NEXT:    v_mov_b32_e32 v12, s22
+; SDAG-NEXT:    v_mov_b32_e32 v13, s23
+; SDAG-NEXT:    v_mov_b32_e32 v0, s24
+; SDAG-NEXT:    v_mov_b32_e32 v1, s25
+; SDAG-NEXT:    v_mov_b32_e32 v2, s26
+; SDAG-NEXT:    v_mov_b32_e32 v3, s27
+; SDAG-NEXT:    v_mov_b32_e32 v4, s28
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[10:13], v[2:9], v0
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[14:17], v[6:13], v4
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
@@ -1827,30 +1754,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v10, s0
-; SDAG-NEXT:    v_mov_b32_e32 v11, s1
-; SDAG-NEXT:    v_mov_b32_e32 v12, s2
-; SDAG-NEXT:    v_mov_b32_e32 v13, s3
-; SDAG-NEXT:    v_mov_b32_e32 v2, s16
-; SDAG-NEXT:    v_mov_b32_e32 v3, s17
-; SDAG-NEXT:    v_mov_b32_e32 v4, s18
-; SDAG-NEXT:    v_mov_b32_e32 v5, s19
-; SDAG-NEXT:    v_mov_b32_e32 v6, s20
-; SDAG-NEXT:    v_mov_b32_e32 v7, s21
-; SDAG-NEXT:    v_mov_b32_e32 v8, s22
-; SDAG-NEXT:    v_mov_b32_e32 v9, s23
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, s24
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, s25
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, s26
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT:    v_mov_b32_e32 v0, s28
+; SDAG-NEXT:    v_mov_b32_e32 v14, s0
+; SDAG-NEXT:    v_mov_b32_e32 v15, s1
+; SDAG-NEXT:    v_mov_b32_e32 v16, s2
+; SDAG-NEXT:    v_mov_b32_e32 v17, s3
+; SDAG-NEXT:    v_mov_b32_e32 v6, s16
+; SDAG-NEXT:    v_mov_b32_e32 v7, s17
+; SDAG-NEXT:    v_mov_b32_e32 v8, s18
+; SDAG-NEXT:    v_mov_b32_e32 v9, s19
+; SDAG-NEXT:    v_mov_b32_e32 v10, s20
+; SDAG-NEXT:    v_mov_b32_e32 v11, s21
+; SDAG-NEXT:    v_mov_b32_e32 v12, s22
+; SDAG-NEXT:    v_mov_b32_e32 v13, s23
+; SDAG-NEXT:    v_mov_b32_e32 v0, s24
+; SDAG-NEXT:    v_mov_b32_e32 v1, s25
+; SDAG-NEXT:    v_mov_b32_e32 v2, s26
+; SDAG-NEXT:    v_mov_b32_e32 v3, s27
+; SDAG-NEXT:    v_mov_b32_e32 v4, s28
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[10:13], v[2:9], v0
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[0:3], v[14:17], v[6:13], v4
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
@@ -1996,30 +1918,25 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v10, s0
-; SDAG-NEXT:    v_mov_b32_e32 v11, s1
-; SDAG-NEXT:    v_mov_b32_e32 v12, s2
-; SDAG-NEXT:    v_mov_b32_e32 v13, s3
-; SDAG-NEXT:    v_mov_b32_e32 v2, s16
-; SDAG-NEXT:    v_mov_b32_e32 v3, s17
-; SDAG-NEXT:    v_mov_b32_e32 v4, s18
-; SDAG-NEXT:    v_mov_b32_e32 v5, s19
-; SDAG-NEXT:    v_mov_b32_e32 v6, s20
-; SDAG-NEXT:    v_mov_b32_e32 v7, s21
-; SDAG-NEXT:    v_mov_b32_e32 v8, s22
-; SDAG-NEXT:    v_mov_b32_e32 v9, s23
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, s24
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, s25
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, s26
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT:    v_mov_b32_e32 v0, s28
+; SDAG-NEXT:    v_mov_b32_e32 v14, s0
+; SDAG-NEXT:    v_mov_b32_e32 v15, s1
+; SDAG-NEXT:    v_mov_b32_e32 v16, s2
+; SDAG-NEXT:    v_mov_b32_e32 v17, s3
+; SDAG-NEXT:    v_mov_b32_e32 v6, s16
+; SDAG-NEXT:    v_mov_b32_e32 v7, s17
+; SDAG-NEXT:    v_mov_b32_e32 v8, s18
+; SDAG-NEXT:    v_mov_b32_e32 v9, s19
+; SDAG-NEXT:    v_mov_b32_e32 v10, s20
+; SDAG-NEXT:    v_mov_b32_e32 v11, s21
+; SDAG-NEXT:    v_mov_b32_e32 v12, s22
+; SDAG-NEXT:    v_mov_b32_e32 v13, s23
+; SDAG-NEXT:    v_mov_b32_e32 v0, s24
+; SDAG-NEXT:    v_mov_b32_e32 v1, s25
+; SDAG-NEXT:    v_mov_b32_e32 v2, s26
+; SDAG-NEXT:    v_mov_b32_e32 v3, s27
+; SDAG-NEXT:    v_mov_b32_e32 v4, s28
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[10:13], v[2:9], v0
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[14:17], v[6:13], v4
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
@@ -2318,53 +2235,37 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v36, s0
-; SDAG-NEXT:    v_mov_b32_e32 v37, s1
-; SDAG-NEXT:    v_mov_b32_e32 v38, s2
-; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v13, s25
-; SDAG-NEXT:    v_mov_b32_e32 v14, s26
-; SDAG-NEXT:    v_mov_b32_e32 v15, s27
-; SDAG-NEXT:    v_mov_b32_e32 v16, s28
-; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_mov_b32_e32 v28, s16
-; SDAG-NEXT:    v_mov_b32_e32 v29, s17
-; SDAG-NEXT:    v_mov_b32_e32 v30, s18
-; SDAG-NEXT:    v_mov_b32_e32 v31, s19
-; SDAG-NEXT:    v_mov_b32_e32 v32, s20
-; SDAG-NEXT:    v_mov_b32_e32 v33, s21
-; SDAG-NEXT:    v_mov_b32_e32 v34, s22
-; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v26, s0
+; SDAG-NEXT:    v_mov_b32_e32 v27, s1
+; SDAG-NEXT:    v_mov_b32_e32 v28, s2
+; SDAG-NEXT:    v_mov_b32_e32 v29, s3
+; SDAG-NEXT:    v_mov_b32_e32 v16, v10
+; SDAG-NEXT:    v_mov_b32_e32 v15, v9
+; SDAG-NEXT:    v_mov_b32_e32 v14, v8
+; SDAG-NEXT:    v_mov_b32_e32 v13, v7
+; SDAG-NEXT:    v_mov_b32_e32 v12, v6
+; SDAG-NEXT:    v_mov_b32_e32 v11, v5
+; SDAG-NEXT:    v_mov_b32_e32 v10, v4
+; SDAG-NEXT:    v_mov_b32_e32 v9, v3
+; SDAG-NEXT:    v_mov_b32_e32 v8, v2
+; SDAG-NEXT:    v_mov_b32_e32 v7, v1
+; SDAG-NEXT:    v_mov_b32_e32 v6, v0
+; SDAG-NEXT:    v_mov_b32_e32 v0, s24
+; SDAG-NEXT:    v_mov_b32_e32 v1, s25
+; SDAG-NEXT:    v_mov_b32_e32 v2, s26
+; SDAG-NEXT:    v_mov_b32_e32 v3, s27
+; SDAG-NEXT:    v_mov_b32_e32 v4, s28
+; SDAG-NEXT:    v_mov_b32_e32 v5, s29
+; SDAG-NEXT:    v_mov_b32_e32 v18, s16
+; SDAG-NEXT:    v_mov_b32_e32 v19, s17
+; SDAG-NEXT:    v_mov_b32_e32 v20, s18
+; SDAG-NEXT:    v_mov_b32_e32 v21, s19
+; SDAG-NEXT:    v_mov_b32_e32 v22, s20
+; SDAG-NEXT:    v_mov_b32_e32 v23, s21
+; SDAG-NEXT:    v_mov_b32_e32 v24, s22
+; SDAG-NEXT:    v_mov_b32_e32 v25, s23
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[36:39], v[28:35], v10
-; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[26:29], v[18:25], v16
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
@@ -2685,53 +2586,37 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v36, s0
-; SDAG-NEXT:    v_mov_b32_e32 v37, s1
-; SDAG-NEXT:    v_mov_b32_e32 v38, s2
-; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v13, s25
-; SDAG-NEXT:    v_mov_b32_e32 v14, s26
-; SDAG-NEXT:    v_mov_b32_e32 v15, s27
-; SDAG-NEXT:    v_mov_b32_e32 v16, s28
-; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_mov_b32_e32 v28, s16
-; SDAG-NEXT:    v_mov_b32_e32 v29, s17
-; SDAG-NEXT:    v_mov_b32_e32 v30, s18
-; SDAG-NEXT:    v_mov_b32_e32 v31, s19
-; SDAG-NEXT:    v_mov_b32_e32 v32, s20
-; SDAG-NEXT:    v_mov_b32_e32 v33, s21
-; SDAG-NEXT:    v_mov_b32_e32 v34, s22
-; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v26, s0
+; SDAG-NEXT:    v_mov_b32_e32 v27, s1
+; SDAG-NEXT:    v_mov_b32_e32 v28, s2
+; SDAG-NEXT:    v_mov_b32_e32 v29, s3
+; SDAG-NEXT:    v_mov_b32_e32 v16, v10
+; SDAG-NEXT:    v_mov_b32_e32 v15, v9
+; SDAG-NEXT:    v_mov_b32_e32 v14, v8
+; SDAG-NEXT:    v_mov_b32_e32 v13, v7
+; SDAG-NEXT:    v_mov_b32_e32 v12, v6
+; SDAG-NEXT:    v_mov_b32_e32 v11, v5
+; SDAG-NEXT:    v_mov_b32_e32 v10, v4
+; SDAG-NEXT:    v_mov_b32_e32 v9, v3
+; SDAG-NEXT:    v_mov_b32_e32 v8, v2
+; SDAG-NEXT:    v_mov_b32_e32 v7, v1
+; SDAG-NEXT:    v_mov_b32_e32 v6, v0
+; SDAG-NEXT:    v_mov_b32_e32 v0, s24
+; SDAG-NEXT:    v_mov_b32_e32 v1, s25
+; SDAG-NEXT:    v_mov_b32_e32 v2, s26
+; SDAG-NEXT:    v_mov_b32_e32 v3, s27
+; SDAG-NEXT:    v_mov_b32_e32 v4, s28
+; SDAG-NEXT:    v_mov_b32_e32 v5, s29
+; SDAG-NEXT:    v_mov_b32_e32 v18, s16
+; SDAG-NEXT:    v_mov_b32_e32 v19, s17
+; SDAG-NEXT:    v_mov_b32_e32 v20, s18
+; SDAG-NEXT:    v_mov_b32_e32 v21, s19
+; SDAG-NEXT:    v_mov_b32_e32 v22, s20
+; SDAG-NEXT:    v_mov_b32_e32 v23, s21
+; SDAG-NEXT:    v_mov_b32_e32 v24, s22
+; SDAG-NEXT:    v_mov_b32_e32 v25, s23
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[36:39], v[28:35], v10
-; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[26:29], v[18:25], v16
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
@@ -3052,53 +2937,37 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v36, s0
-; SDAG-NEXT:    v_mov_b32_e32 v37, s1
-; SDAG-NEXT:    v_mov_b32_e32 v38, s2
-; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v13, s25
-; SDAG-NEXT:    v_mov_b32_e32 v14, s26
-; SDAG-NEXT:    v_mov_b32_e32 v15, s27
-; SDAG-NEXT:    v_mov_b32_e32 v16, s28
-; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_mov_b32_e32 v28, s16
-; SDAG-NEXT:    v_mov_b32_e32 v29, s17
-; SDAG-NEXT:    v_mov_b32_e32 v30, s18
-; SDAG-NEXT:    v_mov_b32_e32 v31, s19
-; SDAG-NEXT:    v_mov_b32_e32 v32, s20
-; SDAG-NEXT:    v_mov_b32_e32 v33, s21
-; SDAG-NEXT:    v_mov_b32_e32 v34, s22
-; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v26, s0
+; SDAG-NEXT:    v_mov_b32_e32 v27, s1
+; SDAG-NEXT:    v_mov_b32_e32 v28, s2
+; SDAG-NEXT:    v_mov_b32_e32 v29, s3
+; SDAG-NEXT:    v_mov_b32_e32 v16, v10
+; SDAG-NEXT:    v_mov_b32_e32 v15, v9
+; SDAG-NEXT:    v_mov_b32_e32 v14, v8
+; SDAG-NEXT:    v_mov_b32_e32 v13, v7
+; SDAG-NEXT:    v_mov_b32_e32 v12, v6
+; SDAG-NEXT:    v_mov_b32_e32 v11, v5
+; SDAG-NEXT:    v_mov_b32_e32 v10, v4
+; SDAG-NEXT:    v_mov_b32_e32 v9, v3
+; SDAG-NEXT:    v_mov_b32_e32 v8, v2
+; SDAG-NEXT:    v_mov_b32_e32 v7, v1
+; SDAG-NEXT:    v_mov_b32_e32 v6, v0
+; SDAG-NEXT:    v_mov_b32_e32 v0, s24
+; SDAG-NEXT:    v_mov_b32_e32 v1, s25
+; SDAG-NEXT:    v_mov_b32_e32 v2, s26
+; SDAG-NEXT:    v_mov_b32_e32 v3, s27
+; SDAG-NEXT:    v_mov_b32_e32 v4, s28
+; SDAG-NEXT:    v_mov_b32_e32 v5, s29
+; SDAG-NEXT:    v_mov_b32_e32 v18, s16
+; SDAG-NEXT:    v_mov_b32_e32 v19, s17
+; SDAG-NEXT:    v_mov_b32_e32 v20, s18
+; SDAG-NEXT:    v_mov_b32_e32 v21, s19
+; SDAG-NEXT:    v_mov_b32_e32 v22, s20
+; SDAG-NEXT:    v_mov_b32_e32 v23, s21
+; SDAG-NEXT:    v_mov_b32_e32 v24, s22
+; SDAG-NEXT:    v_mov_b32_e32 v25, s23
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[36:39], v[28:35], v10
-; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[26:29], v[18:25], v16
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
@@ -3419,53 +3288,37 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v36, s0
-; SDAG-NEXT:    v_mov_b32_e32 v37, s1
-; SDAG-NEXT:    v_mov_b32_e32 v38, s2
-; SDAG-NEXT:    v_mov_b32_e32 v39, s3
-; SDAG-NEXT:    v_mov_b32_e32 v13, s25
-; SDAG-NEXT:    v_mov_b32_e32 v14, s26
-; SDAG-NEXT:    v_mov_b32_e32 v15, s27
-; SDAG-NEXT:    v_mov_b32_e32 v16, s28
-; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_mov_b32_e32 v28, s16
-; SDAG-NEXT:    v_mov_b32_e32 v29, s17
-; SDAG-NEXT:    v_mov_b32_e32 v30, s18
-; SDAG-NEXT:    v_mov_b32_e32 v31, s19
-; SDAG-NEXT:    v_mov_b32_e32 v32, s20
-; SDAG-NEXT:    v_mov_b32_e32 v33, s21
-; SDAG-NEXT:    v_mov_b32_e32 v34, s22
-; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v26, s0
+; SDAG-NEXT:    v_mov_b32_e32 v27, s1
+; SDAG-NEXT:    v_mov_b32_e32 v28, s2
+; SDAG-NEXT:    v_mov_b32_e32 v29, s3
+; SDAG-NEXT:    v_mov_b32_e32 v16, v10
+; SDAG-NEXT:    v_mov_b32_e32 v15, v9
+; SDAG-NEXT:    v_mov_b32_e32 v14, v8
+; SDAG-NEXT:    v_mov_b32_e32 v13, v7
+; SDAG-NEXT:    v_mov_b32_e32 v12, v6
+; SDAG-NEXT:    v_mov_b32_e32 v11, v5
+; SDAG-NEXT:    v_mov_b32_e32 v10, v4
+; SDAG-NEXT:    v_mov_b32_e32 v9, v3
+; SDAG-NEXT:    v_mov_b32_e32 v8, v2
+; SDAG-NEXT:    v_mov_b32_e32 v7, v1
+; SDAG-NEXT:    v_mov_b32_e32 v6, v0
+; SDAG-NEXT:    v_mov_b32_e32 v0, s24
+; SDAG-NEXT:    v_mov_b32_e32 v1, s25
+; SDAG-NEXT:    v_mov_b32_e32 v2, s26
+; SDAG-NEXT:    v_mov_b32_e32 v3, s27
+; SDAG-NEXT:    v_mov_b32_e32 v4, s28
+; SDAG-NEXT:    v_mov_b32_e32 v5, s29
+; SDAG-NEXT:    v_mov_b32_e32 v18, s16
+; SDAG-NEXT:    v_mov_b32_e32 v19, s17
+; SDAG-NEXT:    v_mov_b32_e32 v20, s18
+; SDAG-NEXT:    v_mov_b32_e32 v21, s19
+; SDAG-NEXT:    v_mov_b32_e32 v22, s20
+; SDAG-NEXT:    v_mov_b32_e32 v23, s21
+; SDAG-NEXT:    v_mov_b32_e32 v24, s22
+; SDAG-NEXT:    v_mov_b32_e32 v25, s23
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[36:39], v[28:35], v10
-; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[26:29], v[18:25], v16
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
index b9e9893ede4e2..9a23788f8855a 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
@@ -369,7 +369,7 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 {
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_mov_b32 s0, 0
 ; CHECK-NEXT:    s_mov_b32 s1, s0
-; CHECK-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; CHECK-NEXT:    v_mov_b64_e32 v[28:29], s[0:1]
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
@@ -378,73 +378,66 @@ define amdgpu_kernel void @illegal_mfma_after_rewrite() #1 {
 ; CHECK-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
 ; CHECK-NEXT:    s_mov_b32 s0, 0x3c003c00
 ; CHECK-NEXT:    s_mov_b32 s1, s0
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[8:9], v[4:7]
-; CHECK-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
+; CHECK-NEXT:    v_mov_b64_e32 v[30:31], s[0:1]
 ; CHECK-NEXT:    s_mov_b32 s0, 0x7e007e00
 ; CHECK-NEXT:    s_mov_b32 s1, s0
-; CHECK-NEXT:    v_mov_b64_e32 v[10:11], s[0:1]
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[12:13], v[4:7]
-; CHECK-NEXT:    s_nop 1
-; CHECK-NEXT:    v_accvgpr_write_b32 a0, v0
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[18:21], v[8:9], v[10:11], v[4:7]
-; CHECK-NEXT:    v_accvgpr_write_b32 a1, v1
-; CHECK-NEXT:    v_accvgpr_write_b32 a2, v2
-; CHECK-NEXT:    v_accvgpr_write_b32 a3, v3
+; CHECK-NEXT:    v_accvgpr_write_b32 a0, s0
+; CHECK-NEXT:    v_accvgpr_write_b32 a1, s1
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[4:7]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[30:31], v[4:7]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[12:15], v[28:29], a[0:1], v[4:7]
+; CHECK-NEXT:    s_nop 2
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
 ; CHECK-NEXT:    v_mov_b32_e32 v5, v4
 ; CHECK-NEXT:    v_mov_b32_e32 v6, v4
 ; CHECK-NEXT:    v_mov_b32_e32 v7, v4
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[8:9], v[14:17]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[8:11]
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[22:25], v[8:9], v[8:9], v[4:7]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[4:7]
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def v[4:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[12:13], v[4:7]
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[26:29], v[8:9], v[8:9], v[4:7]
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[8:9], v[8:9], v[0:3]
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[22:25], v[8:9], v[8:9], v[22:25]
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[4:7], v[8:9], v[8:9], v[26:29]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[16:19], v[28:29], v[28:29], v[16:19]
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[24:27], v[28:29], v[30:31], v[4:7]
 ; CHECK-NEXT:    s_nop 5
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v23, v14
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[14:17], v[8:9], v[8:9], v[18:21]
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[12:13], v[8:9], v[0:3]
-; CHECK-NEXT:    s_nop 1
-; CHECK-NEXT:    v_accvgpr_read_b32 v19, a3
-; CHECK-NEXT:    v_accvgpr_read_b32 v18, a2
-; CHECK-NEXT:    v_mov_b64_e32 v[20:21], 0
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    v_accvgpr_read_b32 v17, a1
-; CHECK-NEXT:    v_accvgpr_read_b32 v16, a0
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v15, v22
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[16:19], v[8:9], v[8:9], v[16:19]
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v12, v0
-; CHECK-NEXT:    global_store_short v[20:21], v23, off
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v17, v8
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[12:15]
+; CHECK-NEXT:    s_nop 2
+; CHECK-NEXT:    v_mov_b64_e32 v[12:13], 0
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[28:29], v[28:29], v[0:3]
+; CHECK-NEXT:    global_store_short v[12:13], v17, off
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[10:11], v[8:9], v[4:7]
-; CHECK-NEXT:    global_store_short v[20:21], v15, off
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v9, v16
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[20:23], v[28:29], v[28:29], v[4:7]
+; CHECK-NEXT:    global_store_short v[12:13], v9, off
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v1, v8
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[8:11], v[28:29], v[28:29], v[24:27]
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    global_store_short v[20:21], v14, off
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v14, v16
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v14, v0
+; CHECK-NEXT:    global_store_short v[12:13], v1, off
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[4:7], v[28:29], v[28:29], v[20:23]
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    global_store_short v[20:21], v14, off
-; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CHECK-NEXT:    global_store_short v[12:13], v14, off
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    global_store_short v[20:21], v12, off
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], v[30:31], v[28:29], v[8:11]
+; CHECK-NEXT:    s_nop 6
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v8, v0
+; CHECK-NEXT:    v_mfma_f32_16x16x16_f16 v[0:3], a[0:1], v[28:29], v[4:7]
+; CHECK-NEXT:    global_store_short v[12:13], v8, off
 ; CHECK-NEXT:    buffer_wbl2 sc0 sc1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_inv sc0 sc1
-; CHECK-NEXT:    global_store_short v[20:21], v0, off
+; CHECK-NEXT:    s_nop 2
+; CHECK-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CHECK-NEXT:    global_store_short v[12:13], v0, off
 ; CHECK-NEXT:    s_endpgm
 entry:
   %k0 = call <4 x float> asm sideeffect "; def $0", "=s"()



More information about the llvm-branch-commits mailing list