[llvm] b3d5056 - AMDGPU: Try to perform copy to agpr from reg_sequence at the copy (#129463)

via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 3 23:42:00 PST 2025


Author: Matt Arsenault
Date: 2025-03-04T14:41:56+07:00
New Revision: b3d5056c79b5496fc8751630ddd0ac0071cab0af

URL: https://github.com/llvm/llvm-project/commit/b3d5056c79b5496fc8751630ddd0ac0071cab0af
DIFF: https://github.com/llvm/llvm-project/commit/b3d5056c79b5496fc8751630ddd0ac0071cab0af.diff

LOG: AMDGPU: Try to perform copy to agpr from reg_sequence at the copy (#129463)

SIFoldOperands is frustratingly written in a def-folds-into-use
iteration pattern, with a few random cases starting at the uses.
We were handling this case by looking at the reg_sequence, and finding
the copy. This did not work for the most basic pattern of materializing
a vector constant that started in SGPRs. It just happens there is an
optimization bug in SelectionDAG that produced the expected pattern.

Perform an additional attempt at the fold rooted at the copy. This
mostly shows test improvements. There were some tricky updates to
perform. remaining-virtual-register-operands.ll managed to stop failing
the allocator, so needed to be tricked into failing again. I also do
not understand what schedule-xdl-resource.ll is trying to do for the test
so this changes it to some random output that exists in the debug output.

Added: 
    llvm/test/CodeGen/AMDGPU/coalesces-better.mir
    llvm/test/CodeGen/AMDGPU/coalesces-worse.mir

Modified: 
    llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
    llvm/test/CodeGen/AMDGPU/acc-ldst.ll
    llvm/test/CodeGen/AMDGPU/fold-agpr-phis.mir
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
    llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
    llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
    llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir
    llvm/test/CodeGen/AMDGPU/remaining-virtual-register-operands.ll
    llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll
    llvm/test/CodeGen/AMDGPU/si-fold-operands-agpr-copy-reg-sequence.mir

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index eb9aabf8b6317..3a3f303293461 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1510,76 +1510,128 @@ bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
   // only accept VGPR or inline immediate. Recreate a reg_sequence with its
   // initializers right here, so we will rematerialize immediates and avoid
  // copies via different reg classes.
-  if (!TRI->isAGPR(*MRI, CopyMI->getOperand(0).getReg()))
+  const TargetRegisterClass *DefRC =
+      MRI->getRegClass(CopyMI->getOperand(0).getReg());
+  if (!TRI->isAGPRClass(DefRC))
     return false;
+
   Register UseReg = CopyMI->getOperand(1).getReg();
-  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
-  if (!getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32))
+  MachineInstr *RegSeq = MRI->getVRegDef(UseReg);
+  if (!RegSeq || !RegSeq->isRegSequence())
     return false;
 
   const DebugLoc &DL = CopyMI->getDebugLoc();
   MachineBasicBlock &MBB = *CopyMI->getParent();
 
+  MachineInstrBuilder B(*MBB.getParent(), CopyMI);
+  DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
+  SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenInputs;
+
+  const TargetRegisterClass *UseRC =
+      MRI->getRegClass(CopyMI->getOperand(1).getReg());
+
+  // Value, subregindex for new REG_SEQUENCE
+  SmallVector<std::pair<MachineOperand *, unsigned>, 32> NewDefs;
+
+  unsigned NumRegSeqOperands = RegSeq->getNumOperands();
+  unsigned NumFoldable = 0;
+
+  for (unsigned I = 1; I != NumRegSeqOperands; I += 2) {
+    MachineOperand &RegOp = RegSeq->getOperand(I);
+    unsigned SubRegIdx = RegSeq->getOperand(I + 1).getImm();
+
+    if (RegOp.getSubReg()) {
+      // TODO: Handle subregister compose
+      NewDefs.emplace_back(&RegOp, SubRegIdx);
+      continue;
+    }
+
+    MachineOperand *Lookup = lookUpCopyChain(*TII, *MRI, RegOp.getReg());
+    if (!Lookup)
+      Lookup = &RegOp;
+
+    if (Lookup->isImm()) {
+      // Check if this is an agpr_32 subregister.
+      const TargetRegisterClass *DestSuperRC = TRI->getMatchingSuperRegClass(
+          DefRC, &AMDGPU::AGPR_32RegClass, SubRegIdx);
+      if (DestSuperRC &&
+          TII->isInlineConstant(*Lookup, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
+        ++NumFoldable;
+        NewDefs.emplace_back(Lookup, SubRegIdx);
+        continue;
+      }
+    }
+
+    const TargetRegisterClass *InputRC =
+        Lookup->isReg() ? MRI->getRegClass(Lookup->getReg())
+                        : MRI->getRegClass(RegOp.getReg());
+
+    // TODO: Account for Lookup->getSubReg()
+
+    // If we can't find a matching super class, this is an SGPR->AGPR or
+    // VGPR->AGPR subreg copy (or something constant-like we have to materialize
+    // in the AGPR). We can't directly copy from SGPR to AGPR on gfx908, so we
+    // want to rewrite to copy to an intermediate VGPR class.
+    const TargetRegisterClass *MatchRC =
+        TRI->getMatchingSuperRegClass(DefRC, InputRC, SubRegIdx);
+    if (!MatchRC) {
+      ++NumFoldable;
+      NewDefs.emplace_back(&RegOp, SubRegIdx);
+      continue;
+    }
+
+    NewDefs.emplace_back(&RegOp, SubRegIdx);
+  }
+
+  // Do not clone a reg_sequence and merely change the result register class.
+  if (NumFoldable == 0)
+    return false;
+
   CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
   for (unsigned I = CopyMI->getNumOperands() - 1; I > 0; --I)
     CopyMI->removeOperand(I);
 
-  MachineInstrBuilder B(*MBB.getParent(), CopyMI);
-  DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
-  SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
-  for (unsigned I = 0, NumElts = Defs.size(); I != NumElts; ++I) {
-    MachineOperand *Def = Defs[I].first;
-    TargetInstrInfo::RegSubRegPair CopyToVGPR;
-    if (Def->isImm() &&
-        TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
-      int64_t Imm = Def->getImm();
-
-      auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+  for (auto [Def, DestSubIdx] : NewDefs) {
+    if (!Def->isReg()) {
+      // TODO: Should we use single write for each repeated value like in
+      // register case?
+      Register Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
       BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
-          .addImm(Imm);
+          .add(*Def);
       B.addReg(Tmp);
-    } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
-      auto Src = getRegSubRegPair(*Def);
+    } else {
+      TargetInstrInfo::RegSubRegPair Src = getRegSubRegPair(*Def);
       Def->setIsKill(false);
-      if (!SeenAGPRs.insert(Src)) {
+
+      Register &VGPRCopy = VGPRCopies[Src];
+      if (!VGPRCopy) {
+        const TargetRegisterClass *VGPRUseSubRC =
+            TRI->getSubRegisterClass(UseRC, DestSubIdx);
+
         // We cannot build a reg_sequence out of the same registers, they
         // must be copied. Better do it here before copyPhysReg() created
         // several reads to do the AGPR->VGPR->AGPR copy.
-        CopyToVGPR = Src;
-      } else {
-        B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0, Src.SubReg);
-      }
-    } else {
-      assert(Def->isReg());
-      Def->setIsKill(false);
-      auto Src = getRegSubRegPair(*Def);
 
-      // Direct copy from SGPR to AGPR is not possible. To avoid creation
-      // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
-      // create a copy here and track if we already have such a copy.
-      if (TRI->isSGPRReg(*MRI, Src.Reg)) {
-        CopyToVGPR = Src;
+        // Direct copy from SGPR to AGPR is not possible on gfx908. To avoid
+        // creation of exploded copies SGPR->VGPR->AGPR in the copyPhysReg()
+        // later, create a copy here and track if we already have such a copy.
+        if (TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg) !=
+            VGPRUseSubRC) {
+          VGPRCopy = MRI->createVirtualRegister(VGPRUseSubRC);
+          BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), VGPRCopy).add(*Def);
+          B.addReg(VGPRCopy);
+        } else {
+          // If it is already a VGPR, do not copy the register.
+          B.add(*Def);
+        }
       } else {
-        auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
-        BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
-        B.addReg(Tmp);
+        B.addReg(VGPRCopy);
       }
     }
 
-    if (CopyToVGPR.Reg) {
-      auto [It, Inserted] = VGPRCopies.try_emplace(CopyToVGPR);
-      Register &Vgpr = It->second;
-      if (Inserted) {
-        Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-        BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
-      }
-      Register Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
-      BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Tmp).addReg(Vgpr);
-      B.addReg(Tmp);
-    }
-
-    B.addImm(Defs[I].second);
+    B.addImm(DestSubIdx);
   }
+
   LLVM_DEBUG(dbgs() << "Folded " << *CopyMI);
   return true;
 }
@@ -1634,6 +1686,13 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
       foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI))
     return true;
 
+  // Fold copy to AGPR through reg_sequence
+  // TODO: Handle with subregister extract
+  if (OpToFold.isReg() && MI.isCopy() && !MI.getOperand(1).getSubReg()) {
+    if (foldCopyToAGPRRegSequence(&MI))
+      return true;
+  }
+
   bool Changed = foldInstOperand(MI, OpToFold);
 
   // If we managed to fold all uses of this copy then we might as well

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
index a31064e293622..3f5a99cad9543 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
@@ -16,13 +16,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
 ; GCN-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
 ; GCN-NEXT:    s_mov_b64 s[36:37], 1
 ; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[36:37], s[36:37] op_sel:[0,1]
-; GCN-NEXT:    s_mov_b32 s36, 2
-; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[36:37], s[36:37] op_sel:[0,1]
+; GCN-NEXT:    s_mov_b32 s38, 2
+; GCN-NEXT:    s_mov_b32 s39, s37
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_load_dwordx16 s[0:15], s[34:35], 0x0
 ; GCN-NEXT:    s_load_dwordx16 s[16:31], s[34:35], 0x40
+; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[38:39], s[38:39] op_sel:[0,1]
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
+; GCN-NEXT:    v_accvgpr_write_b32 a16, s16
 ; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
 ; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
@@ -38,7 +40,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
 ; GCN-NEXT:    v_accvgpr_write_b32 a13, s13
 ; GCN-NEXT:    v_accvgpr_write_b32 a14, s14
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, s15
-; GCN-NEXT:    v_accvgpr_write_b32 a16, s16
 ; GCN-NEXT:    v_accvgpr_write_b32 a17, s17
 ; GCN-NEXT:    v_accvgpr_write_b32 a18, s18
 ; GCN-NEXT:    v_accvgpr_write_b32 a19, s19
@@ -317,31 +318,29 @@ bb:
 define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, double %a, double %b) #0 {
 ; GCN-LABEL: test_mfma_f64_16x16x4f64_imm:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
-; GCN-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x34
-; GCN-NEXT:    s_mov_b64 s[0:1], 0
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT:    s_load_dwordx2 s[10:11], s[4:5], 0x34
 ; GCN-NEXT:    s_mov_b64 s[6:7], 1.0
-; GCN-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GCN-NEXT:    s_mov_b64 s[8:9], 0
+; GCN-NEXT:    v_accvgpr_write_b32 a0, s8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
-; GCN-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
-; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
-; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
-; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
-; GCN-NEXT:    v_accvgpr_write_b32 a4, s4
-; GCN-NEXT:    v_accvgpr_write_b32 a5, s5
+; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GCN-NEXT:    v_accvgpr_write_b32 a2, s8
+; GCN-NEXT:    v_accvgpr_write_b32 a4, s8
 ; GCN-NEXT:    v_accvgpr_write_b32 a6, s6
+; GCN-NEXT:    v_accvgpr_write_b32 a1, s9
+; GCN-NEXT:    v_accvgpr_write_b32 a3, s9
+; GCN-NEXT:    v_accvgpr_write_b32 a5, s9
 ; GCN-NEXT:    v_accvgpr_write_b32 a7, s7
-; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
+; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[10:11], s[10:11] op_sel:[0,1]
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[8:9]
-; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
+; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
+; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GCN-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0)
@@ -352,32 +351,29 @@ bb:
 define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %arg, double %a, double %b) #0 {
 ; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
-; GCN-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0x34
-; GCN-NEXT:    s_mov_b32 s0, 0
-; GCN-NEXT:    s_mov_b32 s1, 0x405ec000
-; GCN-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
+; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    s_mov_b32 s7, 0x405ec000
+; GCN-NEXT:    v_accvgpr_write_b32 a0, s6
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
-; GCN-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GCN-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
-; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
-; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
-; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
-; GCN-NEXT:    v_accvgpr_write_b32 a4, s4
-; GCN-NEXT:    v_accvgpr_write_b32 a5, s5
+; GCN-NEXT:    v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
+; GCN-NEXT:    v_accvgpr_write_b32 a2, s6
+; GCN-NEXT:    v_accvgpr_write_b32 a4, s6
 ; GCN-NEXT:    v_accvgpr_write_b32 a6, s6
+; GCN-NEXT:    v_accvgpr_write_b32 a1, s7
+; GCN-NEXT:    v_accvgpr_write_b32 a3, s7
+; GCN-NEXT:    v_accvgpr_write_b32 a5, s7
 ; GCN-NEXT:    v_accvgpr_write_b32 a7, s7
-; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
+; GCN-NEXT:    v_pk_mov_b32 v[2:3], s[8:9], s[8:9] op_sel:[0,1]
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[8:9]
-; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
+; GCN-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
+; GCN-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
 ; GCN-NEXT:    s_endpgm
 bb:
   %mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 123.0, double 123.0, double 123.0, double 123.0>, i32 0, i32 0, i32 0)

diff --git a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
index 5c484e1e52da8..726bfbab7ad48 100644
--- a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
+++ b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
@@ -190,11 +190,11 @@ bb:
 
 ; NB: for atomics both vdata and vdst shall be either VGPR or AGPR
 ; GCN-LABEL: {{^}}test_atomic_mfma_4xi32_atomic_store:
+; GCN: v_accvgpr_write_b32 [[A_ZERO:a[0-9]+]], 0
 ; GCN:     global_atomic_sub [[IN:v[0-9]+]], v{{[0-9:]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}] glc
+; GCN-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, [[A_ZERO]]
+; GCN-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, [[A_ZERO]]
 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, [[IN]]
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
 ; GCN:     v_mfma_i32_4x4x4i8 a[[[N:[0-9]+]]:
 ; GCN:     v_accvgpr_read_b32 [[V:v[0-9]+]], a[[N]]{{$}}
 ; GCN:     global_atomic_add v{{[0-9]+}}, v{{[0-9:]+}}, [[V]], s[{{[0-9:]+}}] glc
@@ -217,7 +217,10 @@ bb:
 
 ; GCN-LABEL: {{^}}test_atomic_mfma_4xi32_atomic64_store:
 ; GCN:         global_atomic_sub_x2 v[{{[0-9:]+}}], v{{[0-9:]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}] glc
-; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_accvgpr_write_b32 [[A_ZERO:a[0-9]+]], 0
+; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, [[A_ZERO]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
 ; GCN:         v_mfma_i32_4x4x4i8 a[[[N:[0-9]+]]:
 ; GCN:         v_accvgpr_read_b32 v{{[0-9]+}}, a{{[0-9]+}}
 ; GCN:         v_accvgpr_read_b32 v{{[0-9]+}}, a{{[0-9]+}}

diff --git a/llvm/test/CodeGen/AMDGPU/coalesces-better.mir b/llvm/test/CodeGen/AMDGPU/coalesces-better.mir
new file mode 100644
index 0000000000000..593220d879c2a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/coalesces-better.mir
@@ -0,0 +1,74 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 -start-after=si-fold-operands -o - %s | FileCheck %s
+
+--- |
+  target triple = "amdgcn-mesa-mesa3d"
+
+  define <4 x float> @test_smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) #0 {
+  ; CHECK-LABEL: test_smfmac_f32_16x16x64_f16:
+  ; CHECK:       ; %bb.0:
+  ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+  ; CHECK-NEXT:    v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16
+  ; CHECK-NEXT:    s_nop 7
+  ; CHECK-NEXT:    v_mov_b32_e32 v0, v12
+  ; CHECK-NEXT:    v_mov_b32_e32 v1, v13
+  ; CHECK-NEXT:    v_mov_b32_e32 v2, v14
+  ; CHECK-NEXT:    v_mov_b32_e32 v3, v15
+  ; CHECK-NEXT:    s_setpc_b64 s[30:31]
+    %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
+    ret <4 x float> %result
+  }
+
+  ; Function Attrs: convergent nocallback nofree nosync nounwind willreturn memory(none)
+  declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half>, <16 x half>, <4 x float>, i32, i32 immarg, i32 immarg) #1
+
+  attributes #0 = { "target-cpu"="gfx950" }
+  attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx950" }
+
+...
+---
+name:            test_smfmac_f32_16x16x64_f16
+tracksRegLiveness: true
+isSSA:           true
+machineFunctionInfo:
+  frameOffsetReg:  '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+  sgprForEXECCopy: '$sgpr100_sgpr101'
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16
+
+
+    %0:vgpr_32 = COPY $vgpr16
+    %1:vgpr_32 = COPY $vgpr15
+    %2:vgpr_32 = COPY $vgpr14
+    %3:vgpr_32 = COPY $vgpr13
+    %4:vgpr_32 = COPY $vgpr12
+    %5:vgpr_32 = COPY $vgpr11
+    %6:vgpr_32 = COPY $vgpr10
+    %7:vgpr_32 = COPY $vgpr9
+    %8:vgpr_32 = COPY $vgpr8
+    %9:vgpr_32 = COPY $vgpr7
+    %10:vgpr_32 = COPY $vgpr6
+    %11:vgpr_32 = COPY $vgpr5
+    %12:vgpr_32 = COPY $vgpr4
+    %13:vgpr_32 = COPY $vgpr3
+    %14:vgpr_32 = COPY $vgpr2
+    %15:vgpr_32 = COPY $vgpr1
+    %16:vgpr_32 = COPY $vgpr0
+    %17:vreg_256_align2 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3, %8, %subreg.sub4, %7, %subreg.sub5, %6, %subreg.sub6, %5, %subreg.sub7
+    %18:vreg_128_align2 = REG_SEQUENCE %4, %subreg.sub0, %3, %subreg.sub1, %2, %subreg.sub2, %1, %subreg.sub3
+    %19:vreg_128_align2 = REG_SEQUENCE %16, %subreg.sub0, %15, %subreg.sub1, %14, %subreg.sub2, %13, %subreg.sub3
+    %24:areg_128_align2 = COPY %18
+    %25:areg_128_align2 = V_SMFMAC_F32_16X16X64_F16_e64 %19, %17, %0, 0, 0, %24, implicit $mode, implicit $exec
+    %26:vgpr_32 = COPY %25.sub0
+    %27:vgpr_32 = COPY %25.sub1
+    %28:vgpr_32 = COPY %25.sub2
+    %29:vgpr_32 = COPY %25.sub3
+    $vgpr0 = COPY %26
+    $vgpr1 = COPY %27
+    $vgpr2 = COPY %28
+    $vgpr3 = COPY %29
+    SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+
+...

diff --git a/llvm/test/CodeGen/AMDGPU/coalesces-worse.mir b/llvm/test/CodeGen/AMDGPU/coalesces-worse.mir
new file mode 100644
index 0000000000000..0718f825fbacb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/coalesces-worse.mir
@@ -0,0 +1,71 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 -start-after=si-fold-operands -o - %s | FileCheck %s
+
+--- |
+  target triple = "amdgcn-mesa-mesa3d"
+
+  define <4 x float> @test_smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) #0 {
+  ; CHECK-LABEL: test_smfmac_f32_16x16x64_f16:
+  ; CHECK:       ; %bb.0:
+  ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+  ; CHECK-NEXT:    v_accvgpr_write_b32 a0, v12
+  ; CHECK-NEXT:    v_accvgpr_write_b32 a1, v13
+  ; CHECK-NEXT:    v_accvgpr_write_b32 a2, v14
+  ; CHECK-NEXT:    v_accvgpr_write_b32 a3, v15
+  ; CHECK-NEXT:    s_nop 1
+  ; CHECK-NEXT:    v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16
+  ; CHECK-NEXT:    s_nop 7
+  ; CHECK-NEXT:    v_accvgpr_read_b32 v0, a0
+  ; CHECK-NEXT:    v_accvgpr_read_b32 v1, a1
+  ; CHECK-NEXT:    v_accvgpr_read_b32 v2, a2
+  ; CHECK-NEXT:    v_accvgpr_read_b32 v3, a3
+  ; CHECK-NEXT:    s_setpc_b64 s[30:31]
+    %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
+    ret <4 x float> %result
+  }
+
+  declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half>, <16 x half>, <4 x float>, i32, i32 immarg, i32 immarg) #1
+
+  attributes #0 = { "target-cpu"="gfx950" }
+  attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx950" }
+
+...
+---
+name:            test_smfmac_f32_16x16x64_f16
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16
+
+    %0:vgpr_32 = COPY $vgpr16
+    %1:vgpr_32 = COPY $vgpr15
+    %2:vgpr_32 = COPY $vgpr14
+    %3:vgpr_32 = COPY $vgpr13
+    %4:vgpr_32 = COPY $vgpr12
+    %5:vgpr_32 = COPY $vgpr11
+    %6:vgpr_32 = COPY $vgpr10
+    %7:vgpr_32 = COPY $vgpr9
+    %8:vgpr_32 = COPY $vgpr8
+    %9:vgpr_32 = COPY $vgpr7
+    %10:vgpr_32 = COPY $vgpr6
+    %11:vgpr_32 = COPY $vgpr5
+    %12:vgpr_32 = COPY $vgpr4
+    %13:vgpr_32 = COPY $vgpr3
+    %14:vgpr_32 = COPY $vgpr2
+    %15:vgpr_32 = COPY $vgpr1
+    %16:vgpr_32 = COPY $vgpr0
+    %17:vreg_256_align2 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3, %8, %subreg.sub4, %7, %subreg.sub5, %6, %subreg.sub6, %5, %subreg.sub7
+    %18:vreg_128_align2 = REG_SEQUENCE %4, %subreg.sub0, %3, %subreg.sub1, %2, %subreg.sub2, %1, %subreg.sub3
+    %19:vreg_128_align2 = REG_SEQUENCE %16, %subreg.sub0, %15, %subreg.sub1, %14, %subreg.sub2, %13, %subreg.sub3
+    %20:areg_128_align2 = V_SMFMAC_F32_16X16X64_F16_e64 %19, %17, %0, 0, 0, %18, implicit $mode, implicit $exec
+    %21:vgpr_32 = COPY %20.sub0
+    %22:vgpr_32 = COPY %20.sub1
+    %23:vgpr_32 = COPY %20.sub2
+    %24:vgpr_32 = COPY %20.sub3
+    $vgpr0 = COPY %21
+    $vgpr1 = COPY %22
+    $vgpr2 = COPY %23
+    $vgpr3 = COPY %24
+    SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+
+...

diff --git a/llvm/test/CodeGen/AMDGPU/fold-agpr-phis.mir b/llvm/test/CodeGen/AMDGPU/fold-agpr-phis.mir
index a9b3eaf4c33a3..7e81b78ac378c 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-agpr-phis.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-agpr-phis.mir
@@ -796,3 +796,39 @@ body:             |
     S_ENDPGM 0
 
 ...
+
+
+
+
+---
+name:            phi_output_reg_type_is_vgpr
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $scc
+    successors: %bb.1
+
+    %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %1:sgpr_32 = S_MOV_B32 0
+    %2:sgpr_128 = REG_SEQUENCE %1, %subreg.sub0, %1, %subreg.sub1, %1, %subreg.sub2, %1, %subreg.sub3
+    %3:vreg_128 = COPY %2
+    %4:sreg_64 = S_MOV_B64 0
+    %5:areg_128_align2 = COPY %3, implicit $exec
+
+  bb.1:
+    liveins: $scc
+    successors: %bb.1, %bb.2
+
+    %9:areg_128_align2 = PHI %5, %bb.0, %10, %bb.1
+    %11:areg_128_align2 = V_MFMA_F32_16X16X4F32_e64 %0:vgpr_32, %0:vgpr_32, %9:areg_128_align2, 0, 0, 0, implicit $mode, implicit $exec
+    %12:vgpr_32 = COPY %11.sub3
+    %13:vgpr_32 = COPY %11.sub2
+    %14:vgpr_32 = COPY %11.sub1
+    %15:vgpr_32 = COPY %11.sub0
+    %10:areg_128_align2 = COPY %11, implicit $exec
+    S_CBRANCH_SCC1 %bb.1, implicit $scc
+
+  bb.2:
+    S_ENDPGM 0
+
+...

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
index 25b857f8f47dd..7cc726a3bd79c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
@@ -1494,10 +1494,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
 ; SDAG-NEXT:    v_mov_b32_e32 v18, s18
 ; SDAG-NEXT:    v_mov_b32_e32 v19, s19
 ; SDAG-NEXT:    v_mov_b32_e32 v20, s28
-; SDAG-NEXT:    v_mov_b32_e32 v23, v1
-; SDAG-NEXT:    v_mov_b32_e32 v22, v0
 ; SDAG-NEXT:    v_mov_b32_e32 v21, s29
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v20
 ; SDAG-NEXT:    v_mov_b32_e32 v4, s20
 ; SDAG-NEXT:    v_mov_b32_e32 v5, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v6, s22
@@ -1506,9 +1503,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
 ; SDAG-NEXT:    v_mov_b32_e32 v9, s25
 ; SDAG-NEXT:    v_mov_b32_e32 v10, s26
 ; SDAG-NEXT:    v_mov_b32_e32 v11, s27
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v20
 ; SDAG-NEXT:    v_accvgpr_write_b32 a1, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v0
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v1
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[12:19], v[4:11], a[0:3], v2, v3 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
@@ -1531,17 +1529,15 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
 ; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[16:17]
 ; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[18:19]
 ; GISEL-NEXT:    v_mov_b32_e32 v20, s28
-; GISEL-NEXT:    v_mov_b32_e32 v22, v0
-; GISEL-NEXT:    v_mov_b32_e32 v23, v1
 ; GISEL-NEXT:    v_mov_b32_e32 v21, s29
 ; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[20:21]
-; GISEL-NEXT:    v_accvgpr_write_b32 a0, v20
 ; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[22:23]
 ; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[24:25]
 ; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v20
 ; GISEL-NEXT:    v_accvgpr_write_b32 a1, v21
-; GISEL-NEXT:    v_accvgpr_write_b32 a2, v22
-; GISEL-NEXT:    v_accvgpr_write_b32 a3, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v0
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v1
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v3 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
@@ -1667,7 +1663,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
 ; SDAG-NEXT:    v_mov_b32_e32 v14, s0
 ; SDAG-NEXT:    v_mov_b32_e32 v15, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s2
@@ -1676,6 +1671,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
 ; SDAG-NEXT:    v_mov_b32_e32 v19, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v20, s18
 ; SDAG-NEXT:    v_mov_b32_e32 v21, s19
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
 ; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
 ; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
@@ -1697,10 +1693,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
 ; GISEL-NEXT:    s_mov_b32 s14, s2
 ; GISEL-NEXT:    s_mov_b32 s15, s3
 ; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
-; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
 ; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[16:17]
 ; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[14:15]
 ; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[12:13]
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
 ; GISEL-NEXT:    v_accvgpr_write_b32 a1, v9
 ; GISEL-NEXT:    v_accvgpr_write_b32 a2, v10
 ; GISEL-NEXT:    v_accvgpr_write_b32 a3, v11
@@ -1817,8 +1813,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
 ; SDAG-NEXT:    s_movk_i32 s0, 0x41
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
 ; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
 ; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
@@ -1857,8 +1853,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
 ; SDAG-NEXT:    s_movk_i32 s0, 0x41
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
 ; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
 ; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
@@ -2365,4 +2361,4 @@ declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8
 declare <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32>, <6 x i32>, <4 x float>, i32 immarg, i32 immarg, i32 immarg, i32, i32 immarg, i32) #1
 
 attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
-attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
\ No newline at end of file
+attributes #1 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index 3d959393a8fa7..dac54c9f85e96 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -3515,56 +3515,42 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
 ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v32, s0
-; SDAG-NEXT:    v_mov_b32_e32 v33, s1
-; SDAG-NEXT:    v_mov_b32_e32 v34, s2
-; SDAG-NEXT:    v_mov_b32_e32 v35, s3
-; SDAG-NEXT:    v_mov_b32_e32 v36, s16
-; SDAG-NEXT:    v_mov_b32_e32 v37, s17
-; SDAG-NEXT:    v_mov_b32_e32 v38, s18
-; SDAG-NEXT:    v_mov_b32_e32 v39, s19
-; SDAG-NEXT:    v_mov_b32_e32 v16, s28
-; SDAG-NEXT:    v_mov_b32_e32 v31, v13
-; SDAG-NEXT:    v_mov_b32_e32 v30, v12
-; SDAG-NEXT:    v_mov_b32_e32 v29, v11
-; SDAG-NEXT:    v_mov_b32_e32 v28, v10
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT:    v_mov_b32_e32 v0, s20
-; SDAG-NEXT:    v_mov_b32_e32 v1, s21
-; SDAG-NEXT:    v_mov_b32_e32 v2, s22
-; SDAG-NEXT:    v_mov_b32_e32 v3, s23
-; SDAG-NEXT:    v_mov_b32_e32 v4, s24
-; SDAG-NEXT:    v_mov_b32_e32 v5, s25
-; SDAG-NEXT:    v_mov_b32_e32 v6, s26
-; SDAG-NEXT:    v_mov_b32_e32 v7, s27
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v31
+; SDAG-NEXT:    v_mov_b32_e32 v24, s0
+; SDAG-NEXT:    v_mov_b32_e32 v25, s1
+; SDAG-NEXT:    v_mov_b32_e32 v26, s2
+; SDAG-NEXT:    v_mov_b32_e32 v27, s3
+; SDAG-NEXT:    v_mov_b32_e32 v28, s16
+; SDAG-NEXT:    v_mov_b32_e32 v29, s17
+; SDAG-NEXT:    v_mov_b32_e32 v30, s18
+; SDAG-NEXT:    v_mov_b32_e32 v31, s19
+; SDAG-NEXT:    v_mov_b32_e32 v32, s28
+; SDAG-NEXT:    v_mov_b32_e32 v33, s29
+; SDAG-NEXT:    v_mov_b32_e32 v16, s20
+; SDAG-NEXT:    v_mov_b32_e32 v17, s21
+; SDAG-NEXT:    v_mov_b32_e32 v18, s22
+; SDAG-NEXT:    v_mov_b32_e32 v19, s23
+; SDAG-NEXT:    v_mov_b32_e32 v20, s24
+; SDAG-NEXT:    v_mov_b32_e32 v21, s25
+; SDAG-NEXT:    v_mov_b32_e32 v22, s26
+; SDAG-NEXT:    v_mov_b32_e32 v23, s27
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v32
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v33
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v0
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v1
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v2
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v3
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v4
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v5
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v6
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v7
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v8
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v10
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v13
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[24:31], v[16:23], a[0:15], v14, v15 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
@@ -3593,48 +3579,34 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
 ; GISEL-NEXT:    s_mov_b32 s13, s1
 ; GISEL-NEXT:    s_mov_b32 s14, s2
 ; GISEL-NEXT:    s_mov_b32 s15, s3
-; GISEL-NEXT:    v_mov_b32_e32 v18, v0
-; GISEL-NEXT:    v_mov_b32_e32 v19, v1
-; GISEL-NEXT:    v_mov_b32_e32 v20, v2
-; GISEL-NEXT:    v_mov_b32_e32 v21, v3
-; GISEL-NEXT:    v_mov_b32_e32 v22, v4
-; GISEL-NEXT:    v_mov_b32_e32 v23, v5
-; GISEL-NEXT:    v_mov_b32_e32 v24, v6
-; GISEL-NEXT:    v_mov_b32_e32 v25, v7
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT:    v_mov_b32_e32 v16, s28
-; GISEL-NEXT:    v_mov_b32_e32 v26, v8
-; GISEL-NEXT:    v_mov_b32_e32 v27, v9
-; GISEL-NEXT:    v_mov_b32_e32 v28, v10
-; GISEL-NEXT:    v_mov_b32_e32 v29, v11
-; GISEL-NEXT:    v_mov_b32_e32 v30, v12
-; GISEL-NEXT:    v_mov_b32_e32 v31, v13
-; GISEL-NEXT:    v_mov_b32_e32 v17, s29
-; GISEL-NEXT:    v_mov_b64_e32 v[38:39], s[26:27]
-; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[20:21]
-; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT:    v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT:    v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT:    v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT:    v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT:    v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT:    v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT:    v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT:    v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT:    v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT:    v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT:    v_accvgpr_write_b32 a14, v30
-; GISEL-NEXT:    v_accvgpr_write_b32 a15, v31
+; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[12:13]
+; GISEL-NEXT:    v_mov_b32_e32 v32, s28
+; GISEL-NEXT:    v_mov_b32_e32 v33, s29
+; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[26:27]
+; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[20:21]
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v32
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v33
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v0
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v1
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v2
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v3
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v4
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v5
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v6
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v7
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v8
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v9
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v11
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v13
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[32:39], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[24:31], a[0:15], v14, v15 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 3
@@ -3871,7 +3843,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
 ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
 ; SDAG-NEXT:    v_mov_b32_e32 v26, s0
 ; SDAG-NEXT:    v_mov_b32_e32 v27, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v28, s2
@@ -3880,6 +3851,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
 ; SDAG-NEXT:    v_mov_b32_e32 v31, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v32, s18
 ; SDAG-NEXT:    v_mov_b32_e32 v33, s19
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
 ; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
 ; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
@@ -3926,10 +3898,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
 ; GISEL-NEXT:    s_mov_b32 s14, s2
 ; GISEL-NEXT:    s_mov_b32 s15, s3
 ; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[18:19]
-; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
 ; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[16:17]
 ; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[14:15]
 ; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[12:13]
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
 ; GISEL-NEXT:    v_accvgpr_write_b32 a1, v9
 ; GISEL-NEXT:    v_accvgpr_write_b32 a2, v10
 ; GISEL-NEXT:    v_accvgpr_write_b32 a3, v11
@@ -3972,93 +3944,47 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
 }
 
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, s0
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, s3
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, s16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, s17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, s18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, s19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, s20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, s21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, s22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, s23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, s24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, s25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, s26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, s27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, s28 op_sel_hi:[0,0,0]
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s12, s0
-; GISEL-NEXT:    s_mov_b32 s13, s1
-; GISEL-NEXT:    s_mov_b32 s14, s2
-; GISEL-NEXT:    s_mov_b32 s15, s3
-; GISEL-NEXT:    v_accvgpr_write_b32 a0, s12
-; GISEL-NEXT:    v_accvgpr_write_b32 a1, s13
-; GISEL-NEXT:    v_accvgpr_write_b32 a2, s14
-; GISEL-NEXT:    v_accvgpr_write_b32 a3, s15
-; GISEL-NEXT:    v_accvgpr_write_b32 a4, s16
-; GISEL-NEXT:    v_accvgpr_write_b32 a5, s17
-; GISEL-NEXT:    v_accvgpr_write_b32 a6, s18
-; GISEL-NEXT:    v_accvgpr_write_b32 a7, s19
-; GISEL-NEXT:    v_accvgpr_write_b32 a8, s20
-; GISEL-NEXT:    v_accvgpr_write_b32 a9, s21
-; GISEL-NEXT:    v_accvgpr_write_b32 a10, s22
-; GISEL-NEXT:    v_accvgpr_write_b32 a11, s23
-; GISEL-NEXT:    v_accvgpr_write_b32 a12, s24
-; GISEL-NEXT:    v_accvgpr_write_b32 a13, s25
-; GISEL-NEXT:    v_accvgpr_write_b32 a14, s26
-; GISEL-NEXT:    v_accvgpr_write_b32 a15, s27
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, s28 op_sel_hi:[0,0,0]
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    s_nop 3
-; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_accvgpr_write_b32 a0, s0
+; GCN-NEXT:    v_accvgpr_write_b32 a1, s1
+; GCN-NEXT:    v_accvgpr_write_b32 a2, s2
+; GCN-NEXT:    v_accvgpr_write_b32 a3, s3
+; GCN-NEXT:    v_accvgpr_write_b32 a4, s16
+; GCN-NEXT:    v_accvgpr_write_b32 a5, s17
+; GCN-NEXT:    v_accvgpr_write_b32 a6, s18
+; GCN-NEXT:    v_accvgpr_write_b32 a7, s19
+; GCN-NEXT:    v_accvgpr_write_b32 a8, s20
+; GCN-NEXT:    v_accvgpr_write_b32 a9, s21
+; GCN-NEXT:    v_accvgpr_write_b32 a10, s22
+; GCN-NEXT:    v_accvgpr_write_b32 a11, s23
+; GCN-NEXT:    v_accvgpr_write_b32 a12, s24
+; GCN-NEXT:    v_accvgpr_write_b32 a13, s25
+; GCN-NEXT:    v_accvgpr_write_b32 a14, s26
+; GCN-NEXT:    v_accvgpr_write_b32 a15, s27
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, s28 op_sel_hi:[0,0,0]
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
+; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
+; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
+; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
+; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
+; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
+; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
+; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
+; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
+; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
+; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
+; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
   ret <16 x float> %result
 }
@@ -4067,48 +3993,42 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
 ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v32, s0
-; SDAG-NEXT:    v_mov_b32_e32 v33, s1
-; SDAG-NEXT:    v_mov_b32_e32 v34, s2
-; SDAG-NEXT:    v_mov_b32_e32 v35, s3
-; SDAG-NEXT:    v_mov_b32_e32 v36, s16
-; SDAG-NEXT:    v_mov_b32_e32 v37, s17
-; SDAG-NEXT:    v_mov_b32_e32 v38, s18
-; SDAG-NEXT:    v_mov_b32_e32 v39, s19
-; SDAG-NEXT:    v_mov_b32_e32 v16, s20
-; SDAG-NEXT:    v_mov_b32_e32 v31, v13
-; SDAG-NEXT:    v_mov_b32_e32 v30, v12
-; SDAG-NEXT:    v_mov_b32_e32 v29, v11
-; SDAG-NEXT:    v_mov_b32_e32 v28, v10
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s21
-; SDAG-NEXT:    v_mov_b32_e32 v18, s22
-; SDAG-NEXT:    v_mov_b32_e32 v19, s23
-; SDAG-NEXT:    v_mov_b32_e32 v20, s24
-; SDAG-NEXT:    v_mov_b32_e32 v21, s25
-; SDAG-NEXT:    v_mov_b32_e32 v22, s26
-; SDAG-NEXT:    v_mov_b32_e32 v23, s27
-; SDAG-NEXT:    v_mov_b32_e32 v24, s28
-; SDAG-NEXT:    v_mov_b32_e32 v25, s29
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v31
+; SDAG-NEXT:    v_mov_b32_e32 v16, s0
+; SDAG-NEXT:    v_mov_b32_e32 v17, s1
+; SDAG-NEXT:    v_mov_b32_e32 v18, s2
+; SDAG-NEXT:    v_mov_b32_e32 v19, s3
+; SDAG-NEXT:    v_mov_b32_e32 v20, s16
+; SDAG-NEXT:    v_mov_b32_e32 v21, s17
+; SDAG-NEXT:    v_mov_b32_e32 v22, s18
+; SDAG-NEXT:    v_mov_b32_e32 v23, s19
+; SDAG-NEXT:    v_mov_b32_e32 v24, s20
+; SDAG-NEXT:    v_mov_b32_e32 v25, s21
+; SDAG-NEXT:    v_mov_b32_e32 v26, s22
+; SDAG-NEXT:    v_mov_b32_e32 v27, s23
+; SDAG-NEXT:    v_mov_b32_e32 v28, s24
+; SDAG-NEXT:    v_mov_b32_e32 v29, s25
+; SDAG-NEXT:    v_mov_b32_e32 v30, s26
+; SDAG-NEXT:    v_mov_b32_e32 v31, s27
+; SDAG-NEXT:    v_mov_b32_e32 v32, s28
+; SDAG-NEXT:    v_mov_b32_e32 v33, s29
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v29
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v30
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v31
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v32
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v33
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v8
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v10
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v13
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
@@ -4137,44 +4057,38 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
 ; GISEL-NEXT:    s_mov_b32 s13, s1
 ; GISEL-NEXT:    s_mov_b32 s14, s2
 ; GISEL-NEXT:    s_mov_b32 s15, s3
-; GISEL-NEXT:    v_mov_b64_e32 v[38:39], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[12:13]
-; GISEL-NEXT:    v_mov_b32_e32 v16, s20
-; GISEL-NEXT:    v_mov_b32_e32 v26, v8
-; GISEL-NEXT:    v_mov_b32_e32 v27, v9
-; GISEL-NEXT:    v_mov_b32_e32 v28, v10
-; GISEL-NEXT:    v_mov_b32_e32 v29, v11
-; GISEL-NEXT:    v_mov_b32_e32 v30, v12
-; GISEL-NEXT:    v_mov_b32_e32 v31, v13
-; GISEL-NEXT:    v_mov_b32_e32 v17, s21
-; GISEL-NEXT:    v_mov_b32_e32 v18, s22
-; GISEL-NEXT:    v_mov_b32_e32 v19, s23
-; GISEL-NEXT:    v_mov_b32_e32 v20, s24
-; GISEL-NEXT:    v_mov_b32_e32 v21, s25
-; GISEL-NEXT:    v_mov_b32_e32 v22, s26
-; GISEL-NEXT:    v_mov_b32_e32 v23, s27
-; GISEL-NEXT:    v_mov_b32_e32 v24, s28
-; GISEL-NEXT:    v_mov_b32_e32 v25, s29
-; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT:    v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT:    v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT:    v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT:    v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT:    v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT:    v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT:    v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT:    v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT:    v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT:    v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT:    v_accvgpr_write_b32 a14, v30
-; GISEL-NEXT:    v_accvgpr_write_b32 a15, v31
+; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[12:13]
+; GISEL-NEXT:    v_mov_b32_e32 v24, s20
+; GISEL-NEXT:    v_mov_b32_e32 v25, s21
+; GISEL-NEXT:    v_mov_b32_e32 v26, s22
+; GISEL-NEXT:    v_mov_b32_e32 v27, s23
+; GISEL-NEXT:    v_mov_b32_e32 v28, s24
+; GISEL-NEXT:    v_mov_b32_e32 v29, s25
+; GISEL-NEXT:    v_mov_b32_e32 v30, s26
+; GISEL-NEXT:    v_mov_b32_e32 v31, s27
+; GISEL-NEXT:    v_mov_b32_e32 v32, s28
+; GISEL-NEXT:    v_mov_b32_e32 v33, s29
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v27
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v28
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v29
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v30
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v31
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v32
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v33
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v8
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v9
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v11
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v13
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 3
@@ -6204,4 +6118,4 @@ declare <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8
 
 attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
 attributes #1 = { "amdgpu-flat-work-group-size"="128,128" }
-attributes #2 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
\ No newline at end of file
+attributes #2 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index de528d7259d7b..77d4aad5f3174 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -72,94 +72,46 @@ bb:
 }
 
 define <4 x float> @test_smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x64_f16:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x64_f16:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x64_f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x64_f16__flags0(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__flags0:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__flags0:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x64_f16__flags0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x64_f16__flags1(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__flags1:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__flags1:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x64_f16__flags1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <4 x float> %result
 }
@@ -200,11 +152,13 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__sgpr(<8 x half> inreg %arg0, <
 ; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[2:3]
 ; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
 ; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
 ; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[18:19]
 ; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[20:21]
 ; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
+; GISEL-NEXT:    v_mov_b32_e32 v0, s24
+; GISEL-NEXT:    v_mov_b32_e32 v1, s25
+; GISEL-NEXT:    v_mov_b32_e32 v2, s26
+; GISEL-NEXT:    v_mov_b32_e32 v3, s27
 ; GISEL-NEXT:    v_mov_b32_e32 v16, s28
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_16x16x64_f16 v[0:3], v[12:15], v[4:11], v16
@@ -298,42 +252,25 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half>
 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16:
@@ -351,14 +288,22 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half>
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    v_mov_b32_e32 v4, v16
+; GISEL-NEXT:    v_mov_b32_e32 v5, v17
+; GISEL-NEXT:    v_mov_b32_e32 v6, v18
+; GISEL-NEXT:    v_mov_b32_e32 v7, v19
+; GISEL-NEXT:    v_mov_b32_e32 v8, v20
+; GISEL-NEXT:    v_mov_b32_e32 v9, v21
+; GISEL-NEXT:    v_mov_b32_e32 v10, v22
+; GISEL-NEXT:    v_mov_b32_e32 v11, v23
+; GISEL-NEXT:    v_mov_b32_e32 v12, v24
+; GISEL-NEXT:    v_mov_b32_e32 v13, v25
+; GISEL-NEXT:    v_mov_b32_e32 v14, v26
+; GISEL-NEXT:    v_mov_b32_e32 v15, v27
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -370,42 +315,25 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16
 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
@@ -423,14 +351,22 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    v_mov_b32_e32 v4, v16
+; GISEL-NEXT:    v_mov_b32_e32 v5, v17
+; GISEL-NEXT:    v_mov_b32_e32 v6, v18
+; GISEL-NEXT:    v_mov_b32_e32 v7, v19
+; GISEL-NEXT:    v_mov_b32_e32 v8, v20
+; GISEL-NEXT:    v_mov_b32_e32 v9, v21
+; GISEL-NEXT:    v_mov_b32_e32 v10, v22
+; GISEL-NEXT:    v_mov_b32_e32 v11, v23
+; GISEL-NEXT:    v_mov_b32_e32 v12, v24
+; GISEL-NEXT:    v_mov_b32_e32 v13, v25
+; GISEL-NEXT:    v_mov_b32_e32 v14, v26
+; GISEL-NEXT:    v_mov_b32_e32 v15, v27
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -442,42 +378,25 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16
 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
@@ -495,14 +414,22 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    v_mov_b32_e32 v4, v16
+; GISEL-NEXT:    v_mov_b32_e32 v5, v17
+; GISEL-NEXT:    v_mov_b32_e32 v6, v18
+; GISEL-NEXT:    v_mov_b32_e32 v7, v19
+; GISEL-NEXT:    v_mov_b32_e32 v8, v20
+; GISEL-NEXT:    v_mov_b32_e32 v9, v21
+; GISEL-NEXT:    v_mov_b32_e32 v10, v22
+; GISEL-NEXT:    v_mov_b32_e32 v11, v23
+; GISEL-NEXT:    v_mov_b32_e32 v12, v24
+; GISEL-NEXT:    v_mov_b32_e32 v13, v25
+; GISEL-NEXT:    v_mov_b32_e32 v14, v26
+; GISEL-NEXT:    v_mov_b32_e32 v15, v27
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -514,108 +441,94 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0,
 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v28, s0
-; SDAG-NEXT:    v_mov_b32_e32 v29, s1
-; SDAG-NEXT:    v_mov_b32_e32 v30, s2
-; SDAG-NEXT:    v_mov_b32_e32 v31, s3
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
+; SDAG-NEXT:    v_mov_b32_e32 v36, s0
+; SDAG-NEXT:    v_mov_b32_e32 v37, s1
+; SDAG-NEXT:    v_mov_b32_e32 v38, s2
+; SDAG-NEXT:    v_mov_b32_e32 v39, s3
 ; SDAG-NEXT:    v_mov_b32_e32 v13, s25
 ; SDAG-NEXT:    v_mov_b32_e32 v14, s26
 ; SDAG-NEXT:    v_mov_b32_e32 v15, s27
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s28
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v0, s16
-; SDAG-NEXT:    v_mov_b32_e32 v1, s17
-; SDAG-NEXT:    v_mov_b32_e32 v2, s18
-; SDAG-NEXT:    v_mov_b32_e32 v3, s19
-; SDAG-NEXT:    v_mov_b32_e32 v4, s20
-; SDAG-NEXT:    v_mov_b32_e32 v5, s21
-; SDAG-NEXT:    v_mov_b32_e32 v6, s22
-; SDAG-NEXT:    v_mov_b32_e32 v7, s23
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    v_mov_b32_e32 v28, s16
+; SDAG-NEXT:    v_mov_b32_e32 v29, s17
+; SDAG-NEXT:    v_mov_b32_e32 v30, s18
+; SDAG-NEXT:    v_mov_b32_e32 v31, s19
+; SDAG-NEXT:    v_mov_b32_e32 v32, s20
+; SDAG-NEXT:    v_mov_b32_e32 v33, s21
+; SDAG-NEXT:    v_mov_b32_e32 v34, s22
+; SDAG-NEXT:    v_mov_b32_e32 v35, s23
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
+; SDAG-NEXT:    v_mov_b32_e32 v18, v0
+; SDAG-NEXT:    v_mov_b32_e32 v19, v1
+; SDAG-NEXT:    v_mov_b32_e32 v20, v2
+; SDAG-NEXT:    v_mov_b32_e32 v21, v3
+; SDAG-NEXT:    v_mov_b32_e32 v22, v4
+; SDAG-NEXT:    v_mov_b32_e32 v23, v5
+; SDAG-NEXT:    v_mov_b32_e32 v24, v6
+; SDAG-NEXT:    v_mov_b32_e32 v25, v7
+; SDAG-NEXT:    v_mov_b32_e32 v26, v8
+; SDAG-NEXT:    v_mov_b32_e32 v27, v9
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 a[0:15], v[28:31], v[0:7], v10
+; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[36:39], v[28:35], v10
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[2:3]
-; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v18, s24
-; GISEL-NEXT:    v_mov_b32_e32 v19, s25
-; GISEL-NEXT:    v_mov_b32_e32 v24, v0
-; GISEL-NEXT:    v_mov_b32_e32 v25, v1
-; GISEL-NEXT:    v_mov_b32_e32 v26, v2
-; GISEL-NEXT:    v_mov_b32_e32 v27, v3
-; GISEL-NEXT:    v_mov_b32_e32 v28, v4
-; GISEL-NEXT:    v_mov_b32_e32 v29, v5
-; GISEL-NEXT:    v_mov_b32_e32 v30, v6
-; GISEL-NEXT:    v_mov_b32_e32 v31, v7
-; GISEL-NEXT:    v_mov_b32_e32 v32, v8
-; GISEL-NEXT:    v_mov_b32_e32 v33, v9
-; GISEL-NEXT:    v_mov_b32_e32 v16, v10
-; GISEL-NEXT:    v_mov_b32_e32 v20, s26
-; GISEL-NEXT:    v_mov_b32_e32 v21, s27
-; GISEL-NEXT:    v_mov_b32_e32 v22, s28
-; GISEL-NEXT:    v_mov_b32_e32 v23, s29
-; GISEL-NEXT:    v_mov_b64_e32 v[54:55], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[52:53], s[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[50:51], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[48:49], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[26:27]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[28:29]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[30:31]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[32:33]
+; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[2:3]
+; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v11, v0
+; GISEL-NEXT:    v_mov_b32_e32 v12, v1
+; GISEL-NEXT:    v_mov_b32_e32 v13, v2
+; GISEL-NEXT:    v_mov_b32_e32 v14, v3
+; GISEL-NEXT:    v_mov_b32_e32 v15, v4
+; GISEL-NEXT:    v_mov_b32_e32 v16, v5
+; GISEL-NEXT:    v_mov_b32_e32 v17, v6
+; GISEL-NEXT:    v_mov_b32_e32 v18, v7
+; GISEL-NEXT:    v_mov_b32_e32 v19, v8
+; GISEL-NEXT:    v_mov_b32_e32 v20, v9
+; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[22:23]
+; GISEL-NEXT:    v_mov_b32_e32 v21, v10
+; GISEL-NEXT:    v_mov_b32_e32 v0, s24
+; GISEL-NEXT:    v_mov_b32_e32 v1, s25
+; GISEL-NEXT:    v_mov_b32_e32 v2, s26
+; GISEL-NEXT:    v_mov_b32_e32 v3, s27
+; GISEL-NEXT:    v_mov_b32_e32 v4, s28
+; GISEL-NEXT:    v_mov_b32_e32 v5, s29
+; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[16:17]
+; GISEL-NEXT:    v_mov_b32_e32 v6, v11
+; GISEL-NEXT:    v_mov_b32_e32 v7, v12
+; GISEL-NEXT:    v_mov_b32_e32 v8, v13
+; GISEL-NEXT:    v_mov_b32_e32 v9, v14
+; GISEL-NEXT:    v_mov_b32_e32 v10, v15
+; GISEL-NEXT:    v_mov_b32_e32 v11, v16
+; GISEL-NEXT:    v_mov_b32_e32 v12, v17
+; GISEL-NEXT:    v_mov_b32_e32 v13, v18
+; GISEL-NEXT:    v_mov_b32_e32 v14, v19
+; GISEL-NEXT:    v_mov_b32_e32 v15, v20
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x32_f16 v[0:15], v[34:37], v[48:55], v16
+; GISEL-NEXT:    v_smfmac_f32_32x32x32_f16 v[0:15], v[30:33], v[22:29], v21
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
@@ -666,17 +579,12 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16(<8 x bfloat> %arg0, <16 x bflo
 ; GCN-LABEL: test_smfmac_f32_16x16x64_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16
+; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <4 x float> %result
@@ -686,17 +594,12 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags0(<8 x bfloat> %arg0, <1
 ; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__flags0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <4 x float> %result
@@ -706,17 +609,12 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags1(<8 x bfloat> %arg0, <1
 ; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__flags1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <4 x float> %result
@@ -807,42 +705,25 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfl
 ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
@@ -852,42 +733,25 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, <
 ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <16 x float> %result
@@ -897,42 +761,25 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, <
 ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <16 x float> %result
@@ -942,70 +789,54 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg
 ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__sgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v28, s0
-; GCN-NEXT:    v_mov_b32_e32 v29, s1
-; GCN-NEXT:    v_mov_b32_e32 v30, s2
-; GCN-NEXT:    v_mov_b32_e32 v31, s3
-; GCN-NEXT:    v_mov_b32_e32 v12, s24
-; GCN-NEXT:    v_mov_b32_e32 v27, v9
-; GCN-NEXT:    v_mov_b32_e32 v26, v8
-; GCN-NEXT:    v_mov_b32_e32 v25, v7
-; GCN-NEXT:    v_mov_b32_e32 v24, v6
-; GCN-NEXT:    v_mov_b32_e32 v23, v5
-; GCN-NEXT:    v_mov_b32_e32 v22, v4
-; GCN-NEXT:    v_mov_b32_e32 v21, v3
-; GCN-NEXT:    v_mov_b32_e32 v20, v2
-; GCN-NEXT:    v_mov_b32_e32 v19, v1
-; GCN-NEXT:    v_mov_b32_e32 v18, v0
+; GCN-NEXT:    v_mov_b32_e32 v36, s0
+; GCN-NEXT:    v_mov_b32_e32 v37, s1
+; GCN-NEXT:    v_mov_b32_e32 v38, s2
+; GCN-NEXT:    v_mov_b32_e32 v39, s3
 ; GCN-NEXT:    v_mov_b32_e32 v13, s25
 ; GCN-NEXT:    v_mov_b32_e32 v14, s26
 ; GCN-NEXT:    v_mov_b32_e32 v15, s27
 ; GCN-NEXT:    v_mov_b32_e32 v16, s28
 ; GCN-NEXT:    v_mov_b32_e32 v17, s29
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_mov_b32_e32 v0, s16
-; GCN-NEXT:    v_mov_b32_e32 v1, s17
-; GCN-NEXT:    v_mov_b32_e32 v2, s18
-; GCN-NEXT:    v_mov_b32_e32 v3, s19
-; GCN-NEXT:    v_mov_b32_e32 v4, s20
-; GCN-NEXT:    v_mov_b32_e32 v5, s21
-; GCN-NEXT:    v_mov_b32_e32 v6, s22
-; GCN-NEXT:    v_mov_b32_e32 v7, s23
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
+; GCN-NEXT:    v_mov_b32_e32 v28, s16
+; GCN-NEXT:    v_mov_b32_e32 v29, s17
+; GCN-NEXT:    v_mov_b32_e32 v30, s18
+; GCN-NEXT:    v_mov_b32_e32 v31, s19
+; GCN-NEXT:    v_mov_b32_e32 v32, s20
+; GCN-NEXT:    v_mov_b32_e32 v33, s21
+; GCN-NEXT:    v_mov_b32_e32 v34, s22
+; GCN-NEXT:    v_mov_b32_e32 v35, s23
+; GCN-NEXT:    v_mov_b32_e32 v12, s24
+; GCN-NEXT:    v_mov_b32_e32 v18, v0
+; GCN-NEXT:    v_mov_b32_e32 v19, v1
+; GCN-NEXT:    v_mov_b32_e32 v20, v2
+; GCN-NEXT:    v_mov_b32_e32 v21, v3
+; GCN-NEXT:    v_mov_b32_e32 v22, v4
+; GCN-NEXT:    v_mov_b32_e32 v23, v5
+; GCN-NEXT:    v_mov_b32_e32 v24, v6
+; GCN-NEXT:    v_mov_b32_e32 v25, v7
+; GCN-NEXT:    v_mov_b32_e32 v26, v8
+; GCN-NEXT:    v_mov_b32_e32 v27, v9
 ; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[28:31], v[0:7], v10
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[36:39], v[28:35], v10
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
@@ -1085,46 +916,73 @@ bb:
 }
 
 define <4 x i32> @test_smfmac_i32_16x16x128_i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_i32_16x16x128_i8:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_i32_16x16x128_i8:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_i32_16x16x128_i8:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <4 x i32> %result
 }
 
 define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__flags0:
+; GCN-LABEL: test_smfmac_i32_16x16x128_i8__flags0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
+  ret <4 x i32> %result
+}
+
+define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) {
+; GCN-LABEL: test_smfmac_i32_16x16x128_i8__flags1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
+  ret <4 x i32> %result
+}
+
+define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x i32> inreg %arg2, i32 inreg %arg3) {
+; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v8, s0
+; SDAG-NEXT:    v_mov_b32_e32 v9, s1
+; SDAG-NEXT:    v_mov_b32_e32 v10, s2
+; SDAG-NEXT:    v_mov_b32_e32 v11, s3
+; SDAG-NEXT:    v_mov_b32_e32 v0, s16
+; SDAG-NEXT:    v_mov_b32_e32 v1, s17
+; SDAG-NEXT:    v_mov_b32_e32 v2, s18
+; SDAG-NEXT:    v_mov_b32_e32 v3, s19
+; SDAG-NEXT:    v_mov_b32_e32 v4, s20
+; SDAG-NEXT:    v_mov_b32_e32 v5, s21
+; SDAG-NEXT:    v_mov_b32_e32 v6, s22
+; SDAG-NEXT:    v_mov_b32_e32 v7, s23
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, s24
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, s25
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, s26
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, s27
+; SDAG-NEXT:    v_mov_b32_e32 v12, s28
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; SDAG-NEXT:    v_smfmac_i32_16x16x128_i8 a[0:3], v[8:11], v[0:7], v12
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
 ; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
@@ -1132,92 +990,19 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags0(<4 x i32> %arg0, <8 x i32
 ; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__flags0:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
-  %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
-  ret <4 x i32> %result
-}
-
-define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__flags1:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__flags1:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
-  %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
-  ret <4 x i32> %result
-}
-
-define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x i32> inreg %arg2, i32 inreg %arg3) {
-; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v8, s0
-; SDAG-NEXT:    v_mov_b32_e32 v9, s1
-; SDAG-NEXT:    v_mov_b32_e32 v10, s2
-; SDAG-NEXT:    v_mov_b32_e32 v11, s3
-; SDAG-NEXT:    v_mov_b32_e32 v0, s16
-; SDAG-NEXT:    v_mov_b32_e32 v1, s17
-; SDAG-NEXT:    v_mov_b32_e32 v2, s18
-; SDAG-NEXT:    v_mov_b32_e32 v3, s19
-; SDAG-NEXT:    v_mov_b32_e32 v4, s20
-; SDAG-NEXT:    v_mov_b32_e32 v5, s21
-; SDAG-NEXT:    v_mov_b32_e32 v6, s22
-; SDAG-NEXT:    v_mov_b32_e32 v7, s23
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, s24
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, s25
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, s26
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, s27
-; SDAG-NEXT:    v_mov_b32_e32 v12, s28
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_i32_16x16x128_i8 a[0:3], v[8:11], v[0:7], v12
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
+; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[2:3]
 ; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
 ; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
 ; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[18:19]
 ; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[20:21]
 ; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
+; GISEL-NEXT:    v_mov_b32_e32 v0, s24
+; GISEL-NEXT:    v_mov_b32_e32 v1, s25
+; GISEL-NEXT:    v_mov_b32_e32 v2, s26
+; GISEL-NEXT:    v_mov_b32_e32 v3, s27
 ; GISEL-NEXT:    v_mov_b32_e32 v16, s28
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_i32_16x16x128_i8 v[0:3], v[12:15], v[4:11], v16
@@ -1317,42 +1102,25 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1,
 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8:
@@ -1370,14 +1138,22 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1,
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    v_mov_b32_e32 v4, v16
+; GISEL-NEXT:    v_mov_b32_e32 v5, v17
+; GISEL-NEXT:    v_mov_b32_e32 v6, v18
+; GISEL-NEXT:    v_mov_b32_e32 v7, v19
+; GISEL-NEXT:    v_mov_b32_e32 v8, v20
+; GISEL-NEXT:    v_mov_b32_e32 v9, v21
+; GISEL-NEXT:    v_mov_b32_e32 v10, v22
+; GISEL-NEXT:    v_mov_b32_e32 v11, v23
+; GISEL-NEXT:    v_mov_b32_e32 v12, v24
+; GISEL-NEXT:    v_mov_b32_e32 v13, v25
+; GISEL-NEXT:    v_mov_b32_e32 v14, v26
+; GISEL-NEXT:    v_mov_b32_e32 v15, v27
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1389,42 +1165,25 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32
 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
@@ -1442,14 +1201,22 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    v_mov_b32_e32 v4, v16
+; GISEL-NEXT:    v_mov_b32_e32 v5, v17
+; GISEL-NEXT:    v_mov_b32_e32 v6, v18
+; GISEL-NEXT:    v_mov_b32_e32 v7, v19
+; GISEL-NEXT:    v_mov_b32_e32 v8, v20
+; GISEL-NEXT:    v_mov_b32_e32 v9, v21
+; GISEL-NEXT:    v_mov_b32_e32 v10, v22
+; GISEL-NEXT:    v_mov_b32_e32 v11, v23
+; GISEL-NEXT:    v_mov_b32_e32 v12, v24
+; GISEL-NEXT:    v_mov_b32_e32 v13, v25
+; GISEL-NEXT:    v_mov_b32_e32 v14, v26
+; GISEL-NEXT:    v_mov_b32_e32 v15, v27
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1461,42 +1228,25 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32
 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
@@ -1514,14 +1264,22 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    v_mov_b32_e32 v4, v16
+; GISEL-NEXT:    v_mov_b32_e32 v5, v17
+; GISEL-NEXT:    v_mov_b32_e32 v6, v18
+; GISEL-NEXT:    v_mov_b32_e32 v7, v19
+; GISEL-NEXT:    v_mov_b32_e32 v8, v20
+; GISEL-NEXT:    v_mov_b32_e32 v9, v21
+; GISEL-NEXT:    v_mov_b32_e32 v10, v22
+; GISEL-NEXT:    v_mov_b32_e32 v11, v23
+; GISEL-NEXT:    v_mov_b32_e32 v12, v24
+; GISEL-NEXT:    v_mov_b32_e32 v13, v25
+; GISEL-NEXT:    v_mov_b32_e32 v14, v26
+; GISEL-NEXT:    v_mov_b32_e32 v15, v27
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1533,108 +1291,94 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x
 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v28, s0
-; SDAG-NEXT:    v_mov_b32_e32 v29, s1
-; SDAG-NEXT:    v_mov_b32_e32 v30, s2
-; SDAG-NEXT:    v_mov_b32_e32 v31, s3
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
+; SDAG-NEXT:    v_mov_b32_e32 v36, s0
+; SDAG-NEXT:    v_mov_b32_e32 v37, s1
+; SDAG-NEXT:    v_mov_b32_e32 v38, s2
+; SDAG-NEXT:    v_mov_b32_e32 v39, s3
 ; SDAG-NEXT:    v_mov_b32_e32 v13, s25
 ; SDAG-NEXT:    v_mov_b32_e32 v14, s26
 ; SDAG-NEXT:    v_mov_b32_e32 v15, s27
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s28
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v0, s16
-; SDAG-NEXT:    v_mov_b32_e32 v1, s17
-; SDAG-NEXT:    v_mov_b32_e32 v2, s18
-; SDAG-NEXT:    v_mov_b32_e32 v3, s19
-; SDAG-NEXT:    v_mov_b32_e32 v4, s20
-; SDAG-NEXT:    v_mov_b32_e32 v5, s21
-; SDAG-NEXT:    v_mov_b32_e32 v6, s22
-; SDAG-NEXT:    v_mov_b32_e32 v7, s23
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    v_mov_b32_e32 v28, s16
+; SDAG-NEXT:    v_mov_b32_e32 v29, s17
+; SDAG-NEXT:    v_mov_b32_e32 v30, s18
+; SDAG-NEXT:    v_mov_b32_e32 v31, s19
+; SDAG-NEXT:    v_mov_b32_e32 v32, s20
+; SDAG-NEXT:    v_mov_b32_e32 v33, s21
+; SDAG-NEXT:    v_mov_b32_e32 v34, s22
+; SDAG-NEXT:    v_mov_b32_e32 v35, s23
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
+; SDAG-NEXT:    v_mov_b32_e32 v18, v0
+; SDAG-NEXT:    v_mov_b32_e32 v19, v1
+; SDAG-NEXT:    v_mov_b32_e32 v20, v2
+; SDAG-NEXT:    v_mov_b32_e32 v21, v3
+; SDAG-NEXT:    v_mov_b32_e32 v22, v4
+; SDAG-NEXT:    v_mov_b32_e32 v23, v5
+; SDAG-NEXT:    v_mov_b32_e32 v24, v6
+; SDAG-NEXT:    v_mov_b32_e32 v25, v7
+; SDAG-NEXT:    v_mov_b32_e32 v26, v8
+; SDAG-NEXT:    v_mov_b32_e32 v27, v9
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 a[0:15], v[28:31], v[0:7], v10
+; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[36:39], v[28:35], v10
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[2:3]
-; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v18, s24
-; GISEL-NEXT:    v_mov_b32_e32 v19, s25
-; GISEL-NEXT:    v_mov_b32_e32 v24, v0
-; GISEL-NEXT:    v_mov_b32_e32 v25, v1
-; GISEL-NEXT:    v_mov_b32_e32 v26, v2
-; GISEL-NEXT:    v_mov_b32_e32 v27, v3
-; GISEL-NEXT:    v_mov_b32_e32 v28, v4
-; GISEL-NEXT:    v_mov_b32_e32 v29, v5
-; GISEL-NEXT:    v_mov_b32_e32 v30, v6
-; GISEL-NEXT:    v_mov_b32_e32 v31, v7
-; GISEL-NEXT:    v_mov_b32_e32 v32, v8
-; GISEL-NEXT:    v_mov_b32_e32 v33, v9
-; GISEL-NEXT:    v_mov_b32_e32 v16, v10
-; GISEL-NEXT:    v_mov_b32_e32 v20, s26
-; GISEL-NEXT:    v_mov_b32_e32 v21, s27
-; GISEL-NEXT:    v_mov_b32_e32 v22, s28
-; GISEL-NEXT:    v_mov_b32_e32 v23, s29
-; GISEL-NEXT:    v_mov_b64_e32 v[54:55], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[52:53], s[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[50:51], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[48:49], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[26:27]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[28:29]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[30:31]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[32:33]
+; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[2:3]
+; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v11, v0
+; GISEL-NEXT:    v_mov_b32_e32 v12, v1
+; GISEL-NEXT:    v_mov_b32_e32 v13, v2
+; GISEL-NEXT:    v_mov_b32_e32 v14, v3
+; GISEL-NEXT:    v_mov_b32_e32 v15, v4
+; GISEL-NEXT:    v_mov_b32_e32 v16, v5
+; GISEL-NEXT:    v_mov_b32_e32 v17, v6
+; GISEL-NEXT:    v_mov_b32_e32 v18, v7
+; GISEL-NEXT:    v_mov_b32_e32 v19, v8
+; GISEL-NEXT:    v_mov_b32_e32 v20, v9
+; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[22:23]
+; GISEL-NEXT:    v_mov_b32_e32 v21, v10
+; GISEL-NEXT:    v_mov_b32_e32 v0, s24
+; GISEL-NEXT:    v_mov_b32_e32 v1, s25
+; GISEL-NEXT:    v_mov_b32_e32 v2, s26
+; GISEL-NEXT:    v_mov_b32_e32 v3, s27
+; GISEL-NEXT:    v_mov_b32_e32 v4, s28
+; GISEL-NEXT:    v_mov_b32_e32 v5, s29
+; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[16:17]
+; GISEL-NEXT:    v_mov_b32_e32 v6, v11
+; GISEL-NEXT:    v_mov_b32_e32 v7, v12
+; GISEL-NEXT:    v_mov_b32_e32 v8, v13
+; GISEL-NEXT:    v_mov_b32_e32 v9, v14
+; GISEL-NEXT:    v_mov_b32_e32 v10, v15
+; GISEL-NEXT:    v_mov_b32_e32 v11, v16
+; GISEL-NEXT:    v_mov_b32_e32 v12, v17
+; GISEL-NEXT:    v_mov_b32_e32 v13, v18
+; GISEL-NEXT:    v_mov_b32_e32 v14, v19
+; GISEL-NEXT:    v_mov_b32_e32 v15, v20
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_i32_32x32x64_i8 v[0:15], v[34:37], v[48:55], v16
+; GISEL-NEXT:    v_smfmac_i32_32x32x64_i8 v[0:15], v[30:33], v[22:29], v21
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x i32> %result
@@ -1714,94 +1458,46 @@ bb:
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <4 x float> %result
 }
@@ -1842,11 +1538,13 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg
 ; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[2:3]
 ; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
 ; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
 ; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[18:19]
 ; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[20:21]
 ; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
+; GISEL-NEXT:    v_mov_b32_e32 v0, s24
+; GISEL-NEXT:    v_mov_b32_e32 v1, s25
+; GISEL-NEXT:    v_mov_b32_e32 v2, s26
+; GISEL-NEXT:    v_mov_b32_e32 v3, s27
 ; GISEL-NEXT:    v_mov_b32_e32 v16, s28
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[0:3], v[12:15], v[4:11], v16
@@ -1929,94 +1627,46 @@ bb:
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <4 x float> %result
 }
@@ -2057,11 +1707,13 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg
 ; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[2:3]
 ; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
 ; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
 ; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[18:19]
 ; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[20:21]
 ; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
+; GISEL-NEXT:    v_mov_b32_e32 v0, s24
+; GISEL-NEXT:    v_mov_b32_e32 v1, s25
+; GISEL-NEXT:    v_mov_b32_e32 v2, s26
+; GISEL-NEXT:    v_mov_b32_e32 v3, s27
 ; GISEL-NEXT:    v_mov_b32_e32 v16, s28
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[12:15], v[4:11], v16
@@ -2144,94 +1796,46 @@ bb:
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <4 x float> %result
 }
@@ -2272,11 +1876,13 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg
 ; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[2:3]
 ; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
 ; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
 ; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[18:19]
 ; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[20:21]
 ; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
+; GISEL-NEXT:    v_mov_b32_e32 v0, s24
+; GISEL-NEXT:    v_mov_b32_e32 v1, s25
+; GISEL-NEXT:    v_mov_b32_e32 v2, s26
+; GISEL-NEXT:    v_mov_b32_e32 v3, s27
 ; GISEL-NEXT:    v_mov_b32_e32 v16, s28
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[0:3], v[12:15], v[4:11], v16
@@ -2359,94 +1965,46 @@ bb:
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
-  ret <4 x float> %result
-}
-
-define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  ret <4 x float> %result
+}
+
+define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
+; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <4 x float> %result
 }
@@ -2487,11 +2045,13 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__sgpr(<4 x i32> inreg %arg
 ; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[2:3]
 ; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
 ; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
 ; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[18:19]
 ; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[20:21]
 ; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[26:27]
+; GISEL-NEXT:    v_mov_b32_e32 v0, s24
+; GISEL-NEXT:    v_mov_b32_e32 v1, s25
+; GISEL-NEXT:    v_mov_b32_e32 v2, s26
+; GISEL-NEXT:    v_mov_b32_e32 v3, s27
 ; GISEL-NEXT:    v_mov_b32_e32 v16, s28
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[12:15], v[4:11], v16
@@ -2591,42 +2151,25 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32>
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
@@ -2644,14 +2187,22 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32>
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    v_mov_b32_e32 v4, v16
+; GISEL-NEXT:    v_mov_b32_e32 v5, v17
+; GISEL-NEXT:    v_mov_b32_e32 v6, v18
+; GISEL-NEXT:    v_mov_b32_e32 v7, v19
+; GISEL-NEXT:    v_mov_b32_e32 v8, v20
+; GISEL-NEXT:    v_mov_b32_e32 v9, v21
+; GISEL-NEXT:    v_mov_b32_e32 v10, v22
+; GISEL-NEXT:    v_mov_b32_e32 v11, v23
+; GISEL-NEXT:    v_mov_b32_e32 v12, v24
+; GISEL-NEXT:    v_mov_b32_e32 v13, v25
+; GISEL-NEXT:    v_mov_b32_e32 v14, v26
+; GISEL-NEXT:    v_mov_b32_e32 v15, v27
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2663,42 +2214,25 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
@@ -2716,14 +2250,22 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, <
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    v_mov_b32_e32 v4, v16
+; GISEL-NEXT:    v_mov_b32_e32 v5, v17
+; GISEL-NEXT:    v_mov_b32_e32 v6, v18
+; GISEL-NEXT:    v_mov_b32_e32 v7, v19
+; GISEL-NEXT:    v_mov_b32_e32 v8, v20
+; GISEL-NEXT:    v_mov_b32_e32 v9, v21
+; GISEL-NEXT:    v_mov_b32_e32 v10, v22
+; GISEL-NEXT:    v_mov_b32_e32 v11, v23
+; GISEL-NEXT:    v_mov_b32_e32 v12, v24
+; GISEL-NEXT:    v_mov_b32_e32 v13, v25
+; GISEL-NEXT:    v_mov_b32_e32 v14, v26
+; GISEL-NEXT:    v_mov_b32_e32 v15, v27
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2735,42 +2277,25 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
@@ -2788,14 +2313,22 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, <
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    v_mov_b32_e32 v4, v16
+; GISEL-NEXT:    v_mov_b32_e32 v5, v17
+; GISEL-NEXT:    v_mov_b32_e32 v6, v18
+; GISEL-NEXT:    v_mov_b32_e32 v7, v19
+; GISEL-NEXT:    v_mov_b32_e32 v8, v20
+; GISEL-NEXT:    v_mov_b32_e32 v9, v21
+; GISEL-NEXT:    v_mov_b32_e32 v10, v22
+; GISEL-NEXT:    v_mov_b32_e32 v11, v23
+; GISEL-NEXT:    v_mov_b32_e32 v12, v24
+; GISEL-NEXT:    v_mov_b32_e32 v13, v25
+; GISEL-NEXT:    v_mov_b32_e32 v14, v26
+; GISEL-NEXT:    v_mov_b32_e32 v15, v27
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2807,108 +2340,94 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v28, s0
-; SDAG-NEXT:    v_mov_b32_e32 v29, s1
-; SDAG-NEXT:    v_mov_b32_e32 v30, s2
-; SDAG-NEXT:    v_mov_b32_e32 v31, s3
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
+; SDAG-NEXT:    v_mov_b32_e32 v36, s0
+; SDAG-NEXT:    v_mov_b32_e32 v37, s1
+; SDAG-NEXT:    v_mov_b32_e32 v38, s2
+; SDAG-NEXT:    v_mov_b32_e32 v39, s3
 ; SDAG-NEXT:    v_mov_b32_e32 v13, s25
 ; SDAG-NEXT:    v_mov_b32_e32 v14, s26
 ; SDAG-NEXT:    v_mov_b32_e32 v15, s27
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s28
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v0, s16
-; SDAG-NEXT:    v_mov_b32_e32 v1, s17
-; SDAG-NEXT:    v_mov_b32_e32 v2, s18
-; SDAG-NEXT:    v_mov_b32_e32 v3, s19
-; SDAG-NEXT:    v_mov_b32_e32 v4, s20
-; SDAG-NEXT:    v_mov_b32_e32 v5, s21
-; SDAG-NEXT:    v_mov_b32_e32 v6, s22
-; SDAG-NEXT:    v_mov_b32_e32 v7, s23
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    v_mov_b32_e32 v28, s16
+; SDAG-NEXT:    v_mov_b32_e32 v29, s17
+; SDAG-NEXT:    v_mov_b32_e32 v30, s18
+; SDAG-NEXT:    v_mov_b32_e32 v31, s19
+; SDAG-NEXT:    v_mov_b32_e32 v32, s20
+; SDAG-NEXT:    v_mov_b32_e32 v33, s21
+; SDAG-NEXT:    v_mov_b32_e32 v34, s22
+; SDAG-NEXT:    v_mov_b32_e32 v35, s23
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
+; SDAG-NEXT:    v_mov_b32_e32 v18, v0
+; SDAG-NEXT:    v_mov_b32_e32 v19, v1
+; SDAG-NEXT:    v_mov_b32_e32 v20, v2
+; SDAG-NEXT:    v_mov_b32_e32 v21, v3
+; SDAG-NEXT:    v_mov_b32_e32 v22, v4
+; SDAG-NEXT:    v_mov_b32_e32 v23, v5
+; SDAG-NEXT:    v_mov_b32_e32 v24, v6
+; SDAG-NEXT:    v_mov_b32_e32 v25, v7
+; SDAG-NEXT:    v_mov_b32_e32 v26, v8
+; SDAG-NEXT:    v_mov_b32_e32 v27, v9
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[28:31], v[0:7], v10
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[36:39], v[28:35], v10
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[2:3]
-; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v18, s24
-; GISEL-NEXT:    v_mov_b32_e32 v19, s25
-; GISEL-NEXT:    v_mov_b32_e32 v24, v0
-; GISEL-NEXT:    v_mov_b32_e32 v25, v1
-; GISEL-NEXT:    v_mov_b32_e32 v26, v2
-; GISEL-NEXT:    v_mov_b32_e32 v27, v3
-; GISEL-NEXT:    v_mov_b32_e32 v28, v4
-; GISEL-NEXT:    v_mov_b32_e32 v29, v5
-; GISEL-NEXT:    v_mov_b32_e32 v30, v6
-; GISEL-NEXT:    v_mov_b32_e32 v31, v7
-; GISEL-NEXT:    v_mov_b32_e32 v32, v8
-; GISEL-NEXT:    v_mov_b32_e32 v33, v9
-; GISEL-NEXT:    v_mov_b32_e32 v16, v10
-; GISEL-NEXT:    v_mov_b32_e32 v20, s26
-; GISEL-NEXT:    v_mov_b32_e32 v21, s27
-; GISEL-NEXT:    v_mov_b32_e32 v22, s28
-; GISEL-NEXT:    v_mov_b32_e32 v23, s29
-; GISEL-NEXT:    v_mov_b64_e32 v[54:55], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[52:53], s[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[50:51], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[48:49], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[26:27]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[28:29]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[30:31]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[32:33]
+; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[2:3]
+; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v11, v0
+; GISEL-NEXT:    v_mov_b32_e32 v12, v1
+; GISEL-NEXT:    v_mov_b32_e32 v13, v2
+; GISEL-NEXT:    v_mov_b32_e32 v14, v3
+; GISEL-NEXT:    v_mov_b32_e32 v15, v4
+; GISEL-NEXT:    v_mov_b32_e32 v16, v5
+; GISEL-NEXT:    v_mov_b32_e32 v17, v6
+; GISEL-NEXT:    v_mov_b32_e32 v18, v7
+; GISEL-NEXT:    v_mov_b32_e32 v19, v8
+; GISEL-NEXT:    v_mov_b32_e32 v20, v9
+; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[22:23]
+; GISEL-NEXT:    v_mov_b32_e32 v21, v10
+; GISEL-NEXT:    v_mov_b32_e32 v0, s24
+; GISEL-NEXT:    v_mov_b32_e32 v1, s25
+; GISEL-NEXT:    v_mov_b32_e32 v2, s26
+; GISEL-NEXT:    v_mov_b32_e32 v3, s27
+; GISEL-NEXT:    v_mov_b32_e32 v4, s28
+; GISEL-NEXT:    v_mov_b32_e32 v5, s29
+; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[16:17]
+; GISEL-NEXT:    v_mov_b32_e32 v6, v11
+; GISEL-NEXT:    v_mov_b32_e32 v7, v12
+; GISEL-NEXT:    v_mov_b32_e32 v8, v13
+; GISEL-NEXT:    v_mov_b32_e32 v9, v14
+; GISEL-NEXT:    v_mov_b32_e32 v10, v15
+; GISEL-NEXT:    v_mov_b32_e32 v11, v16
+; GISEL-NEXT:    v_mov_b32_e32 v12, v17
+; GISEL-NEXT:    v_mov_b32_e32 v13, v18
+; GISEL-NEXT:    v_mov_b32_e32 v14, v19
+; GISEL-NEXT:    v_mov_b32_e32 v15, v20
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[34:37], v[48:55], v16
+; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[30:33], v[22:29], v21
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
@@ -3005,42 +2524,25 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32>
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
@@ -3058,14 +2560,22 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32>
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    v_mov_b32_e32 v4, v16
+; GISEL-NEXT:    v_mov_b32_e32 v5, v17
+; GISEL-NEXT:    v_mov_b32_e32 v6, v18
+; GISEL-NEXT:    v_mov_b32_e32 v7, v19
+; GISEL-NEXT:    v_mov_b32_e32 v8, v20
+; GISEL-NEXT:    v_mov_b32_e32 v9, v21
+; GISEL-NEXT:    v_mov_b32_e32 v10, v22
+; GISEL-NEXT:    v_mov_b32_e32 v11, v23
+; GISEL-NEXT:    v_mov_b32_e32 v12, v24
+; GISEL-NEXT:    v_mov_b32_e32 v13, v25
+; GISEL-NEXT:    v_mov_b32_e32 v14, v26
+; GISEL-NEXT:    v_mov_b32_e32 v15, v27
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -3077,42 +2587,25 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
@@ -3130,14 +2623,22 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, <
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    v_mov_b32_e32 v4, v16
+; GISEL-NEXT:    v_mov_b32_e32 v5, v17
+; GISEL-NEXT:    v_mov_b32_e32 v6, v18
+; GISEL-NEXT:    v_mov_b32_e32 v7, v19
+; GISEL-NEXT:    v_mov_b32_e32 v8, v20
+; GISEL-NEXT:    v_mov_b32_e32 v9, v21
+; GISEL-NEXT:    v_mov_b32_e32 v10, v22
+; GISEL-NEXT:    v_mov_b32_e32 v11, v23
+; GISEL-NEXT:    v_mov_b32_e32 v12, v24
+; GISEL-NEXT:    v_mov_b32_e32 v13, v25
+; GISEL-NEXT:    v_mov_b32_e32 v14, v26
+; GISEL-NEXT:    v_mov_b32_e32 v15, v27
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -3149,42 +2650,25 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
@@ -3202,14 +2686,22 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, <
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    v_mov_b32_e32 v4, v16
+; GISEL-NEXT:    v_mov_b32_e32 v5, v17
+; GISEL-NEXT:    v_mov_b32_e32 v6, v18
+; GISEL-NEXT:    v_mov_b32_e32 v7, v19
+; GISEL-NEXT:    v_mov_b32_e32 v8, v20
+; GISEL-NEXT:    v_mov_b32_e32 v9, v21
+; GISEL-NEXT:    v_mov_b32_e32 v10, v22
+; GISEL-NEXT:    v_mov_b32_e32 v11, v23
+; GISEL-NEXT:    v_mov_b32_e32 v12, v24
+; GISEL-NEXT:    v_mov_b32_e32 v13, v25
+; GISEL-NEXT:    v_mov_b32_e32 v14, v26
+; GISEL-NEXT:    v_mov_b32_e32 v15, v27
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -3221,108 +2713,94 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v28, s0
-; SDAG-NEXT:    v_mov_b32_e32 v29, s1
-; SDAG-NEXT:    v_mov_b32_e32 v30, s2
-; SDAG-NEXT:    v_mov_b32_e32 v31, s3
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
+; SDAG-NEXT:    v_mov_b32_e32 v36, s0
+; SDAG-NEXT:    v_mov_b32_e32 v37, s1
+; SDAG-NEXT:    v_mov_b32_e32 v38, s2
+; SDAG-NEXT:    v_mov_b32_e32 v39, s3
 ; SDAG-NEXT:    v_mov_b32_e32 v13, s25
 ; SDAG-NEXT:    v_mov_b32_e32 v14, s26
 ; SDAG-NEXT:    v_mov_b32_e32 v15, s27
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s28
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v0, s16
-; SDAG-NEXT:    v_mov_b32_e32 v1, s17
-; SDAG-NEXT:    v_mov_b32_e32 v2, s18
-; SDAG-NEXT:    v_mov_b32_e32 v3, s19
-; SDAG-NEXT:    v_mov_b32_e32 v4, s20
-; SDAG-NEXT:    v_mov_b32_e32 v5, s21
-; SDAG-NEXT:    v_mov_b32_e32 v6, s22
-; SDAG-NEXT:    v_mov_b32_e32 v7, s23
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    v_mov_b32_e32 v28, s16
+; SDAG-NEXT:    v_mov_b32_e32 v29, s17
+; SDAG-NEXT:    v_mov_b32_e32 v30, s18
+; SDAG-NEXT:    v_mov_b32_e32 v31, s19
+; SDAG-NEXT:    v_mov_b32_e32 v32, s20
+; SDAG-NEXT:    v_mov_b32_e32 v33, s21
+; SDAG-NEXT:    v_mov_b32_e32 v34, s22
+; SDAG-NEXT:    v_mov_b32_e32 v35, s23
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
+; SDAG-NEXT:    v_mov_b32_e32 v18, v0
+; SDAG-NEXT:    v_mov_b32_e32 v19, v1
+; SDAG-NEXT:    v_mov_b32_e32 v20, v2
+; SDAG-NEXT:    v_mov_b32_e32 v21, v3
+; SDAG-NEXT:    v_mov_b32_e32 v22, v4
+; SDAG-NEXT:    v_mov_b32_e32 v23, v5
+; SDAG-NEXT:    v_mov_b32_e32 v24, v6
+; SDAG-NEXT:    v_mov_b32_e32 v25, v7
+; SDAG-NEXT:    v_mov_b32_e32 v26, v8
+; SDAG-NEXT:    v_mov_b32_e32 v27, v9
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[28:31], v[0:7], v10
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[36:39], v[28:35], v10
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[2:3]
-; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v18, s24
-; GISEL-NEXT:    v_mov_b32_e32 v19, s25
-; GISEL-NEXT:    v_mov_b32_e32 v24, v0
-; GISEL-NEXT:    v_mov_b32_e32 v25, v1
-; GISEL-NEXT:    v_mov_b32_e32 v26, v2
-; GISEL-NEXT:    v_mov_b32_e32 v27, v3
-; GISEL-NEXT:    v_mov_b32_e32 v28, v4
-; GISEL-NEXT:    v_mov_b32_e32 v29, v5
-; GISEL-NEXT:    v_mov_b32_e32 v30, v6
-; GISEL-NEXT:    v_mov_b32_e32 v31, v7
-; GISEL-NEXT:    v_mov_b32_e32 v32, v8
-; GISEL-NEXT:    v_mov_b32_e32 v33, v9
-; GISEL-NEXT:    v_mov_b32_e32 v16, v10
-; GISEL-NEXT:    v_mov_b32_e32 v20, s26
-; GISEL-NEXT:    v_mov_b32_e32 v21, s27
-; GISEL-NEXT:    v_mov_b32_e32 v22, s28
-; GISEL-NEXT:    v_mov_b32_e32 v23, s29
-; GISEL-NEXT:    v_mov_b64_e32 v[54:55], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[52:53], s[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[50:51], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[48:49], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[26:27]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[28:29]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[30:31]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[32:33]
+; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[2:3]
+; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v11, v0
+; GISEL-NEXT:    v_mov_b32_e32 v12, v1
+; GISEL-NEXT:    v_mov_b32_e32 v13, v2
+; GISEL-NEXT:    v_mov_b32_e32 v14, v3
+; GISEL-NEXT:    v_mov_b32_e32 v15, v4
+; GISEL-NEXT:    v_mov_b32_e32 v16, v5
+; GISEL-NEXT:    v_mov_b32_e32 v17, v6
+; GISEL-NEXT:    v_mov_b32_e32 v18, v7
+; GISEL-NEXT:    v_mov_b32_e32 v19, v8
+; GISEL-NEXT:    v_mov_b32_e32 v20, v9
+; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[22:23]
+; GISEL-NEXT:    v_mov_b32_e32 v21, v10
+; GISEL-NEXT:    v_mov_b32_e32 v0, s24
+; GISEL-NEXT:    v_mov_b32_e32 v1, s25
+; GISEL-NEXT:    v_mov_b32_e32 v2, s26
+; GISEL-NEXT:    v_mov_b32_e32 v3, s27
+; GISEL-NEXT:    v_mov_b32_e32 v4, s28
+; GISEL-NEXT:    v_mov_b32_e32 v5, s29
+; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[16:17]
+; GISEL-NEXT:    v_mov_b32_e32 v6, v11
+; GISEL-NEXT:    v_mov_b32_e32 v7, v12
+; GISEL-NEXT:    v_mov_b32_e32 v8, v13
+; GISEL-NEXT:    v_mov_b32_e32 v9, v14
+; GISEL-NEXT:    v_mov_b32_e32 v10, v15
+; GISEL-NEXT:    v_mov_b32_e32 v11, v16
+; GISEL-NEXT:    v_mov_b32_e32 v12, v17
+; GISEL-NEXT:    v_mov_b32_e32 v13, v18
+; GISEL-NEXT:    v_mov_b32_e32 v14, v19
+; GISEL-NEXT:    v_mov_b32_e32 v15, v20
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[34:37], v[48:55], v16
+; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[30:33], v[22:29], v21
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
@@ -3419,42 +2897,25 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32>
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
@@ -3472,14 +2933,22 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32>
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    v_mov_b32_e32 v4, v16
+; GISEL-NEXT:    v_mov_b32_e32 v5, v17
+; GISEL-NEXT:    v_mov_b32_e32 v6, v18
+; GISEL-NEXT:    v_mov_b32_e32 v7, v19
+; GISEL-NEXT:    v_mov_b32_e32 v8, v20
+; GISEL-NEXT:    v_mov_b32_e32 v9, v21
+; GISEL-NEXT:    v_mov_b32_e32 v10, v22
+; GISEL-NEXT:    v_mov_b32_e32 v11, v23
+; GISEL-NEXT:    v_mov_b32_e32 v12, v24
+; GISEL-NEXT:    v_mov_b32_e32 v13, v25
+; GISEL-NEXT:    v_mov_b32_e32 v14, v26
+; GISEL-NEXT:    v_mov_b32_e32 v15, v27
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -3491,42 +2960,25 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
@@ -3544,14 +2996,22 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, <
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    v_mov_b32_e32 v4, v16
+; GISEL-NEXT:    v_mov_b32_e32 v5, v17
+; GISEL-NEXT:    v_mov_b32_e32 v6, v18
+; GISEL-NEXT:    v_mov_b32_e32 v7, v19
+; GISEL-NEXT:    v_mov_b32_e32 v8, v20
+; GISEL-NEXT:    v_mov_b32_e32 v9, v21
+; GISEL-NEXT:    v_mov_b32_e32 v10, v22
+; GISEL-NEXT:    v_mov_b32_e32 v11, v23
+; GISEL-NEXT:    v_mov_b32_e32 v12, v24
+; GISEL-NEXT:    v_mov_b32_e32 v13, v25
+; GISEL-NEXT:    v_mov_b32_e32 v14, v26
+; GISEL-NEXT:    v_mov_b32_e32 v15, v27
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -3563,42 +3023,25 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
@@ -3616,14 +3059,22 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, <
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    v_mov_b32_e32 v4, v16
+; GISEL-NEXT:    v_mov_b32_e32 v5, v17
+; GISEL-NEXT:    v_mov_b32_e32 v6, v18
+; GISEL-NEXT:    v_mov_b32_e32 v7, v19
+; GISEL-NEXT:    v_mov_b32_e32 v8, v20
+; GISEL-NEXT:    v_mov_b32_e32 v9, v21
+; GISEL-NEXT:    v_mov_b32_e32 v10, v22
+; GISEL-NEXT:    v_mov_b32_e32 v11, v23
+; GISEL-NEXT:    v_mov_b32_e32 v12, v24
+; GISEL-NEXT:    v_mov_b32_e32 v13, v25
+; GISEL-NEXT:    v_mov_b32_e32 v14, v26
+; GISEL-NEXT:    v_mov_b32_e32 v15, v27
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -3635,108 +3086,94 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v28, s0
-; SDAG-NEXT:    v_mov_b32_e32 v29, s1
-; SDAG-NEXT:    v_mov_b32_e32 v30, s2
-; SDAG-NEXT:    v_mov_b32_e32 v31, s3
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
+; SDAG-NEXT:    v_mov_b32_e32 v36, s0
+; SDAG-NEXT:    v_mov_b32_e32 v37, s1
+; SDAG-NEXT:    v_mov_b32_e32 v38, s2
+; SDAG-NEXT:    v_mov_b32_e32 v39, s3
 ; SDAG-NEXT:    v_mov_b32_e32 v13, s25
 ; SDAG-NEXT:    v_mov_b32_e32 v14, s26
 ; SDAG-NEXT:    v_mov_b32_e32 v15, s27
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s28
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v0, s16
-; SDAG-NEXT:    v_mov_b32_e32 v1, s17
-; SDAG-NEXT:    v_mov_b32_e32 v2, s18
-; SDAG-NEXT:    v_mov_b32_e32 v3, s19
-; SDAG-NEXT:    v_mov_b32_e32 v4, s20
-; SDAG-NEXT:    v_mov_b32_e32 v5, s21
-; SDAG-NEXT:    v_mov_b32_e32 v6, s22
-; SDAG-NEXT:    v_mov_b32_e32 v7, s23
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    v_mov_b32_e32 v28, s16
+; SDAG-NEXT:    v_mov_b32_e32 v29, s17
+; SDAG-NEXT:    v_mov_b32_e32 v30, s18
+; SDAG-NEXT:    v_mov_b32_e32 v31, s19
+; SDAG-NEXT:    v_mov_b32_e32 v32, s20
+; SDAG-NEXT:    v_mov_b32_e32 v33, s21
+; SDAG-NEXT:    v_mov_b32_e32 v34, s22
+; SDAG-NEXT:    v_mov_b32_e32 v35, s23
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
+; SDAG-NEXT:    v_mov_b32_e32 v18, v0
+; SDAG-NEXT:    v_mov_b32_e32 v19, v1
+; SDAG-NEXT:    v_mov_b32_e32 v20, v2
+; SDAG-NEXT:    v_mov_b32_e32 v21, v3
+; SDAG-NEXT:    v_mov_b32_e32 v22, v4
+; SDAG-NEXT:    v_mov_b32_e32 v23, v5
+; SDAG-NEXT:    v_mov_b32_e32 v24, v6
+; SDAG-NEXT:    v_mov_b32_e32 v25, v7
+; SDAG-NEXT:    v_mov_b32_e32 v26, v8
+; SDAG-NEXT:    v_mov_b32_e32 v27, v9
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[28:31], v[0:7], v10
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[36:39], v[28:35], v10
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[2:3]
-; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v18, s24
-; GISEL-NEXT:    v_mov_b32_e32 v19, s25
-; GISEL-NEXT:    v_mov_b32_e32 v24, v0
-; GISEL-NEXT:    v_mov_b32_e32 v25, v1
-; GISEL-NEXT:    v_mov_b32_e32 v26, v2
-; GISEL-NEXT:    v_mov_b32_e32 v27, v3
-; GISEL-NEXT:    v_mov_b32_e32 v28, v4
-; GISEL-NEXT:    v_mov_b32_e32 v29, v5
-; GISEL-NEXT:    v_mov_b32_e32 v30, v6
-; GISEL-NEXT:    v_mov_b32_e32 v31, v7
-; GISEL-NEXT:    v_mov_b32_e32 v32, v8
-; GISEL-NEXT:    v_mov_b32_e32 v33, v9
-; GISEL-NEXT:    v_mov_b32_e32 v16, v10
-; GISEL-NEXT:    v_mov_b32_e32 v20, s26
-; GISEL-NEXT:    v_mov_b32_e32 v21, s27
-; GISEL-NEXT:    v_mov_b32_e32 v22, s28
-; GISEL-NEXT:    v_mov_b32_e32 v23, s29
-; GISEL-NEXT:    v_mov_b64_e32 v[54:55], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[52:53], s[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[50:51], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[48:49], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[26:27]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[28:29]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[30:31]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[32:33]
+; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[2:3]
+; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v11, v0
+; GISEL-NEXT:    v_mov_b32_e32 v12, v1
+; GISEL-NEXT:    v_mov_b32_e32 v13, v2
+; GISEL-NEXT:    v_mov_b32_e32 v14, v3
+; GISEL-NEXT:    v_mov_b32_e32 v15, v4
+; GISEL-NEXT:    v_mov_b32_e32 v16, v5
+; GISEL-NEXT:    v_mov_b32_e32 v17, v6
+; GISEL-NEXT:    v_mov_b32_e32 v18, v7
+; GISEL-NEXT:    v_mov_b32_e32 v19, v8
+; GISEL-NEXT:    v_mov_b32_e32 v20, v9
+; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[22:23]
+; GISEL-NEXT:    v_mov_b32_e32 v21, v10
+; GISEL-NEXT:    v_mov_b32_e32 v0, s24
+; GISEL-NEXT:    v_mov_b32_e32 v1, s25
+; GISEL-NEXT:    v_mov_b32_e32 v2, s26
+; GISEL-NEXT:    v_mov_b32_e32 v3, s27
+; GISEL-NEXT:    v_mov_b32_e32 v4, s28
+; GISEL-NEXT:    v_mov_b32_e32 v5, s29
+; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[16:17]
+; GISEL-NEXT:    v_mov_b32_e32 v6, v11
+; GISEL-NEXT:    v_mov_b32_e32 v7, v12
+; GISEL-NEXT:    v_mov_b32_e32 v8, v13
+; GISEL-NEXT:    v_mov_b32_e32 v9, v14
+; GISEL-NEXT:    v_mov_b32_e32 v10, v15
+; GISEL-NEXT:    v_mov_b32_e32 v11, v16
+; GISEL-NEXT:    v_mov_b32_e32 v12, v17
+; GISEL-NEXT:    v_mov_b32_e32 v13, v18
+; GISEL-NEXT:    v_mov_b32_e32 v14, v19
+; GISEL-NEXT:    v_mov_b32_e32 v15, v20
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[34:37], v[48:55], v16
+; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[30:33], v[22:29], v21
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
@@ -3833,42 +3270,25 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32>
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
@@ -3886,14 +3306,22 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32>
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    v_mov_b32_e32 v4, v16
+; GISEL-NEXT:    v_mov_b32_e32 v5, v17
+; GISEL-NEXT:    v_mov_b32_e32 v6, v18
+; GISEL-NEXT:    v_mov_b32_e32 v7, v19
+; GISEL-NEXT:    v_mov_b32_e32 v8, v20
+; GISEL-NEXT:    v_mov_b32_e32 v9, v21
+; GISEL-NEXT:    v_mov_b32_e32 v10, v22
+; GISEL-NEXT:    v_mov_b32_e32 v11, v23
+; GISEL-NEXT:    v_mov_b32_e32 v12, v24
+; GISEL-NEXT:    v_mov_b32_e32 v13, v25
+; GISEL-NEXT:    v_mov_b32_e32 v14, v26
+; GISEL-NEXT:    v_mov_b32_e32 v15, v27
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -3905,42 +3333,25 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
@@ -3958,14 +3369,22 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, <
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    v_mov_b32_e32 v4, v16
+; GISEL-NEXT:    v_mov_b32_e32 v5, v17
+; GISEL-NEXT:    v_mov_b32_e32 v6, v18
+; GISEL-NEXT:    v_mov_b32_e32 v7, v19
+; GISEL-NEXT:    v_mov_b32_e32 v8, v20
+; GISEL-NEXT:    v_mov_b32_e32 v9, v21
+; GISEL-NEXT:    v_mov_b32_e32 v10, v22
+; GISEL-NEXT:    v_mov_b32_e32 v11, v23
+; GISEL-NEXT:    v_mov_b32_e32 v12, v24
+; GISEL-NEXT:    v_mov_b32_e32 v13, v25
+; GISEL-NEXT:    v_mov_b32_e32 v14, v26
+; GISEL-NEXT:    v_mov_b32_e32 v15, v27
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -3977,42 +3396,25 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
@@ -4030,14 +3432,22 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, <
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    v_mov_b32_e32 v4, v16
+; GISEL-NEXT:    v_mov_b32_e32 v5, v17
+; GISEL-NEXT:    v_mov_b32_e32 v6, v18
+; GISEL-NEXT:    v_mov_b32_e32 v7, v19
+; GISEL-NEXT:    v_mov_b32_e32 v8, v20
+; GISEL-NEXT:    v_mov_b32_e32 v9, v21
+; GISEL-NEXT:    v_mov_b32_e32 v10, v22
+; GISEL-NEXT:    v_mov_b32_e32 v11, v23
+; GISEL-NEXT:    v_mov_b32_e32 v12, v24
+; GISEL-NEXT:    v_mov_b32_e32 v13, v25
+; GISEL-NEXT:    v_mov_b32_e32 v14, v26
+; GISEL-NEXT:    v_mov_b32_e32 v15, v27
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -4049,108 +3459,94 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v28, s0
-; SDAG-NEXT:    v_mov_b32_e32 v29, s1
-; SDAG-NEXT:    v_mov_b32_e32 v30, s2
-; SDAG-NEXT:    v_mov_b32_e32 v31, s3
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
+; SDAG-NEXT:    v_mov_b32_e32 v36, s0
+; SDAG-NEXT:    v_mov_b32_e32 v37, s1
+; SDAG-NEXT:    v_mov_b32_e32 v38, s2
+; SDAG-NEXT:    v_mov_b32_e32 v39, s3
 ; SDAG-NEXT:    v_mov_b32_e32 v13, s25
 ; SDAG-NEXT:    v_mov_b32_e32 v14, s26
 ; SDAG-NEXT:    v_mov_b32_e32 v15, s27
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s28
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v0, s16
-; SDAG-NEXT:    v_mov_b32_e32 v1, s17
-; SDAG-NEXT:    v_mov_b32_e32 v2, s18
-; SDAG-NEXT:    v_mov_b32_e32 v3, s19
-; SDAG-NEXT:    v_mov_b32_e32 v4, s20
-; SDAG-NEXT:    v_mov_b32_e32 v5, s21
-; SDAG-NEXT:    v_mov_b32_e32 v6, s22
-; SDAG-NEXT:    v_mov_b32_e32 v7, s23
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    v_mov_b32_e32 v28, s16
+; SDAG-NEXT:    v_mov_b32_e32 v29, s17
+; SDAG-NEXT:    v_mov_b32_e32 v30, s18
+; SDAG-NEXT:    v_mov_b32_e32 v31, s19
+; SDAG-NEXT:    v_mov_b32_e32 v32, s20
+; SDAG-NEXT:    v_mov_b32_e32 v33, s21
+; SDAG-NEXT:    v_mov_b32_e32 v34, s22
+; SDAG-NEXT:    v_mov_b32_e32 v35, s23
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
+; SDAG-NEXT:    v_mov_b32_e32 v18, v0
+; SDAG-NEXT:    v_mov_b32_e32 v19, v1
+; SDAG-NEXT:    v_mov_b32_e32 v20, v2
+; SDAG-NEXT:    v_mov_b32_e32 v21, v3
+; SDAG-NEXT:    v_mov_b32_e32 v22, v4
+; SDAG-NEXT:    v_mov_b32_e32 v23, v5
+; SDAG-NEXT:    v_mov_b32_e32 v24, v6
+; SDAG-NEXT:    v_mov_b32_e32 v25, v7
+; SDAG-NEXT:    v_mov_b32_e32 v26, v8
+; SDAG-NEXT:    v_mov_b32_e32 v27, v9
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[28:31], v[0:7], v10
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[36:39], v[28:35], v10
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[2:3]
-; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v18, s24
-; GISEL-NEXT:    v_mov_b32_e32 v19, s25
-; GISEL-NEXT:    v_mov_b32_e32 v24, v0
-; GISEL-NEXT:    v_mov_b32_e32 v25, v1
-; GISEL-NEXT:    v_mov_b32_e32 v26, v2
-; GISEL-NEXT:    v_mov_b32_e32 v27, v3
-; GISEL-NEXT:    v_mov_b32_e32 v28, v4
-; GISEL-NEXT:    v_mov_b32_e32 v29, v5
-; GISEL-NEXT:    v_mov_b32_e32 v30, v6
-; GISEL-NEXT:    v_mov_b32_e32 v31, v7
-; GISEL-NEXT:    v_mov_b32_e32 v32, v8
-; GISEL-NEXT:    v_mov_b32_e32 v33, v9
-; GISEL-NEXT:    v_mov_b32_e32 v16, v10
-; GISEL-NEXT:    v_mov_b32_e32 v20, s26
-; GISEL-NEXT:    v_mov_b32_e32 v21, s27
-; GISEL-NEXT:    v_mov_b32_e32 v22, s28
-; GISEL-NEXT:    v_mov_b32_e32 v23, s29
-; GISEL-NEXT:    v_mov_b64_e32 v[54:55], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[52:53], s[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[50:51], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[48:49], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[26:27]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[28:29]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[30:31]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[32:33]
+; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[2:3]
+; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v11, v0
+; GISEL-NEXT:    v_mov_b32_e32 v12, v1
+; GISEL-NEXT:    v_mov_b32_e32 v13, v2
+; GISEL-NEXT:    v_mov_b32_e32 v14, v3
+; GISEL-NEXT:    v_mov_b32_e32 v15, v4
+; GISEL-NEXT:    v_mov_b32_e32 v16, v5
+; GISEL-NEXT:    v_mov_b32_e32 v17, v6
+; GISEL-NEXT:    v_mov_b32_e32 v18, v7
+; GISEL-NEXT:    v_mov_b32_e32 v19, v8
+; GISEL-NEXT:    v_mov_b32_e32 v20, v9
+; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[22:23]
+; GISEL-NEXT:    v_mov_b32_e32 v21, v10
+; GISEL-NEXT:    v_mov_b32_e32 v0, s24
+; GISEL-NEXT:    v_mov_b32_e32 v1, s25
+; GISEL-NEXT:    v_mov_b32_e32 v2, s26
+; GISEL-NEXT:    v_mov_b32_e32 v3, s27
+; GISEL-NEXT:    v_mov_b32_e32 v4, s28
+; GISEL-NEXT:    v_mov_b32_e32 v5, s29
+; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[16:17]
+; GISEL-NEXT:    v_mov_b32_e32 v6, v11
+; GISEL-NEXT:    v_mov_b32_e32 v7, v12
+; GISEL-NEXT:    v_mov_b32_e32 v8, v13
+; GISEL-NEXT:    v_mov_b32_e32 v9, v14
+; GISEL-NEXT:    v_mov_b32_e32 v10, v15
+; GISEL-NEXT:    v_mov_b32_e32 v11, v16
+; GISEL-NEXT:    v_mov_b32_e32 v12, v17
+; GISEL-NEXT:    v_mov_b32_e32 v13, v18
+; GISEL-NEXT:    v_mov_b32_e32 v14, v19
+; GISEL-NEXT:    v_mov_b32_e32 v15, v20
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[34:37], v[48:55], v16
+; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[30:33], v[22:29], v21
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result

diff  --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
index 54b535ca43126..f76580b94e13c 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
@@ -5,6 +5,8 @@
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GREEDY,GREEDY90A-GISEL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -sgpr-regalloc=fast -vgpr-regalloc=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,FAST %s
 
+; NOTE: codegen for these tests improved on gfx90a after SIFoldOperands
+; learned to fold copies to AGPR from reg_sequence at the copy.
+
 ; Check that Dst and SrcC of MFMA instructions reading more than 4 registers as SrcC
 ; is either completely disjoint or exactly the same, but does not alias.
 
@@ -15,8 +17,8 @@ declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i3
 ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32:
 ; GREEDY: v_mfma_f32_32x32x1{{.*}} a[0:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31]
 ; GREEDY: v_mfma_f32_32x32x1{{.*}} a[32:63], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31]
-; FAST:   v_mfma_f32_32x32x1{{.*}} a[64:95], v{{[0-9]+}}, v{{[0-9]+}}, a[64:95]
-; FAST:   v_mfma_f32_32x32x1{{.*}} a[32:63], v{{[0-9]+}}, v{{[0-9]+}}, a[64:95]
+; FAST:   v_mfma_f32_32x32x1{{.*}} a[32:63], v{{[0-9]+}}, v{{[0-9]+}}, a[32:63]
+; FAST:   v_mfma_f32_32x32x1{{.*}} a[0:31], v{{[0-9]+}}, v{{[0-9]+}}, a[32:63]
 ; GCN:    v_mfma_f32_32x32x1{{.*}} a[0:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:31]
 define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
 bb:
@@ -33,14 +35,14 @@ bb:
 ; GREEDY908: v_mfma_f32_16x16x1{{.*}} a[18:33], v{{[0-9]+}}, v{{[0-9]+}}, a[18:33]
 ; GREEDY908: v_mfma_f32_16x16x1{{.*}} a[2:17], v{{[0-9]+}}, v{{[0-9]+}}, a[18:33]
 
-; GREEDY90A: v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[16:31]
-; GREEDY90A: v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[16:31]
+; GREEDY90A: v_mfma_f32_16x16x1{{.*}} a[18:33], v{{[0-9]+}}, v{{[0-9]+}}, a[18:33]
+; GREEDY90A: v_mfma_f32_16x16x1{{.*}} a[2:17], v{{[0-9]+}}, v{{[0-9]+}}, a[18:33]
 
 ; GREEDY90A-GISEL: v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15]
 ; GREEDY90A-GISEL: v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15]
 
-; FAST:   v_mfma_f32_16x16x1{{.*}} a[32:47], v{{[0-9]+}}, v{{[0-9]+}}, a[32:47]
-; FAST:   v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[32:47]
+; FAST:   v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15]
+; FAST:   v_mfma_f32_16x16x1{{.*}} a[16:31], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15]
 ; GCN:    v_mfma_f32_16x16x1{{.*}} a[0:15], v{{[0-9]+}}, v{{[0-9]+}}, a[0:15]
 define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
 bb:
@@ -58,8 +60,8 @@ bb:
 ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32:
 ; GREEDY: v_mfma_f32_4x4x1{{.*}} a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3]
 ; GREEDY: v_mfma_f32_4x4x1{{.*}} a[2:5], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3]
-; FAST:   v_mfma_f32_4x4x1{{.*}} a[8:11], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3]
-; FAST:   v_mfma_f32_4x4x1{{.*}} a[4:7], v{{[0-9]+}}, v{{[0-9]+}}, a[8:11]
+; FAST:   v_mfma_f32_4x4x1{{.*}} a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3]
+; FAST:   v_mfma_f32_4x4x1{{.*}} a[4:7], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3]
 ; GCN:    v_mfma_f32_4x4x1{{.*}} a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3]
 define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 {
 bb:

diff  --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
index dad59daaefb5f..3844d6054e130 100644
--- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
@@ -6,32 +6,29 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
 ; GFX942-LABEL: matmul_kernel:
 ; GFX942:       ; %bb.0: ; %entry
 ; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a2, v0
 ; GFX942-NEXT:    s_mov_b32 s2, 0
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, v1
-; GFX942-NEXT:    s_mov_b32 s3, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, 0
 ; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX942-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX942-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v0
+; GFX942-NEXT:    s_mov_b32 s3, 0
 ; GFX942-NEXT:    s_branch .LBB0_2
 ; GFX942-NEXT:  .LBB0_1: ; %bb2
 ; GFX942-NEXT:    ; in Loop: Header=BB0_2 Depth=1
 ; GFX942-NEXT:    s_or_b32 s4, s3, 1
 ; GFX942-NEXT:    s_ashr_i32 s5, s3, 31
 ; GFX942-NEXT:    s_mov_b32 s3, s2
-; GFX942-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
-; GFX942-NEXT:    v_accvgpr_read_b32 v0, a0
-; GFX942-NEXT:    v_mov_b32_e32 v2, v1
-; GFX942-NEXT:    v_mov_b32_e32 v3, v1
-; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
-; GFX942-NEXT:    v_accvgpr_write_b32 a1, v1
-; GFX942-NEXT:    v_accvgpr_write_b32 a2, v2
-; GFX942-NEXT:    v_accvgpr_write_b32 a3, v3
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-NEXT:    v_accvgpr_mov_b32 a0, a2
+; GFX942-NEXT:    v_accvgpr_mov_b32 a2, a1
+; GFX942-NEXT:    v_accvgpr_mov_b32 a3, a1
 ; GFX942-NEXT:    s_and_b32 s3, s5, s4
 ; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mfma_f32_16x16x16_f16 a[0:3], v[4:5], v[4:5], a[0:3]
+; GFX942-NEXT:    v_mfma_f32_16x16x16_f16 a[2:5], v[2:3], v[2:3], a[0:3]
 ; GFX942-NEXT:    s_cbranch_execz .LBB0_4
 ; GFX942-NEXT:  .LBB0_2: ; %bb
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -39,40 +36,40 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
 ; GFX942-NEXT:    s_cbranch_vccz .LBB0_1
 ; GFX942-NEXT:  ; %bb.3:
 ; GFX942-NEXT:    ; implicit-def: $sgpr3
-; GFX942-NEXT:    ; implicit-def: $agpr0
+; GFX942-NEXT:    ; implicit-def: $agpr2
 ; GFX942-NEXT:  .LBB0_4: ; %common.ret
 ; GFX942-NEXT:    s_endpgm
 ;
 ; GFX908-LABEL: matmul_kernel:
 ; GFX908:       ; %bb.0: ; %entry
 ; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX908-NEXT:    v_mov_b32_e32 v1, 0
+; GFX908-NEXT:    v_mov_b32_e32 v0, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a1, 0
 ; GFX908-NEXT:    s_mov_b32 s2, 0
-; GFX908-NEXT:    s_mov_b32 s3, 0
-; GFX908-NEXT:    v_accvgpr_write_b32 a0, v1
+; GFX908-NEXT:    v_accvgpr_write_b32 a2, v0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX908-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX908-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX908-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; GFX908-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v0
+; GFX908-NEXT:    s_mov_b32 s3, 0
 ; GFX908-NEXT:    s_branch .LBB0_2
 ; GFX908-NEXT:  .LBB0_1: ; %bb2
 ; GFX908-NEXT:    ; in Loop: Header=BB0_2 Depth=1
 ; GFX908-NEXT:    s_or_b32 s4, s3, 1
 ; GFX908-NEXT:    s_ashr_i32 s5, s3, 31
 ; GFX908-NEXT:    s_mov_b32 s3, s2
-; GFX908-NEXT:    s_nop 3
-; GFX908-NEXT:    v_accvgpr_read_b32 v0, a0
-; GFX908-NEXT:    v_mov_b32_e32 v5, s3
-; GFX908-NEXT:    v_mov_b32_e32 v4, s2
-; GFX908-NEXT:    v_mov_b32_e32 v2, v1
-; GFX908-NEXT:    v_mov_b32_e32 v3, v1
+; GFX908-NEXT:    v_mov_b32_e32 v1, s2
+; GFX908-NEXT:    s_nop 2
+; GFX908-NEXT:    v_accvgpr_read_b32 v0, a2
+; GFX908-NEXT:    v_mov_b32_e32 v2, s3
+; GFX908-NEXT:    v_accvgpr_read_b32 v4, a1
+; GFX908-NEXT:    v_accvgpr_read_b32 v3, a1
 ; GFX908-NEXT:    v_accvgpr_write_b32 a0, v0
-; GFX908-NEXT:    v_accvgpr_write_b32 a1, v1
-; GFX908-NEXT:    v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT:    v_accvgpr_write_b32 a2, v4
 ; GFX908-NEXT:    v_accvgpr_write_b32 a3, v3
 ; GFX908-NEXT:    s_and_b32 s3, s5, s4
-; GFX908-NEXT:    v_mfma_f32_16x16x16f16 a[0:3], v[4:5], v[4:5], a[0:3]
+; GFX908-NEXT:    v_mfma_f32_16x16x16f16 a[2:5], v[1:2], v[1:2], a[0:3]
 ; GFX908-NEXT:    s_cbranch_execz .LBB0_4
 ; GFX908-NEXT:  .LBB0_2: ; %bb
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -80,7 +77,7 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
 ; GFX908-NEXT:    s_cbranch_vccz .LBB0_1
 ; GFX908-NEXT:  ; %bb.3:
 ; GFX908-NEXT:    ; implicit-def: $sgpr3
-; GFX908-NEXT:    ; implicit-def: $agpr0
+; GFX908-NEXT:    ; implicit-def: $agpr2
 ; GFX908-NEXT:  .LBB0_4: ; %common.ret
 ; GFX908-NEXT:    s_endpgm
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir
index 5c83170563e59..ee5481617cf59 100644
--- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir
@@ -45,8 +45,11 @@ body:             |
   ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3
   ; CHECK-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
-  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:areg_128_align2 = COPY [[REG_SEQUENCE]]
-  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY4]], [[COPY4]], killed [[COPY5]], 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; CHECK-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_2:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+  ; CHECK-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:areg_128_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[V_ACCVGPR_WRITE_B32_e64_]], %subreg.sub1, [[V_ACCVGPR_WRITE_B32_e64_1]], %subreg.sub2, [[V_ACCVGPR_WRITE_B32_e64_2]], %subreg.sub3
+  ; CHECK-NEXT:   [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY4]], [[COPY4]], killed [[REG_SEQUENCE2]], 0, 0, 0, implicit $mode, implicit $exec
   ; CHECK-NEXT:   [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -77,16 +80,16 @@ body:             |
   ; COALESCE-NEXT:   S_BITCMP1_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc
   ; COALESCE-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 0
   ; COALESCE-NEXT:   [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit killed $scc
-  ; COALESCE-NEXT:   undef [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_128_align2 = V_MOV_B32_e32 0, implicit $exec
   ; COALESCE-NEXT:   undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
   ; COALESCE-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
   ; COALESCE-NEXT:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec
+  ; COALESCE-NEXT:   undef [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub1:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
   ; COALESCE-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; COALESCE-NEXT: {{  $}}
   ; COALESCE-NEXT: bb.1:
   ; COALESCE-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
   ; COALESCE-NEXT: {{  $}}
-  ; COALESCE-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0
+  ; COALESCE-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0
   ; COALESCE-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1
   ; COALESCE-NEXT:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
   ; COALESCE-NEXT:   S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc
@@ -98,12 +101,12 @@ body:             |
   ; COALESCE-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], 1, implicit-def dead $scc
   ; COALESCE-NEXT:   [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_MOV_B32_1]], 31, implicit-def dead $scc
   ; COALESCE-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc
-  ; COALESCE-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]].sub2:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1
-  ; COALESCE-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]].sub3:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1
   ; COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0
-  ; COALESCE-NEXT:   [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]]
-  ; COALESCE-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[V_MOV_B32_e32_]]
-  ; COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; COALESCE-NEXT:   [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]]
+  ; COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY1]]
+  ; COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub1
+  ; COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub1
+  ; COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY2]], [[COPY2]], [[V_ACCVGPR_WRITE_B32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
   ; COALESCE-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0
   ; COALESCE-NEXT: {{  $}}
   ; COALESCE-NEXT: bb.3:
@@ -131,16 +134,16 @@ body:             |
   ; GFX908-COALESCE-NEXT:   S_BITCMP1_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc
   ; GFX908-COALESCE-NEXT:   undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 0
   ; GFX908-COALESCE-NEXT:   [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit killed $scc
-  ; GFX908-COALESCE-NEXT:   undef [[V_MOV_B32_e32_:%[0-9]+]].sub1:vreg_128_align2 = V_MOV_B32_e32 0, implicit $exec
   ; GFX908-COALESCE-NEXT:   undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
   ; GFX908-COALESCE-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec
   ; GFX908-COALESCE-NEXT:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec
+  ; GFX908-COALESCE-NEXT:   undef [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub1:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
   ; GFX908-COALESCE-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
   ; GFX908-COALESCE-NEXT: {{  $}}
   ; GFX908-COALESCE-NEXT: bb.1:
   ; GFX908-COALESCE-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
   ; GFX908-COALESCE-NEXT: {{  $}}
-  ; GFX908-COALESCE-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]].sub0:vreg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0
+  ; GFX908-COALESCE-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0
   ; GFX908-COALESCE-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1
   ; GFX908-COALESCE-NEXT:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
   ; GFX908-COALESCE-NEXT:   S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc
@@ -152,12 +155,12 @@ body:             |
   ; GFX908-COALESCE-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_MOV_B32_1]], 1, implicit-def dead $scc
   ; GFX908-COALESCE-NEXT:   [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_MOV_B32_1]], 31, implicit-def dead $scc
   ; GFX908-COALESCE-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc
-  ; GFX908-COALESCE-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]].sub2:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1
-  ; GFX908-COALESCE-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]].sub3:vreg_128_align2 = COPY [[V_MOV_B32_e32_]].sub1
   ; GFX908-COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0
-  ; GFX908-COALESCE-NEXT:   [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]]
-  ; GFX908-COALESCE-NEXT:   [[COPY2:%[0-9]+]]:areg_128_align2 = COPY [[V_MOV_B32_e32_]]
-  ; GFX908-COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]]
+  ; GFX908-COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY1]]
+  ; GFX908-COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub1
+  ; GFX908-COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub1
+  ; GFX908-COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY2]], [[COPY2]], [[V_ACCVGPR_WRITE_B32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
   ; GFX908-COALESCE-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0
   ; GFX908-COALESCE-NEXT: {{  $}}
   ; GFX908-COALESCE-NEXT: bb.3:

diff  --git a/llvm/test/CodeGen/AMDGPU/remaining-virtual-register-operands.ll b/llvm/test/CodeGen/AMDGPU/remaining-virtual-register-operands.ll
index 9279b44edac75..8383930ebfcd1 100644
--- a/llvm/test/CodeGen/AMDGPU/remaining-virtual-register-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/remaining-virtual-register-operands.ll
@@ -1,4 +1,6 @@
-; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs -filetype=null %s 2>&1 | FileCheck -implicit-check-not=error %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs -enable-misched=0 -filetype=null %s 2>&1 | FileCheck -implicit-check-not=error %s
+
+; Scheduler disabled to work around issue #129028
 
 ; This testcase fails register allocation at the same time it performs
 ; virtual register splitting (by introducing VGPR to AGPR copies). We
@@ -13,13 +15,16 @@
 ; CHECK: error: <unknown>:0:0: ran out of registers during register allocation
 define amdgpu_kernel void @alloc_failure_with_split_vregs(float %v0, float %v1) #0 {
   %agpr0 = call float asm sideeffect "; def $0", "=${a0}"()
-  %agpr.vec = insertelement <16 x float> undef, float %agpr0, i32 0
+  %agpr.vec = insertelement <16 x float> zeroinitializer, float %agpr0, i32 0
   %mfma0 = call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %v0, float %v1, <16 x float> %agpr.vec, i32 0, i32 0, i32 0)
   %mfma0.3 = extractelement <16 x float> %mfma0, i32 3
   %insert = insertelement <16 x float> %mfma0, float %agpr0, i32 8
+
   %mfma1 = call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float %v0, float %v1, <16 x float> %insert, i32 0, i32 0, i32 0)
   %mfma1.3 = extractelement <16 x float> %mfma1, i32 3
   call void asm sideeffect "; use $0", "{a1}"(float %mfma1.3)
+  call void asm sideeffect "; use $0", "a"(<16 x float> %agpr.vec)
+
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll b/llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll
index a7e361b4b67cc..a703ce0f6064c 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-xdl-resource.ll
@@ -3,9 +3,8 @@
 
 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half>, <4 x half>, <32 x float>, i32, i32, i32)
 
-; CHECK: CritRes: {{[0-9]+}} HWXDL
-; CHECK: Picking: Cand SU([[nid:[0-9]+]]) RES-DEMAND
-; CHECK: Scheduling SU([[nid]]) {{.*}} V_MFMA_F32_32X32X4F16
+; CHECK: Scheduling SU({{[0-9]+}}) {{.*}} V_MFMA_F32_32X32X4F16
+; CHECK: HWXDL +16x1u
 define amdgpu_kernel void @schedule-xdl-resource(ptr addrspace(1) %in, ptr addrspace(1) %out, ptr addrspace(3) %lds, i32 %stride) #0 {
   %in_ptr.1 = getelementptr <32 x float>, ptr addrspace(1) %in, i32 %stride
   %in_ptr.2 = getelementptr <32 x float>, ptr addrspace(1) %in_ptr.1, i32 %stride

diff  --git a/llvm/test/CodeGen/AMDGPU/si-fold-operands-agpr-copy-reg-sequence.mir b/llvm/test/CodeGen/AMDGPU/si-fold-operands-agpr-copy-reg-sequence.mir
index 9d167f578e9eb..80f13b5102097 100644
--- a/llvm/test/CodeGen/AMDGPU/si-fold-operands-agpr-copy-reg-sequence.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-operands-agpr-copy-reg-sequence.mir
@@ -10,8 +10,12 @@ body:             |
     ; CHECK-LABEL: name: s_mov_b32_0_copy_vgpr_reg_sequence_128_splat_copy_to_agpr
     ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY [[REG_SEQUENCE]]
-    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY]]
+    ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+    ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+    ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_2:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+    ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_3:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_e64_]], %subreg.sub0, [[V_ACCVGPR_WRITE_B32_e64_1]], %subreg.sub1, [[V_ACCVGPR_WRITE_B32_e64_2]], %subreg.sub2, [[V_ACCVGPR_WRITE_B32_e64_3]], %subreg.sub3
+    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
     %0:sgpr_32 = S_MOV_B32 0
     %1:vgpr_32 = COPY killed %0
@@ -30,8 +34,12 @@ body:             |
     ; CHECK-LABEL: name: v_mov_b32_0_vgpr_reg_sequence_128_splat_copy_to_agpr
     ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed [[REG_SEQUENCE]]
-    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY]]
+    ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+    ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+    ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_2:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+    ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_3:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_e64_]], %subreg.sub0, [[V_ACCVGPR_WRITE_B32_e64_1]], %subreg.sub1, [[V_ACCVGPR_WRITE_B32_e64_2]], %subreg.sub2, [[V_ACCVGPR_WRITE_B32_e64_3]], %subreg.sub3
+    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
     %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     %1:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1, %0, %subreg.sub2, %0, %subreg.sub3
@@ -50,8 +58,12 @@ body:             |
     ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3
     ; CHECK-NEXT: S_NOP 0, implicit [[REG_SEQUENCE]]
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed [[REG_SEQUENCE]]
-    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY]]
+    ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+    ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+    ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_2:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+    ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_3:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_e64_]], %subreg.sub0, [[V_ACCVGPR_WRITE_B32_e64_1]], %subreg.sub1, [[V_ACCVGPR_WRITE_B32_e64_2]], %subreg.sub2, [[V_ACCVGPR_WRITE_B32_e64_3]], %subreg.sub3
+    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
     %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     %1:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1, %0, %subreg.sub2, %0, %subreg.sub3
@@ -70,8 +82,8 @@ body:             |
     ; CHECK-LABEL: name: s_mov_b32_literal_copy_vgpr_reg_sequence_128_splat_copy_to_agpr
     ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 999, implicit $exec
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY [[REG_SEQUENCE]]
-    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY]]
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3
+    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
     %0:sgpr_32 = S_MOV_B32 999
     %1:vgpr_32 = COPY %0
@@ -90,8 +102,8 @@ body:             |
     ; CHECK-LABEL: name: v_mov_b32_literal_vgpr_reg_sequence_128_splat_copy_to_agpr
     ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 999, implicit $exec
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed [[REG_SEQUENCE]]
-    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY]]
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3
+    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
     %0:vgpr_32 = V_MOV_B32_e32 999, implicit $exec
     %1:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1, %0, %subreg.sub2, %0, %subreg.sub3
@@ -110,8 +122,8 @@ body:             |
     ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 999, implicit $exec
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3
     ; CHECK-NEXT: S_NOP 0, implicit [[REG_SEQUENCE]]
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed [[REG_SEQUENCE]]
-    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY]]
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3
+    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
     %0:vgpr_32 = V_MOV_B32_e32 999, implicit $exec
     %1:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1, %0, %subreg.sub2, %0, %subreg.sub3
@@ -205,12 +217,7 @@ body:             |
   bb.0:
     ; CHECK-LABEL: name: s_mov_b32_999_splat_sgpr_128_copy_vgpr_copy_agpr
     ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 999
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:agpr_32 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:agpr_32 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:agpr_32 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:agpr_32 = COPY [[COPY]]
-    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:areg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:areg_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
     ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
     %0:sgpr_32 = S_MOV_B32 999
@@ -231,11 +238,8 @@ body:             |
     ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 999
     ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sgpr_32 = S_MOV_B32 1
     ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:agpr_32 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:agpr_32 = COPY [[COPY]]
     ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec
-    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:areg_128 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_e64_]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[V_ACCVGPR_WRITE_B32_e64_1]], %subreg.sub3
+    ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:areg_128 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_e64_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[V_ACCVGPR_WRITE_B32_e64_1]], %subreg.sub3
     ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
     %0:sgpr_32 = S_MOV_B32 999
@@ -256,8 +260,12 @@ body:             |
     ; CHECK-LABEL: name: s_mov_b32_0_splat_sgpr_128_copy_agpr
     ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY [[REG_SEQUENCE]]
-    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY]]
+    ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+    ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+    ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_2:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+    ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_3:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_e64_]], %subreg.sub0, [[V_ACCVGPR_WRITE_B32_e64_1]], %subreg.sub1, [[V_ACCVGPR_WRITE_B32_e64_2]], %subreg.sub2, [[V_ACCVGPR_WRITE_B32_e64_3]], %subreg.sub3
+    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
     %0:sgpr_32 = S_MOV_B32 0
     %1:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1, %0, %subreg.sub2, %0, %subreg.sub3
@@ -277,8 +285,10 @@ body:             |
     ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 999, implicit $exec
     ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8, implicit $exec
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1, [[V_MOV_B32_e32_2]], %subreg.sub2, [[V_MOV_B32_e32_1]], %subreg.sub3
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed [[REG_SEQUENCE]]
-    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY]]
+    ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+    ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 8, implicit $exec
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_e64_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1, [[V_ACCVGPR_WRITE_B32_e64_1]], %subreg.sub2, [[V_MOV_B32_e32_1]], %subreg.sub3
+    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
     %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     %1:vgpr_32 = V_MOV_B32_e32 999, implicit $exec
@@ -298,8 +308,9 @@ body:             |
     ; CHECK-LABEL: name: s_mov_b64_0_copy_vgpr_reg_sequence_128_splat_copy_to_agpr_elt64
     ; CHECK: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_MOV_B]], %subreg.sub0_sub1, [[V_MOV_B]], %subreg.sub2_sub3
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY [[REG_SEQUENCE]]
-    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY]]
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY [[V_MOV_B]]
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2_sub3
+    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
     %0:sreg_64 = S_MOV_B64 0
     %1:vreg_64_align2 = COPY killed %0
@@ -318,8 +329,9 @@ body:             |
     ; CHECK-LABEL: name: s_mov_b64_0_copy_vgpr_reg_sequence_128_splat_copy_to_agpr_subreg_elt32
     ; CHECK: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_MOV_B]].sub0, %subreg.sub0, [[V_MOV_B]].sub1, %subreg.sub1, [[V_MOV_B]], %subreg.sub1_sub2
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY [[REG_SEQUENCE]]
-    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY]]
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY [[V_MOV_B]]
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[V_MOV_B]].sub0, %subreg.sub0, [[V_MOV_B]].sub1, %subreg.sub1, [[COPY]], %subreg.sub1_sub2
+    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
     %0:sreg_64 = S_MOV_B64 0
     %1:vreg_64_align2 = COPY killed %0
@@ -342,8 +354,10 @@ body:             |
     ; CHECK-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_MOV_B]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2_sub3
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128 = COPY [[REG_SEQUENCE]]
-    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY1]]
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[V_MOV_B]]
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[COPY]]
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
+    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
     %0:sreg_64 = S_MOV_B64 0
     %1:vreg_64_align2 = COPY killed %0
@@ -367,8 +381,9 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr8_sgpr9
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[COPY]]
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY1]], %subreg.sub2_sub3
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:areg_128 = COPY [[REG_SEQUENCE]]
-    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY2]]
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[COPY1]]
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3
+    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
     %0:sreg_64 = COPY $sgpr8_sgpr9
     %1:vreg_64_align2 = COPY killed %0
@@ -392,8 +407,10 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr8_sgpr9
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY $sgpr10_sgpr11
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_192 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[REG_SEQUENCE]], %subreg.sub2_sub3, [[COPY1]], %subreg.sub4_sub5
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:areg_192 = COPY [[REG_SEQUENCE]]
-    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = COPY [[COPY2]]
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[COPY1]]
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_192 = REG_SEQUENCE [[COPY2]], %subreg.sub0_sub1, [[COPY3]], %subreg.sub2_sub3, [[COPY2]], %subreg.sub4_sub5
+    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = COPY [[REG_SEQUENCE1]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
     %0:sreg_64 = COPY $sgpr8_sgpr9
     %1:sreg_64 = COPY $sgpr10_sgpr11
@@ -418,8 +435,10 @@ body:             |
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr10
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr11
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_192 = REG_SEQUENCE [[COPY1]], %subreg.sub0_sub1, [[COPY2]], %subreg.sub2_sub3, [[COPY2]], %subreg.sub4, [[COPY2]], %subreg.sub5
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_192 = COPY [[REG_SEQUENCE]]
-    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = COPY [[COPY3]]
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[COPY1]]
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[COPY2]]
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_192 = REG_SEQUENCE [[COPY3]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2_sub3, [[COPY4]], %subreg.sub4, [[COPY4]], %subreg.sub5
+    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = COPY [[REG_SEQUENCE1]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5
     %0:sreg_64 = COPY $sgpr8_sgpr9
     %1:sreg_32 = COPY $sgpr10
@@ -492,8 +511,11 @@ body:             |
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr10
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr11
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_192 = REG_SEQUENCE [[COPY]], %subreg.sub3, [[COPY1]], %subreg.sub2, [[COPY2]], %subreg.sub0, [[COPY2]], %subreg.sub1
-    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:areg_192 = COPY [[REG_SEQUENCE]]
-    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY4]]
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+    ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_192 = REG_SEQUENCE [[COPY4]], %subreg.sub3, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub0, [[COPY6]], %subreg.sub1
+    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
     %0:sreg_32 = COPY $sgpr8
     %1:sreg_32 = COPY $sgpr9
@@ -519,8 +541,10 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr8
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr9
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_192 = REG_SEQUENCE [[COPY]], %subreg.sub3, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:areg_192 = COPY [[REG_SEQUENCE]]
-    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY2]]
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_192 = REG_SEQUENCE [[COPY2]], %subreg.sub3, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
     %0:sreg_32 = COPY $sgpr8
     %1:sreg_32 = COPY $sgpr9
@@ -544,9 +568,11 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr8
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr9
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_192 = REG_SEQUENCE [[COPY]], %subreg.sub3, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:areg_192 = COPY [[REG_SEQUENCE]]
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_192 = REG_SEQUENCE [[COPY2]], %subreg.sub3, [[COPY3]], %subreg.sub2, [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
     ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
-    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY2]]
+    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
     %0:sreg_32 = COPY $sgpr8
     %1:sreg_32 = COPY $sgpr9
@@ -573,8 +599,10 @@ body:             |
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr8
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr9
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:areg_128 = COPY [[REG_SEQUENCE]]
-    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY4]]
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
+    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY3]]
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY4]], %subreg.sub2, [[COPY5]], %subreg.sub3
+    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
     %0:vgpr_32 = COPY $vgpr0
     %1:vgpr_32 = COPY $vgpr1
@@ -602,8 +630,11 @@ body:             |
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr9
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr10
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:areg_128 = COPY [[REG_SEQUENCE]]
-    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY4]]
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+    ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY2]]
+    ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY3]]
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY6]], %subreg.sub3
+    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
     %0:vgpr_32 = COPY $vgpr0
     %1:sreg_32 = COPY $sgpr8
@@ -631,8 +662,9 @@ body:             |
     ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr8
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:areg_128 = COPY [[REG_SEQUENCE]]
-    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY4]]
+    ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY3]]
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY4]], %subreg.sub3
+    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
     %0:vgpr_32 = COPY $vgpr0
     %1:vgpr_32 = COPY $vgpr1
@@ -654,8 +686,12 @@ body:             |
     ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
     ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY [[REG_SEQUENCE]]
-    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY]]
+    ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+    ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec
+    ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_2:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec
+    ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_3:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_e64_]], %subreg.sub0, [[V_ACCVGPR_WRITE_B32_e64_1]], %subreg.sub1, [[V_ACCVGPR_WRITE_B32_e64_2]], %subreg.sub2, [[V_ACCVGPR_WRITE_B32_e64_3]], %subreg.sub3
+    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
     %0:sgpr_32 = S_MOV_B32 0
     %1:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
@@ -675,8 +711,11 @@ body:             |
     ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 999
     ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[V_MOV_B32_e32_]], %subreg.sub3
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY [[REG_SEQUENCE]]
-    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY]]
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+    ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec
+    ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[V_ACCVGPR_WRITE_B32_e64_]], %subreg.sub1, [[COPY]], %subreg.sub2, [[V_ACCVGPR_WRITE_B32_e64_1]], %subreg.sub3
+    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
     %0:sgpr_32 = S_MOV_B32 999
     %1:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
@@ -696,8 +735,10 @@ body:             |
     ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 999
     ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY [[REG_SEQUENCE]]
-    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY]]
+    ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[V_ACCVGPR_WRITE_B32_e64_]], %subreg.sub0, [[COPY]], %subreg.sub1, [[COPY]], %subreg.sub2, [[COPY]], %subreg.sub3
+    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
     %0:sgpr_32 = S_MOV_B32 999
     %1:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
@@ -717,8 +758,9 @@ body:             |
     ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 8
     ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 123, implicit $exec
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY [[REG_SEQUENCE]]
-    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[COPY]]
+    ; CHECK-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 8, implicit $exec
+    ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:areg_128 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_]], %subreg.sub1, [[V_MOV_B32_e32_]], %subreg.sub2, [[V_ACCVGPR_WRITE_B32_e64_]], %subreg.sub3
+    ; CHECK-NEXT: $agpr0_agpr1_agpr2_agpr3 = COPY [[REG_SEQUENCE1]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3
     %0:sgpr_32 = S_MOV_B32 8
     %1:vgpr_32 = V_MOV_B32_e32 123, implicit $exec


        


More information about the llvm-commits mailing list