[llvm] [RegisterCoalescer]: Try inflated RC for coalescing reg->subreg (PR #134438)

Jeffrey Byrnes via llvm-commits llvm-commits at lists.llvm.org
Wed Jun 4 10:36:36 PDT 2025


https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/134438

>From b941f831f8fe737a589bb7f443895de8ec94f3d4 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 25 Mar 2025 15:36:55 -0700
Subject: [PATCH 1/3] [RegisterCoalescer]: Try inflated RC for coalescing
 reg->subreg

Change-Id: Id1123431e9fdcabac2811f5b18abff7d66e2d1f3
---
 .../llvm/CodeGen/MachineRegisterInfo.h        |    9 +-
 llvm/lib/CodeGen/MachineRegisterInfo.cpp      |   16 +-
 llvm/lib/CodeGen/RegisterCoalescer.cpp        |    9 +-
 .../coalesce-copy-to-agpr-to-av-registers.mir |  137 +-
 ...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll |  300 +-
 .../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll       | 3266 ++++++++++-------
 .../AMDGPU/mfma-no-register-aliasing.ll       |  706 ++--
 7 files changed, 2559 insertions(+), 1884 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index 8e288cf212360..5f868efde8d9c 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -731,7 +731,14 @@ class MachineRegisterInfo {
 
   /// recomputeRegClass - Try to find a legal super-class of Reg's register
   /// class that still satisfies the constraints from the instructions using
-  /// Reg.  Returns true if Reg was upgraded.
+  /// \p Reg. \p return the super-class TargetRegisterClass if one was found,
+  /// otherwise \p return the original TargetRegisterClass.
+  const TargetRegisterClass *
+  getLargestConstrainedSuperClass(Register Reg) const;
+
+  /// recomputeRegClass - Try to find a legal super-class of Reg's register
+  /// class that still satisfies the constraints from the instructions using
+  /// \p Reg. \p return true if Reg was upgraded.
   ///
   /// This method can be used after constraints have been removed from a
   /// virtual register, for example after removing instructions or splitting
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index 937f63f6c5e00..d483625212c55 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -118,8 +118,8 @@ MachineRegisterInfo::constrainRegAttrs(Register Reg,
   return true;
 }
 
-bool
-MachineRegisterInfo::recomputeRegClass(Register Reg) {
+const TargetRegisterClass *
+MachineRegisterInfo::getLargestConstrainedSuperClass(Register Reg) const {
   const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
   const TargetRegisterClass *OldRC = getRegClass(Reg);
   const TargetRegisterInfo *TRI = getTargetRegisterInfo();
@@ -127,7 +127,7 @@ MachineRegisterInfo::recomputeRegClass(Register Reg) {
 
   // Stop early if there is no room to grow.
   if (NewRC == OldRC)
-    return false;
+    return NewRC;
 
   // Accumulate constraints from all uses.
   for (MachineOperand &MO : reg_nodbg_operands(Reg)) {
@@ -136,8 +136,16 @@ MachineRegisterInfo::recomputeRegClass(Register Reg) {
     unsigned OpNo = &MO - &MI->getOperand(0);
     NewRC = MI->getRegClassConstraintEffect(OpNo, NewRC, TII, TRI);
     if (!NewRC || NewRC == OldRC)
-      return false;
+      return OldRC;
   }
+  return NewRC;
+}
+
+bool MachineRegisterInfo::recomputeRegClass(Register Reg) {
+  const TargetRegisterClass *OldRC = getRegClass(Reg);
+  const TargetRegisterClass *NewRC = getLargestConstrainedSuperClass(Reg);
+  if (NewRC == OldRC)
+    return false;
   setRegClass(Reg, NewRC);
   return true;
 }
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index c27435aa2dae0..15cb68012d0c9 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -477,7 +477,9 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) {
     Flipped = true;
   }
 
-  const MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
+  const MachineFunction *MF = MI->getMF();
+
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
   const TargetRegisterClass *SrcRC = MRI.getRegClass(Src);
 
   if (Dst.isPhysical()) {
@@ -515,6 +517,11 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) {
       // SrcReg will be merged with a sub-register of DstReg.
       SrcIdx = DstSub;
       NewRC = TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub);
+      if (!NewRC) {
+        auto SuperDstRC = MRI.getLargestConstrainedSuperClass(Dst);
+        if (SuperDstRC != DstRC)
+          NewRC = TRI.getMatchingSuperRegClass(SuperDstRC, SrcRC, DstSub);
+      }
     } else if (SrcSub) {
       // DstReg will be merged with a sub-register of SrcReg.
       DstIdx = SrcSub;
diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
index f153b30c80b22..3f1965107b361 100644
--- a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
@@ -1440,11 +1440,9 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr32_to_areg64_coalesce_with_av64_snop
     ; CHECK: liveins: $vgpr0, $vgpr1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64 = COPY [[COPY1]]
-    ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_64 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_64 = COPY $vgpr1
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     %1:vgpr_32 = COPY $vgpr1
@@ -1465,11 +1463,9 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_snop
     ; CHECK: liveins: $vgpr0, $vgpr1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]]
-    ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_64_align2 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_64_align2 = COPY $vgpr1
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     %1:vgpr_32 = COPY $vgpr1
@@ -1490,13 +1486,10 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_snop
     ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96 = COPY [[COPY1]]
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96 = COPY [[COPY2]]
-    ; CHECK-NEXT: S_NOP 0, implicit [[COPY3]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_96 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_96 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:av_96 = COPY $vgpr2
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     %1:vgpr_32 = COPY $vgpr1
@@ -1519,13 +1512,10 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_snop
     ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY1]]
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY2]]
-    ; CHECK-NEXT: S_NOP 0, implicit [[COPY3]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_96_align2 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_96_align2 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:av_96_align2 = COPY $vgpr2
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     %1:vgpr_32 = COPY $vgpr1
@@ -1548,11 +1538,9 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr64_to_areg64_coalesce_with_av128_snop
     ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY1]]
-    ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:av_128 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:av_128 = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vreg_64 = COPY $vgpr0_vgpr1
     %1:vreg_64 = COPY $vgpr2_vgpr3
@@ -1573,11 +1561,9 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_snop
     ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY1]]
-    ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:av_128_align2 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:av_128_align2 = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vreg_64 = COPY $vgpr0_vgpr1
     %1:vreg_64 = COPY $vgpr2_vgpr3
@@ -1623,11 +1609,9 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_snop
     ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
-    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY1]]
-    ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_96 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:av_96 = COPY $vgpr1_vgpr2
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     %1:vreg_64 = COPY $vgpr1_vgpr2
@@ -1648,11 +1632,9 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_snop
     ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
-    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY1]]
-    ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_96_align2 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:av_96_align2 = COPY $vgpr1_vgpr2
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     %1:vreg_64 = COPY $vgpr1_vgpr2
@@ -1673,11 +1655,9 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_snop
     ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96 = COPY [[COPY1]]
-    ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:av_96 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:av_96 = COPY $vgpr2
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vreg_64 = COPY $vgpr0_vgpr1
     %1:vgpr_32 = COPY $vgpr2
@@ -1698,11 +1678,9 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_snop
     ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY1]]
-    ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:av_96_align2 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:av_96_align2 = COPY $vgpr2
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vreg_64 = COPY $vgpr0_vgpr1
     %1:vgpr_32 = COPY $vgpr2
@@ -1723,10 +1701,9 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr32_x2_to_areg64_snop
     ; CHECK: liveins: $vgpr0
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]]
-    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_64 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_64 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     undef %2.sub0:areg_64 = COPY %0
@@ -1746,11 +1723,9 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr32_x2_to_areg64_coalesce_with_av64_align2_snop
     ; CHECK: liveins: $vgpr0, $vgpr1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]]
-    ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_64_align2 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_64_align2 = COPY $vgpr1
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     %1:vgpr_32 = COPY $vgpr1
@@ -1771,10 +1746,9 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96_snop
     ; CHECK: liveins: $vgpr0
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]]
-    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_96 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_96 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     undef %1.sub0:areg_96 = COPY %0
@@ -1794,10 +1768,9 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96_align2_snop
     ; CHECK: liveins: $vgpr0
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_96_align2 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_96_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     undef %1.sub0:areg_96_align2 = COPY %0
@@ -1817,12 +1790,11 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128_snop
     ; CHECK: liveins: $vgpr0
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]]
-    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_128 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_128 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:av_128 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub3:av_128 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     undef %1.sub0:areg_128 = COPY %0
@@ -1844,12 +1816,11 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128_align2_snop
     ; CHECK: liveins: $vgpr0
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_128_align2 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_128_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:av_128_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub3:av_128_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     undef %1.sub0:areg_128_align2 = COPY %0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index dac54c9f85e96..1ab7de7cda935 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -3515,42 +3515,56 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
 ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v24, s0
-; SDAG-NEXT:    v_mov_b32_e32 v25, s1
-; SDAG-NEXT:    v_mov_b32_e32 v26, s2
-; SDAG-NEXT:    v_mov_b32_e32 v27, s3
-; SDAG-NEXT:    v_mov_b32_e32 v28, s16
-; SDAG-NEXT:    v_mov_b32_e32 v29, s17
-; SDAG-NEXT:    v_mov_b32_e32 v30, s18
-; SDAG-NEXT:    v_mov_b32_e32 v31, s19
-; SDAG-NEXT:    v_mov_b32_e32 v32, s28
-; SDAG-NEXT:    v_mov_b32_e32 v33, s29
-; SDAG-NEXT:    v_mov_b32_e32 v16, s20
-; SDAG-NEXT:    v_mov_b32_e32 v17, s21
-; SDAG-NEXT:    v_mov_b32_e32 v18, s22
-; SDAG-NEXT:    v_mov_b32_e32 v19, s23
-; SDAG-NEXT:    v_mov_b32_e32 v20, s24
-; SDAG-NEXT:    v_mov_b32_e32 v21, s25
-; SDAG-NEXT:    v_mov_b32_e32 v22, s26
-; SDAG-NEXT:    v_mov_b32_e32 v23, s27
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v32
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v33
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v0
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v1
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v2
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v3
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v4
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v5
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v6
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v7
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v8
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v9
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v10
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v11
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v13
+; SDAG-NEXT:    v_mov_b32_e32 v32, s0
+; SDAG-NEXT:    v_mov_b32_e32 v33, s1
+; SDAG-NEXT:    v_mov_b32_e32 v34, s2
+; SDAG-NEXT:    v_mov_b32_e32 v35, s3
+; SDAG-NEXT:    v_mov_b32_e32 v36, s16
+; SDAG-NEXT:    v_mov_b32_e32 v37, s17
+; SDAG-NEXT:    v_mov_b32_e32 v38, s18
+; SDAG-NEXT:    v_mov_b32_e32 v39, s19
+; SDAG-NEXT:    v_mov_b32_e32 v16, s28
+; SDAG-NEXT:    v_mov_b32_e32 v31, v13
+; SDAG-NEXT:    v_mov_b32_e32 v30, v12
+; SDAG-NEXT:    v_mov_b32_e32 v29, v11
+; SDAG-NEXT:    v_mov_b32_e32 v28, v10
+; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v26, v8
+; SDAG-NEXT:    v_mov_b32_e32 v25, v7
+; SDAG-NEXT:    v_mov_b32_e32 v24, v6
+; SDAG-NEXT:    v_mov_b32_e32 v23, v5
+; SDAG-NEXT:    v_mov_b32_e32 v22, v4
+; SDAG-NEXT:    v_mov_b32_e32 v21, v3
+; SDAG-NEXT:    v_mov_b32_e32 v20, v2
+; SDAG-NEXT:    v_mov_b32_e32 v19, v1
+; SDAG-NEXT:    v_mov_b32_e32 v18, v0
+; SDAG-NEXT:    v_mov_b32_e32 v17, s29
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    v_mov_b32_e32 v0, s20
+; SDAG-NEXT:    v_mov_b32_e32 v1, s21
+; SDAG-NEXT:    v_mov_b32_e32 v2, s22
+; SDAG-NEXT:    v_mov_b32_e32 v3, s23
+; SDAG-NEXT:    v_mov_b32_e32 v4, s24
+; SDAG-NEXT:    v_mov_b32_e32 v5, s25
+; SDAG-NEXT:    v_mov_b32_e32 v6, s26
+; SDAG-NEXT:    v_mov_b32_e32 v7, s27
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v31
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[24:31], v[16:23], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
@@ -3579,34 +3593,48 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
 ; GISEL-NEXT:    s_mov_b32 s13, s1
 ; GISEL-NEXT:    s_mov_b32 s14, s2
 ; GISEL-NEXT:    s_mov_b32 s15, s3
-; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[12:13]
-; GISEL-NEXT:    v_mov_b32_e32 v32, s28
-; GISEL-NEXT:    v_mov_b32_e32 v33, s29
-; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[26:27]
-; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[20:21]
-; GISEL-NEXT:    v_accvgpr_write_b32 a0, v32
-; GISEL-NEXT:    v_accvgpr_write_b32 a1, v33
-; GISEL-NEXT:    v_accvgpr_write_b32 a2, v0
-; GISEL-NEXT:    v_accvgpr_write_b32 a3, v1
-; GISEL-NEXT:    v_accvgpr_write_b32 a4, v2
-; GISEL-NEXT:    v_accvgpr_write_b32 a5, v3
-; GISEL-NEXT:    v_accvgpr_write_b32 a6, v4
-; GISEL-NEXT:    v_accvgpr_write_b32 a7, v5
-; GISEL-NEXT:    v_accvgpr_write_b32 a8, v6
-; GISEL-NEXT:    v_accvgpr_write_b32 a9, v7
-; GISEL-NEXT:    v_accvgpr_write_b32 a10, v8
-; GISEL-NEXT:    v_accvgpr_write_b32 a11, v9
-; GISEL-NEXT:    v_accvgpr_write_b32 a12, v10
-; GISEL-NEXT:    v_accvgpr_write_b32 a13, v11
-; GISEL-NEXT:    v_accvgpr_write_b32 a14, v12
-; GISEL-NEXT:    v_accvgpr_write_b32 a15, v13
+; GISEL-NEXT:    v_mov_b32_e32 v18, v0
+; GISEL-NEXT:    v_mov_b32_e32 v19, v1
+; GISEL-NEXT:    v_mov_b32_e32 v20, v2
+; GISEL-NEXT:    v_mov_b32_e32 v21, v3
+; GISEL-NEXT:    v_mov_b32_e32 v22, v4
+; GISEL-NEXT:    v_mov_b32_e32 v23, v5
+; GISEL-NEXT:    v_mov_b32_e32 v24, v6
+; GISEL-NEXT:    v_mov_b32_e32 v25, v7
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[18:19]
+; GISEL-NEXT:    v_mov_b32_e32 v16, s28
+; GISEL-NEXT:    v_mov_b32_e32 v26, v8
+; GISEL-NEXT:    v_mov_b32_e32 v27, v9
+; GISEL-NEXT:    v_mov_b32_e32 v28, v10
+; GISEL-NEXT:    v_mov_b32_e32 v29, v11
+; GISEL-NEXT:    v_mov_b32_e32 v30, v12
+; GISEL-NEXT:    v_mov_b32_e32 v31, v13
+; GISEL-NEXT:    v_mov_b32_e32 v17, s29
+; GISEL-NEXT:    v_mov_b64_e32 v[38:39], s[26:27]
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[20:21]
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v31
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[24:31], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[32:39], a[0:15], v14, v15 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 3
@@ -3843,6 +3871,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
 ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
 ; SDAG-NEXT:    v_mov_b32_e32 v26, s0
 ; SDAG-NEXT:    v_mov_b32_e32 v27, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v28, s2
@@ -3851,7 +3880,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
 ; SDAG-NEXT:    v_mov_b32_e32 v31, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v32, s18
 ; SDAG-NEXT:    v_mov_b32_e32 v33, s19
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
 ; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
 ; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
@@ -3898,10 +3926,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
 ; GISEL-NEXT:    s_mov_b32 s14, s2
 ; GISEL-NEXT:    s_mov_b32 s15, s3
 ; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[18:19]
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
 ; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[16:17]
 ; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[14:15]
 ; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[12:13]
-; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
 ; GISEL-NEXT:    v_accvgpr_write_b32 a1, v9
 ; GISEL-NEXT:    v_accvgpr_write_b32 a2, v10
 ; GISEL-NEXT:    v_accvgpr_write_b32 a3, v11
@@ -3993,42 +4021,48 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
 ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v16, s0
-; SDAG-NEXT:    v_mov_b32_e32 v17, s1
-; SDAG-NEXT:    v_mov_b32_e32 v18, s2
-; SDAG-NEXT:    v_mov_b32_e32 v19, s3
-; SDAG-NEXT:    v_mov_b32_e32 v20, s16
-; SDAG-NEXT:    v_mov_b32_e32 v21, s17
-; SDAG-NEXT:    v_mov_b32_e32 v22, s18
-; SDAG-NEXT:    v_mov_b32_e32 v23, s19
-; SDAG-NEXT:    v_mov_b32_e32 v24, s20
-; SDAG-NEXT:    v_mov_b32_e32 v25, s21
-; SDAG-NEXT:    v_mov_b32_e32 v26, s22
-; SDAG-NEXT:    v_mov_b32_e32 v27, s23
-; SDAG-NEXT:    v_mov_b32_e32 v28, s24
-; SDAG-NEXT:    v_mov_b32_e32 v29, s25
-; SDAG-NEXT:    v_mov_b32_e32 v30, s26
-; SDAG-NEXT:    v_mov_b32_e32 v31, s27
-; SDAG-NEXT:    v_mov_b32_e32 v32, s28
-; SDAG-NEXT:    v_mov_b32_e32 v33, s29
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v27
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v28
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v29
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v30
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v31
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v32
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v33
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v8
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v9
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v10
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v11
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v13
+; SDAG-NEXT:    v_mov_b32_e32 v32, s0
+; SDAG-NEXT:    v_mov_b32_e32 v33, s1
+; SDAG-NEXT:    v_mov_b32_e32 v34, s2
+; SDAG-NEXT:    v_mov_b32_e32 v35, s3
+; SDAG-NEXT:    v_mov_b32_e32 v36, s16
+; SDAG-NEXT:    v_mov_b32_e32 v37, s17
+; SDAG-NEXT:    v_mov_b32_e32 v38, s18
+; SDAG-NEXT:    v_mov_b32_e32 v39, s19
+; SDAG-NEXT:    v_mov_b32_e32 v16, s20
+; SDAG-NEXT:    v_mov_b32_e32 v31, v13
+; SDAG-NEXT:    v_mov_b32_e32 v30, v12
+; SDAG-NEXT:    v_mov_b32_e32 v29, v11
+; SDAG-NEXT:    v_mov_b32_e32 v28, v10
+; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v26, v8
+; SDAG-NEXT:    v_mov_b32_e32 v17, s21
+; SDAG-NEXT:    v_mov_b32_e32 v18, s22
+; SDAG-NEXT:    v_mov_b32_e32 v19, s23
+; SDAG-NEXT:    v_mov_b32_e32 v20, s24
+; SDAG-NEXT:    v_mov_b32_e32 v21, s25
+; SDAG-NEXT:    v_mov_b32_e32 v22, s26
+; SDAG-NEXT:    v_mov_b32_e32 v23, s27
+; SDAG-NEXT:    v_mov_b32_e32 v24, s28
+; SDAG-NEXT:    v_mov_b32_e32 v25, s29
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v31
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
@@ -4057,38 +4091,44 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
 ; GISEL-NEXT:    s_mov_b32 s13, s1
 ; GISEL-NEXT:    s_mov_b32 s14, s2
 ; GISEL-NEXT:    s_mov_b32 s15, s3
-; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[12:13]
-; GISEL-NEXT:    v_mov_b32_e32 v24, s20
-; GISEL-NEXT:    v_mov_b32_e32 v25, s21
-; GISEL-NEXT:    v_mov_b32_e32 v26, s22
-; GISEL-NEXT:    v_mov_b32_e32 v27, s23
-; GISEL-NEXT:    v_mov_b32_e32 v28, s24
-; GISEL-NEXT:    v_mov_b32_e32 v29, s25
-; GISEL-NEXT:    v_mov_b32_e32 v30, s26
-; GISEL-NEXT:    v_mov_b32_e32 v31, s27
-; GISEL-NEXT:    v_mov_b32_e32 v32, s28
-; GISEL-NEXT:    v_mov_b32_e32 v33, s29
-; GISEL-NEXT:    v_accvgpr_write_b32 a0, v24
-; GISEL-NEXT:    v_accvgpr_write_b32 a1, v25
-; GISEL-NEXT:    v_accvgpr_write_b32 a2, v26
-; GISEL-NEXT:    v_accvgpr_write_b32 a3, v27
-; GISEL-NEXT:    v_accvgpr_write_b32 a4, v28
-; GISEL-NEXT:    v_accvgpr_write_b32 a5, v29
-; GISEL-NEXT:    v_accvgpr_write_b32 a6, v30
-; GISEL-NEXT:    v_accvgpr_write_b32 a7, v31
-; GISEL-NEXT:    v_accvgpr_write_b32 a8, v32
-; GISEL-NEXT:    v_accvgpr_write_b32 a9, v33
-; GISEL-NEXT:    v_accvgpr_write_b32 a10, v8
-; GISEL-NEXT:    v_accvgpr_write_b32 a11, v9
-; GISEL-NEXT:    v_accvgpr_write_b32 a12, v10
-; GISEL-NEXT:    v_accvgpr_write_b32 a13, v11
-; GISEL-NEXT:    v_accvgpr_write_b32 a14, v12
-; GISEL-NEXT:    v_accvgpr_write_b32 a15, v13
+; GISEL-NEXT:    v_mov_b64_e32 v[38:39], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[12:13]
+; GISEL-NEXT:    v_mov_b32_e32 v16, s20
+; GISEL-NEXT:    v_mov_b32_e32 v26, v8
+; GISEL-NEXT:    v_mov_b32_e32 v27, v9
+; GISEL-NEXT:    v_mov_b32_e32 v28, v10
+; GISEL-NEXT:    v_mov_b32_e32 v29, v11
+; GISEL-NEXT:    v_mov_b32_e32 v30, v12
+; GISEL-NEXT:    v_mov_b32_e32 v31, v13
+; GISEL-NEXT:    v_mov_b32_e32 v17, s21
+; GISEL-NEXT:    v_mov_b32_e32 v18, s22
+; GISEL-NEXT:    v_mov_b32_e32 v19, s23
+; GISEL-NEXT:    v_mov_b32_e32 v20, s24
+; GISEL-NEXT:    v_mov_b32_e32 v21, s25
+; GISEL-NEXT:    v_mov_b32_e32 v22, s26
+; GISEL-NEXT:    v_mov_b32_e32 v23, s27
+; GISEL-NEXT:    v_mov_b32_e32 v24, s28
+; GISEL-NEXT:    v_mov_b32_e32 v25, s29
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v31
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 3
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 77d4aad5f3174..08167cd226a4c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -72,46 +72,94 @@ bb:
 }
 
 define <4 x float> @test_smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x64_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x64_f16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x64_f16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x64_f16__flags0(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x64_f16__flags0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__flags0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__flags0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x64_f16__flags1(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x64_f16__flags1:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__flags1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__flags1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <4 x float> %result
 }
@@ -252,25 +300,42 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half>
 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16:
@@ -288,22 +353,14 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half>
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    v_mov_b32_e32 v4, v16
-; GISEL-NEXT:    v_mov_b32_e32 v5, v17
-; GISEL-NEXT:    v_mov_b32_e32 v6, v18
-; GISEL-NEXT:    v_mov_b32_e32 v7, v19
-; GISEL-NEXT:    v_mov_b32_e32 v8, v20
-; GISEL-NEXT:    v_mov_b32_e32 v9, v21
-; GISEL-NEXT:    v_mov_b32_e32 v10, v22
-; GISEL-NEXT:    v_mov_b32_e32 v11, v23
-; GISEL-NEXT:    v_mov_b32_e32 v12, v24
-; GISEL-NEXT:    v_mov_b32_e32 v13, v25
-; GISEL-NEXT:    v_mov_b32_e32 v14, v26
-; GISEL-NEXT:    v_mov_b32_e32 v15, v27
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -315,25 +372,42 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16
 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
@@ -351,22 +425,14 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    v_mov_b32_e32 v4, v16
-; GISEL-NEXT:    v_mov_b32_e32 v5, v17
-; GISEL-NEXT:    v_mov_b32_e32 v6, v18
-; GISEL-NEXT:    v_mov_b32_e32 v7, v19
-; GISEL-NEXT:    v_mov_b32_e32 v8, v20
-; GISEL-NEXT:    v_mov_b32_e32 v9, v21
-; GISEL-NEXT:    v_mov_b32_e32 v10, v22
-; GISEL-NEXT:    v_mov_b32_e32 v11, v23
-; GISEL-NEXT:    v_mov_b32_e32 v12, v24
-; GISEL-NEXT:    v_mov_b32_e32 v13, v25
-; GISEL-NEXT:    v_mov_b32_e32 v14, v26
-; GISEL-NEXT:    v_mov_b32_e32 v15, v27
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -378,25 +444,42 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16
 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
@@ -414,22 +497,14 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    v_mov_b32_e32 v4, v16
-; GISEL-NEXT:    v_mov_b32_e32 v5, v17
-; GISEL-NEXT:    v_mov_b32_e32 v6, v18
-; GISEL-NEXT:    v_mov_b32_e32 v7, v19
-; GISEL-NEXT:    v_mov_b32_e32 v8, v20
-; GISEL-NEXT:    v_mov_b32_e32 v9, v21
-; GISEL-NEXT:    v_mov_b32_e32 v10, v22
-; GISEL-NEXT:    v_mov_b32_e32 v11, v23
-; GISEL-NEXT:    v_mov_b32_e32 v12, v24
-; GISEL-NEXT:    v_mov_b32_e32 v13, v25
-; GISEL-NEXT:    v_mov_b32_e32 v14, v26
-; GISEL-NEXT:    v_mov_b32_e32 v15, v27
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -441,94 +516,108 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0,
 ; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v36, s0
-; SDAG-NEXT:    v_mov_b32_e32 v37, s1
-; SDAG-NEXT:    v_mov_b32_e32 v38, s2
-; SDAG-NEXT:    v_mov_b32_e32 v39, s3
+; SDAG-NEXT:    v_mov_b32_e32 v28, s0
+; SDAG-NEXT:    v_mov_b32_e32 v29, s1
+; SDAG-NEXT:    v_mov_b32_e32 v30, s2
+; SDAG-NEXT:    v_mov_b32_e32 v31, s3
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
+; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v26, v8
+; SDAG-NEXT:    v_mov_b32_e32 v25, v7
+; SDAG-NEXT:    v_mov_b32_e32 v24, v6
+; SDAG-NEXT:    v_mov_b32_e32 v23, v5
+; SDAG-NEXT:    v_mov_b32_e32 v22, v4
+; SDAG-NEXT:    v_mov_b32_e32 v21, v3
+; SDAG-NEXT:    v_mov_b32_e32 v20, v2
+; SDAG-NEXT:    v_mov_b32_e32 v19, v1
+; SDAG-NEXT:    v_mov_b32_e32 v18, v0
 ; SDAG-NEXT:    v_mov_b32_e32 v13, s25
 ; SDAG-NEXT:    v_mov_b32_e32 v14, s26
 ; SDAG-NEXT:    v_mov_b32_e32 v15, s27
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s28
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_mov_b32_e32 v28, s16
-; SDAG-NEXT:    v_mov_b32_e32 v29, s17
-; SDAG-NEXT:    v_mov_b32_e32 v30, s18
-; SDAG-NEXT:    v_mov_b32_e32 v31, s19
-; SDAG-NEXT:    v_mov_b32_e32 v32, s20
-; SDAG-NEXT:    v_mov_b32_e32 v33, s21
-; SDAG-NEXT:    v_mov_b32_e32 v34, s22
-; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v0, s16
+; SDAG-NEXT:    v_mov_b32_e32 v1, s17
+; SDAG-NEXT:    v_mov_b32_e32 v2, s18
+; SDAG-NEXT:    v_mov_b32_e32 v3, s19
+; SDAG-NEXT:    v_mov_b32_e32 v4, s20
+; SDAG-NEXT:    v_mov_b32_e32 v5, s21
+; SDAG-NEXT:    v_mov_b32_e32 v6, s22
+; SDAG-NEXT:    v_mov_b32_e32 v7, s23
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 a[0:15], v[28:31], v[0:7], v10
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[2:3]
-; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v11, v0
-; GISEL-NEXT:    v_mov_b32_e32 v12, v1
-; GISEL-NEXT:    v_mov_b32_e32 v13, v2
-; GISEL-NEXT:    v_mov_b32_e32 v14, v3
-; GISEL-NEXT:    v_mov_b32_e32 v15, v4
-; GISEL-NEXT:    v_mov_b32_e32 v16, v5
-; GISEL-NEXT:    v_mov_b32_e32 v17, v6
-; GISEL-NEXT:    v_mov_b32_e32 v18, v7
-; GISEL-NEXT:    v_mov_b32_e32 v19, v8
-; GISEL-NEXT:    v_mov_b32_e32 v20, v9
-; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[22:23]
-; GISEL-NEXT:    v_mov_b32_e32 v21, v10
-; GISEL-NEXT:    v_mov_b32_e32 v0, s24
-; GISEL-NEXT:    v_mov_b32_e32 v1, s25
-; GISEL-NEXT:    v_mov_b32_e32 v2, s26
-; GISEL-NEXT:    v_mov_b32_e32 v3, s27
-; GISEL-NEXT:    v_mov_b32_e32 v4, s28
-; GISEL-NEXT:    v_mov_b32_e32 v5, s29
-; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[16:17]
-; GISEL-NEXT:    v_mov_b32_e32 v6, v11
-; GISEL-NEXT:    v_mov_b32_e32 v7, v12
-; GISEL-NEXT:    v_mov_b32_e32 v8, v13
-; GISEL-NEXT:    v_mov_b32_e32 v9, v14
-; GISEL-NEXT:    v_mov_b32_e32 v10, v15
-; GISEL-NEXT:    v_mov_b32_e32 v11, v16
-; GISEL-NEXT:    v_mov_b32_e32 v12, v17
-; GISEL-NEXT:    v_mov_b32_e32 v13, v18
-; GISEL-NEXT:    v_mov_b32_e32 v14, v19
-; GISEL-NEXT:    v_mov_b32_e32 v15, v20
+; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[2:3]
+; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v18, s24
+; GISEL-NEXT:    v_mov_b32_e32 v19, s25
+; GISEL-NEXT:    v_mov_b32_e32 v24, v0
+; GISEL-NEXT:    v_mov_b32_e32 v25, v1
+; GISEL-NEXT:    v_mov_b32_e32 v26, v2
+; GISEL-NEXT:    v_mov_b32_e32 v27, v3
+; GISEL-NEXT:    v_mov_b32_e32 v28, v4
+; GISEL-NEXT:    v_mov_b32_e32 v29, v5
+; GISEL-NEXT:    v_mov_b32_e32 v30, v6
+; GISEL-NEXT:    v_mov_b32_e32 v31, v7
+; GISEL-NEXT:    v_mov_b32_e32 v32, v8
+; GISEL-NEXT:    v_mov_b32_e32 v33, v9
+; GISEL-NEXT:    v_mov_b32_e32 v16, v10
+; GISEL-NEXT:    v_mov_b32_e32 v20, s26
+; GISEL-NEXT:    v_mov_b32_e32 v21, s27
+; GISEL-NEXT:    v_mov_b32_e32 v22, s28
+; GISEL-NEXT:    v_mov_b32_e32 v23, s29
+; GISEL-NEXT:    v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[26:27]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[28:29]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[30:31]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[32:33]
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x32_f16 v[0:15], v[30:33], v[22:29], v21
+; GISEL-NEXT:    v_smfmac_f32_32x32x32_f16 v[0:15], v[34:37], v[48:55], v16
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
@@ -579,12 +668,17 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16(<8 x bfloat> %arg0, <16 x bflo
 ; GCN-LABEL: test_smfmac_f32_16x16x64_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <4 x float> %result
@@ -594,12 +688,17 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags0(<8 x bfloat> %arg0, <1
 ; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__flags0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <4 x float> %result
@@ -609,12 +708,17 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags1(<8 x bfloat> %arg0, <1
 ; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__flags1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <4 x float> %result
@@ -705,25 +809,42 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfl
 ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    v_mov_b32_e32 v4, v16
-; GCN-NEXT:    v_mov_b32_e32 v5, v17
-; GCN-NEXT:    v_mov_b32_e32 v6, v18
-; GCN-NEXT:    v_mov_b32_e32 v7, v19
-; GCN-NEXT:    v_mov_b32_e32 v8, v20
-; GCN-NEXT:    v_mov_b32_e32 v9, v21
-; GCN-NEXT:    v_mov_b32_e32 v10, v22
-; GCN-NEXT:    v_mov_b32_e32 v11, v23
-; GCN-NEXT:    v_mov_b32_e32 v12, v24
-; GCN-NEXT:    v_mov_b32_e32 v13, v25
-; GCN-NEXT:    v_mov_b32_e32 v14, v26
-; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
+; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
+; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
+; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
+; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
+; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
+; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
+; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
+; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
+; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
+; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
+; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
@@ -733,25 +854,42 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, <
 ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    v_mov_b32_e32 v4, v16
-; GCN-NEXT:    v_mov_b32_e32 v5, v17
-; GCN-NEXT:    v_mov_b32_e32 v6, v18
-; GCN-NEXT:    v_mov_b32_e32 v7, v19
-; GCN-NEXT:    v_mov_b32_e32 v8, v20
-; GCN-NEXT:    v_mov_b32_e32 v9, v21
-; GCN-NEXT:    v_mov_b32_e32 v10, v22
-; GCN-NEXT:    v_mov_b32_e32 v11, v23
-; GCN-NEXT:    v_mov_b32_e32 v12, v24
-; GCN-NEXT:    v_mov_b32_e32 v13, v25
-; GCN-NEXT:    v_mov_b32_e32 v14, v26
-; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
+; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
+; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
+; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
+; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
+; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
+; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
+; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
+; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
+; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
+; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
+; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <16 x float> %result
@@ -761,25 +899,42 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, <
 ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
+; GCN-NEXT:    s_nop 1
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    v_mov_b32_e32 v4, v16
-; GCN-NEXT:    v_mov_b32_e32 v5, v17
-; GCN-NEXT:    v_mov_b32_e32 v6, v18
-; GCN-NEXT:    v_mov_b32_e32 v7, v19
-; GCN-NEXT:    v_mov_b32_e32 v8, v20
-; GCN-NEXT:    v_mov_b32_e32 v9, v21
-; GCN-NEXT:    v_mov_b32_e32 v10, v22
-; GCN-NEXT:    v_mov_b32_e32 v11, v23
-; GCN-NEXT:    v_mov_b32_e32 v12, v24
-; GCN-NEXT:    v_mov_b32_e32 v13, v25
-; GCN-NEXT:    v_mov_b32_e32 v14, v26
-; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
+; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
+; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
+; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
+; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
+; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
+; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
+; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
+; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
+; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
+; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
+; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <16 x float> %result
@@ -789,54 +944,70 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg
 ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__sgpr:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v36, s0
-; GCN-NEXT:    v_mov_b32_e32 v37, s1
-; GCN-NEXT:    v_mov_b32_e32 v38, s2
-; GCN-NEXT:    v_mov_b32_e32 v39, s3
+; GCN-NEXT:    v_mov_b32_e32 v28, s0
+; GCN-NEXT:    v_mov_b32_e32 v29, s1
+; GCN-NEXT:    v_mov_b32_e32 v30, s2
+; GCN-NEXT:    v_mov_b32_e32 v31, s3
+; GCN-NEXT:    v_mov_b32_e32 v12, s24
+; GCN-NEXT:    v_mov_b32_e32 v27, v9
+; GCN-NEXT:    v_mov_b32_e32 v26, v8
+; GCN-NEXT:    v_mov_b32_e32 v25, v7
+; GCN-NEXT:    v_mov_b32_e32 v24, v6
+; GCN-NEXT:    v_mov_b32_e32 v23, v5
+; GCN-NEXT:    v_mov_b32_e32 v22, v4
+; GCN-NEXT:    v_mov_b32_e32 v21, v3
+; GCN-NEXT:    v_mov_b32_e32 v20, v2
+; GCN-NEXT:    v_mov_b32_e32 v19, v1
+; GCN-NEXT:    v_mov_b32_e32 v18, v0
 ; GCN-NEXT:    v_mov_b32_e32 v13, s25
 ; GCN-NEXT:    v_mov_b32_e32 v14, s26
 ; GCN-NEXT:    v_mov_b32_e32 v15, s27
 ; GCN-NEXT:    v_mov_b32_e32 v16, s28
 ; GCN-NEXT:    v_mov_b32_e32 v17, s29
-; GCN-NEXT:    v_mov_b32_e32 v28, s16
-; GCN-NEXT:    v_mov_b32_e32 v29, s17
-; GCN-NEXT:    v_mov_b32_e32 v30, s18
-; GCN-NEXT:    v_mov_b32_e32 v31, s19
-; GCN-NEXT:    v_mov_b32_e32 v32, s20
-; GCN-NEXT:    v_mov_b32_e32 v33, s21
-; GCN-NEXT:    v_mov_b32_e32 v34, s22
-; GCN-NEXT:    v_mov_b32_e32 v35, s23
-; GCN-NEXT:    v_mov_b32_e32 v12, s24
-; GCN-NEXT:    v_mov_b32_e32 v18, v0
-; GCN-NEXT:    v_mov_b32_e32 v19, v1
-; GCN-NEXT:    v_mov_b32_e32 v20, v2
-; GCN-NEXT:    v_mov_b32_e32 v21, v3
-; GCN-NEXT:    v_mov_b32_e32 v22, v4
-; GCN-NEXT:    v_mov_b32_e32 v23, v5
-; GCN-NEXT:    v_mov_b32_e32 v24, v6
-; GCN-NEXT:    v_mov_b32_e32 v25, v7
-; GCN-NEXT:    v_mov_b32_e32 v26, v8
-; GCN-NEXT:    v_mov_b32_e32 v27, v9
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
+; GCN-NEXT:    v_mov_b32_e32 v0, s16
+; GCN-NEXT:    v_mov_b32_e32 v1, s17
+; GCN-NEXT:    v_mov_b32_e32 v2, s18
+; GCN-NEXT:    v_mov_b32_e32 v3, s19
+; GCN-NEXT:    v_mov_b32_e32 v4, s20
+; GCN-NEXT:    v_mov_b32_e32 v5, s21
+; GCN-NEXT:    v_mov_b32_e32 v6, s22
+; GCN-NEXT:    v_mov_b32_e32 v7, s23
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
+; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
 ; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[36:39], v[28:35], v10
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[28:31], v[0:7], v10
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    v_mov_b32_e32 v4, v16
-; GCN-NEXT:    v_mov_b32_e32 v5, v17
-; GCN-NEXT:    v_mov_b32_e32 v6, v18
-; GCN-NEXT:    v_mov_b32_e32 v7, v19
-; GCN-NEXT:    v_mov_b32_e32 v8, v20
-; GCN-NEXT:    v_mov_b32_e32 v9, v21
-; GCN-NEXT:    v_mov_b32_e32 v10, v22
-; GCN-NEXT:    v_mov_b32_e32 v11, v23
-; GCN-NEXT:    v_mov_b32_e32 v12, v24
-; GCN-NEXT:    v_mov_b32_e32 v13, v25
-; GCN-NEXT:    v_mov_b32_e32 v14, v26
-; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
+; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
+; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
+; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
+; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
+; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
+; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
+; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
+; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
+; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
+; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
+; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
+; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
+; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
+; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
@@ -916,46 +1087,94 @@ bb:
 }
 
 define <4 x i32> @test_smfmac_i32_16x16x128_i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_i32_16x16x128_i8:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_i32_16x16x128_i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_i32_16x16x128_i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <4 x i32> %result
 }
 
 define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_i32_16x16x128_i8__flags0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__flags0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__flags0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <4 x i32> %result
 }
 
 define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_i32_16x16x128_i8__flags1:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__flags1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__flags1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <4 x i32> %result
 }
@@ -1102,25 +1321,42 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1,
 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8:
@@ -1138,22 +1374,14 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1,
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    v_mov_b32_e32 v4, v16
-; GISEL-NEXT:    v_mov_b32_e32 v5, v17
-; GISEL-NEXT:    v_mov_b32_e32 v6, v18
-; GISEL-NEXT:    v_mov_b32_e32 v7, v19
-; GISEL-NEXT:    v_mov_b32_e32 v8, v20
-; GISEL-NEXT:    v_mov_b32_e32 v9, v21
-; GISEL-NEXT:    v_mov_b32_e32 v10, v22
-; GISEL-NEXT:    v_mov_b32_e32 v11, v23
-; GISEL-NEXT:    v_mov_b32_e32 v12, v24
-; GISEL-NEXT:    v_mov_b32_e32 v13, v25
-; GISEL-NEXT:    v_mov_b32_e32 v14, v26
-; GISEL-NEXT:    v_mov_b32_e32 v15, v27
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1165,25 +1393,42 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32
 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
@@ -1201,22 +1446,14 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    v_mov_b32_e32 v4, v16
-; GISEL-NEXT:    v_mov_b32_e32 v5, v17
-; GISEL-NEXT:    v_mov_b32_e32 v6, v18
-; GISEL-NEXT:    v_mov_b32_e32 v7, v19
-; GISEL-NEXT:    v_mov_b32_e32 v8, v20
-; GISEL-NEXT:    v_mov_b32_e32 v9, v21
-; GISEL-NEXT:    v_mov_b32_e32 v10, v22
-; GISEL-NEXT:    v_mov_b32_e32 v11, v23
-; GISEL-NEXT:    v_mov_b32_e32 v12, v24
-; GISEL-NEXT:    v_mov_b32_e32 v13, v25
-; GISEL-NEXT:    v_mov_b32_e32 v14, v26
-; GISEL-NEXT:    v_mov_b32_e32 v15, v27
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1228,25 +1465,42 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32
 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
@@ -1264,22 +1518,14 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    v_mov_b32_e32 v4, v16
-; GISEL-NEXT:    v_mov_b32_e32 v5, v17
-; GISEL-NEXT:    v_mov_b32_e32 v6, v18
-; GISEL-NEXT:    v_mov_b32_e32 v7, v19
-; GISEL-NEXT:    v_mov_b32_e32 v8, v20
-; GISEL-NEXT:    v_mov_b32_e32 v9, v21
-; GISEL-NEXT:    v_mov_b32_e32 v10, v22
-; GISEL-NEXT:    v_mov_b32_e32 v11, v23
-; GISEL-NEXT:    v_mov_b32_e32 v12, v24
-; GISEL-NEXT:    v_mov_b32_e32 v13, v25
-; GISEL-NEXT:    v_mov_b32_e32 v14, v26
-; GISEL-NEXT:    v_mov_b32_e32 v15, v27
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1291,94 +1537,108 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x
 ; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v36, s0
-; SDAG-NEXT:    v_mov_b32_e32 v37, s1
-; SDAG-NEXT:    v_mov_b32_e32 v38, s2
-; SDAG-NEXT:    v_mov_b32_e32 v39, s3
+; SDAG-NEXT:    v_mov_b32_e32 v28, s0
+; SDAG-NEXT:    v_mov_b32_e32 v29, s1
+; SDAG-NEXT:    v_mov_b32_e32 v30, s2
+; SDAG-NEXT:    v_mov_b32_e32 v31, s3
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
+; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v26, v8
+; SDAG-NEXT:    v_mov_b32_e32 v25, v7
+; SDAG-NEXT:    v_mov_b32_e32 v24, v6
+; SDAG-NEXT:    v_mov_b32_e32 v23, v5
+; SDAG-NEXT:    v_mov_b32_e32 v22, v4
+; SDAG-NEXT:    v_mov_b32_e32 v21, v3
+; SDAG-NEXT:    v_mov_b32_e32 v20, v2
+; SDAG-NEXT:    v_mov_b32_e32 v19, v1
+; SDAG-NEXT:    v_mov_b32_e32 v18, v0
 ; SDAG-NEXT:    v_mov_b32_e32 v13, s25
 ; SDAG-NEXT:    v_mov_b32_e32 v14, s26
 ; SDAG-NEXT:    v_mov_b32_e32 v15, s27
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s28
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_mov_b32_e32 v28, s16
-; SDAG-NEXT:    v_mov_b32_e32 v29, s17
-; SDAG-NEXT:    v_mov_b32_e32 v30, s18
-; SDAG-NEXT:    v_mov_b32_e32 v31, s19
-; SDAG-NEXT:    v_mov_b32_e32 v32, s20
-; SDAG-NEXT:    v_mov_b32_e32 v33, s21
-; SDAG-NEXT:    v_mov_b32_e32 v34, s22
-; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v0, s16
+; SDAG-NEXT:    v_mov_b32_e32 v1, s17
+; SDAG-NEXT:    v_mov_b32_e32 v2, s18
+; SDAG-NEXT:    v_mov_b32_e32 v3, s19
+; SDAG-NEXT:    v_mov_b32_e32 v4, s20
+; SDAG-NEXT:    v_mov_b32_e32 v5, s21
+; SDAG-NEXT:    v_mov_b32_e32 v6, s22
+; SDAG-NEXT:    v_mov_b32_e32 v7, s23
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 a[0:15], v[28:31], v[0:7], v10
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[2:3]
-; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v11, v0
-; GISEL-NEXT:    v_mov_b32_e32 v12, v1
-; GISEL-NEXT:    v_mov_b32_e32 v13, v2
-; GISEL-NEXT:    v_mov_b32_e32 v14, v3
-; GISEL-NEXT:    v_mov_b32_e32 v15, v4
-; GISEL-NEXT:    v_mov_b32_e32 v16, v5
-; GISEL-NEXT:    v_mov_b32_e32 v17, v6
-; GISEL-NEXT:    v_mov_b32_e32 v18, v7
-; GISEL-NEXT:    v_mov_b32_e32 v19, v8
-; GISEL-NEXT:    v_mov_b32_e32 v20, v9
-; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[22:23]
-; GISEL-NEXT:    v_mov_b32_e32 v21, v10
-; GISEL-NEXT:    v_mov_b32_e32 v0, s24
-; GISEL-NEXT:    v_mov_b32_e32 v1, s25
-; GISEL-NEXT:    v_mov_b32_e32 v2, s26
-; GISEL-NEXT:    v_mov_b32_e32 v3, s27
-; GISEL-NEXT:    v_mov_b32_e32 v4, s28
-; GISEL-NEXT:    v_mov_b32_e32 v5, s29
-; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[16:17]
-; GISEL-NEXT:    v_mov_b32_e32 v6, v11
-; GISEL-NEXT:    v_mov_b32_e32 v7, v12
-; GISEL-NEXT:    v_mov_b32_e32 v8, v13
-; GISEL-NEXT:    v_mov_b32_e32 v9, v14
-; GISEL-NEXT:    v_mov_b32_e32 v10, v15
-; GISEL-NEXT:    v_mov_b32_e32 v11, v16
-; GISEL-NEXT:    v_mov_b32_e32 v12, v17
-; GISEL-NEXT:    v_mov_b32_e32 v13, v18
-; GISEL-NEXT:    v_mov_b32_e32 v14, v19
-; GISEL-NEXT:    v_mov_b32_e32 v15, v20
+; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[2:3]
+; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v18, s24
+; GISEL-NEXT:    v_mov_b32_e32 v19, s25
+; GISEL-NEXT:    v_mov_b32_e32 v24, v0
+; GISEL-NEXT:    v_mov_b32_e32 v25, v1
+; GISEL-NEXT:    v_mov_b32_e32 v26, v2
+; GISEL-NEXT:    v_mov_b32_e32 v27, v3
+; GISEL-NEXT:    v_mov_b32_e32 v28, v4
+; GISEL-NEXT:    v_mov_b32_e32 v29, v5
+; GISEL-NEXT:    v_mov_b32_e32 v30, v6
+; GISEL-NEXT:    v_mov_b32_e32 v31, v7
+; GISEL-NEXT:    v_mov_b32_e32 v32, v8
+; GISEL-NEXT:    v_mov_b32_e32 v33, v9
+; GISEL-NEXT:    v_mov_b32_e32 v16, v10
+; GISEL-NEXT:    v_mov_b32_e32 v20, s26
+; GISEL-NEXT:    v_mov_b32_e32 v21, s27
+; GISEL-NEXT:    v_mov_b32_e32 v22, s28
+; GISEL-NEXT:    v_mov_b32_e32 v23, s29
+; GISEL-NEXT:    v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[26:27]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[28:29]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[30:31]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[32:33]
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_i32_32x32x64_i8 v[0:15], v[30:33], v[22:29], v21
+; GISEL-NEXT:    v_smfmac_i32_32x32x64_i8 v[0:15], v[34:37], v[48:55], v16
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x i32> %result
@@ -1458,46 +1718,94 @@ bb:
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <4 x float> %result
 }
@@ -1627,46 +1935,94 @@ bb:
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <4 x float> %result
 }
@@ -1796,46 +2152,94 @@ bb:
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <4 x float> %result
 }
@@ -1965,46 +2369,94 @@ bb:
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_mov_b32_e32 v0, v12
-; GCN-NEXT:    v_mov_b32_e32 v1, v13
-; GCN-NEXT:    v_mov_b32_e32 v2, v14
-; GCN-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <4 x float> %result
 }
@@ -2151,25 +2603,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32>
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
@@ -2186,23 +2655,15 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32>
 ; GISEL-NEXT:    v_mov_b32_e32 v34, v8
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
-; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    v_mov_b32_e32 v4, v16
-; GISEL-NEXT:    v_mov_b32_e32 v5, v17
-; GISEL-NEXT:    v_mov_b32_e32 v6, v18
-; GISEL-NEXT:    v_mov_b32_e32 v7, v19
-; GISEL-NEXT:    v_mov_b32_e32 v8, v20
-; GISEL-NEXT:    v_mov_b32_e32 v9, v21
-; GISEL-NEXT:    v_mov_b32_e32 v10, v22
-; GISEL-NEXT:    v_mov_b32_e32 v11, v23
-; GISEL-NEXT:    v_mov_b32_e32 v12, v24
-; GISEL-NEXT:    v_mov_b32_e32 v13, v25
-; GISEL-NEXT:    v_mov_b32_e32 v14, v26
-; GISEL-NEXT:    v_mov_b32_e32 v15, v27
+; GISEL-NEXT:    v_mov_b32_e32 v37, v11
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2214,25 +2675,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
@@ -2250,22 +2728,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, <
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    v_mov_b32_e32 v4, v16
-; GISEL-NEXT:    v_mov_b32_e32 v5, v17
-; GISEL-NEXT:    v_mov_b32_e32 v6, v18
-; GISEL-NEXT:    v_mov_b32_e32 v7, v19
-; GISEL-NEXT:    v_mov_b32_e32 v8, v20
-; GISEL-NEXT:    v_mov_b32_e32 v9, v21
-; GISEL-NEXT:    v_mov_b32_e32 v10, v22
-; GISEL-NEXT:    v_mov_b32_e32 v11, v23
-; GISEL-NEXT:    v_mov_b32_e32 v12, v24
-; GISEL-NEXT:    v_mov_b32_e32 v13, v25
-; GISEL-NEXT:    v_mov_b32_e32 v14, v26
-; GISEL-NEXT:    v_mov_b32_e32 v15, v27
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2277,25 +2747,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
@@ -2313,22 +2800,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, <
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    v_mov_b32_e32 v4, v16
-; GISEL-NEXT:    v_mov_b32_e32 v5, v17
-; GISEL-NEXT:    v_mov_b32_e32 v6, v18
-; GISEL-NEXT:    v_mov_b32_e32 v7, v19
-; GISEL-NEXT:    v_mov_b32_e32 v8, v20
-; GISEL-NEXT:    v_mov_b32_e32 v9, v21
-; GISEL-NEXT:    v_mov_b32_e32 v10, v22
-; GISEL-NEXT:    v_mov_b32_e32 v11, v23
-; GISEL-NEXT:    v_mov_b32_e32 v12, v24
-; GISEL-NEXT:    v_mov_b32_e32 v13, v25
-; GISEL-NEXT:    v_mov_b32_e32 v14, v26
-; GISEL-NEXT:    v_mov_b32_e32 v15, v27
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2340,94 +2819,108 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v36, s0
-; SDAG-NEXT:    v_mov_b32_e32 v37, s1
-; SDAG-NEXT:    v_mov_b32_e32 v38, s2
-; SDAG-NEXT:    v_mov_b32_e32 v39, s3
+; SDAG-NEXT:    v_mov_b32_e32 v28, s0
+; SDAG-NEXT:    v_mov_b32_e32 v29, s1
+; SDAG-NEXT:    v_mov_b32_e32 v30, s2
+; SDAG-NEXT:    v_mov_b32_e32 v31, s3
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
+; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v26, v8
+; SDAG-NEXT:    v_mov_b32_e32 v25, v7
+; SDAG-NEXT:    v_mov_b32_e32 v24, v6
+; SDAG-NEXT:    v_mov_b32_e32 v23, v5
+; SDAG-NEXT:    v_mov_b32_e32 v22, v4
+; SDAG-NEXT:    v_mov_b32_e32 v21, v3
+; SDAG-NEXT:    v_mov_b32_e32 v20, v2
+; SDAG-NEXT:    v_mov_b32_e32 v19, v1
+; SDAG-NEXT:    v_mov_b32_e32 v18, v0
 ; SDAG-NEXT:    v_mov_b32_e32 v13, s25
 ; SDAG-NEXT:    v_mov_b32_e32 v14, s26
 ; SDAG-NEXT:    v_mov_b32_e32 v15, s27
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s28
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_mov_b32_e32 v28, s16
-; SDAG-NEXT:    v_mov_b32_e32 v29, s17
-; SDAG-NEXT:    v_mov_b32_e32 v30, s18
-; SDAG-NEXT:    v_mov_b32_e32 v31, s19
-; SDAG-NEXT:    v_mov_b32_e32 v32, s20
-; SDAG-NEXT:    v_mov_b32_e32 v33, s21
-; SDAG-NEXT:    v_mov_b32_e32 v34, s22
-; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v0, s16
+; SDAG-NEXT:    v_mov_b32_e32 v1, s17
+; SDAG-NEXT:    v_mov_b32_e32 v2, s18
+; SDAG-NEXT:    v_mov_b32_e32 v3, s19
+; SDAG-NEXT:    v_mov_b32_e32 v4, s20
+; SDAG-NEXT:    v_mov_b32_e32 v5, s21
+; SDAG-NEXT:    v_mov_b32_e32 v6, s22
+; SDAG-NEXT:    v_mov_b32_e32 v7, s23
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[28:31], v[0:7], v10
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[2:3]
-; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v11, v0
-; GISEL-NEXT:    v_mov_b32_e32 v12, v1
-; GISEL-NEXT:    v_mov_b32_e32 v13, v2
-; GISEL-NEXT:    v_mov_b32_e32 v14, v3
-; GISEL-NEXT:    v_mov_b32_e32 v15, v4
-; GISEL-NEXT:    v_mov_b32_e32 v16, v5
-; GISEL-NEXT:    v_mov_b32_e32 v17, v6
-; GISEL-NEXT:    v_mov_b32_e32 v18, v7
-; GISEL-NEXT:    v_mov_b32_e32 v19, v8
-; GISEL-NEXT:    v_mov_b32_e32 v20, v9
-; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[22:23]
-; GISEL-NEXT:    v_mov_b32_e32 v21, v10
-; GISEL-NEXT:    v_mov_b32_e32 v0, s24
-; GISEL-NEXT:    v_mov_b32_e32 v1, s25
-; GISEL-NEXT:    v_mov_b32_e32 v2, s26
-; GISEL-NEXT:    v_mov_b32_e32 v3, s27
-; GISEL-NEXT:    v_mov_b32_e32 v4, s28
-; GISEL-NEXT:    v_mov_b32_e32 v5, s29
-; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[16:17]
-; GISEL-NEXT:    v_mov_b32_e32 v6, v11
-; GISEL-NEXT:    v_mov_b32_e32 v7, v12
-; GISEL-NEXT:    v_mov_b32_e32 v8, v13
-; GISEL-NEXT:    v_mov_b32_e32 v9, v14
-; GISEL-NEXT:    v_mov_b32_e32 v10, v15
-; GISEL-NEXT:    v_mov_b32_e32 v11, v16
-; GISEL-NEXT:    v_mov_b32_e32 v12, v17
-; GISEL-NEXT:    v_mov_b32_e32 v13, v18
-; GISEL-NEXT:    v_mov_b32_e32 v14, v19
-; GISEL-NEXT:    v_mov_b32_e32 v15, v20
+; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[2:3]
+; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v18, s24
+; GISEL-NEXT:    v_mov_b32_e32 v19, s25
+; GISEL-NEXT:    v_mov_b32_e32 v24, v0
+; GISEL-NEXT:    v_mov_b32_e32 v25, v1
+; GISEL-NEXT:    v_mov_b32_e32 v26, v2
+; GISEL-NEXT:    v_mov_b32_e32 v27, v3
+; GISEL-NEXT:    v_mov_b32_e32 v28, v4
+; GISEL-NEXT:    v_mov_b32_e32 v29, v5
+; GISEL-NEXT:    v_mov_b32_e32 v30, v6
+; GISEL-NEXT:    v_mov_b32_e32 v31, v7
+; GISEL-NEXT:    v_mov_b32_e32 v32, v8
+; GISEL-NEXT:    v_mov_b32_e32 v33, v9
+; GISEL-NEXT:    v_mov_b32_e32 v16, v10
+; GISEL-NEXT:    v_mov_b32_e32 v20, s26
+; GISEL-NEXT:    v_mov_b32_e32 v21, s27
+; GISEL-NEXT:    v_mov_b32_e32 v22, s28
+; GISEL-NEXT:    v_mov_b32_e32 v23, s29
+; GISEL-NEXT:    v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[26:27]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[28:29]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[30:31]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[32:33]
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[30:33], v[22:29], v21
+; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[34:37], v[48:55], v16
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
@@ -2524,25 +3017,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32>
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
@@ -2560,22 +3070,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32>
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    v_mov_b32_e32 v4, v16
-; GISEL-NEXT:    v_mov_b32_e32 v5, v17
-; GISEL-NEXT:    v_mov_b32_e32 v6, v18
-; GISEL-NEXT:    v_mov_b32_e32 v7, v19
-; GISEL-NEXT:    v_mov_b32_e32 v8, v20
-; GISEL-NEXT:    v_mov_b32_e32 v9, v21
-; GISEL-NEXT:    v_mov_b32_e32 v10, v22
-; GISEL-NEXT:    v_mov_b32_e32 v11, v23
-; GISEL-NEXT:    v_mov_b32_e32 v12, v24
-; GISEL-NEXT:    v_mov_b32_e32 v13, v25
-; GISEL-NEXT:    v_mov_b32_e32 v14, v26
-; GISEL-NEXT:    v_mov_b32_e32 v15, v27
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2587,25 +3089,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
@@ -2623,22 +3142,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, <
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    v_mov_b32_e32 v4, v16
-; GISEL-NEXT:    v_mov_b32_e32 v5, v17
-; GISEL-NEXT:    v_mov_b32_e32 v6, v18
-; GISEL-NEXT:    v_mov_b32_e32 v7, v19
-; GISEL-NEXT:    v_mov_b32_e32 v8, v20
-; GISEL-NEXT:    v_mov_b32_e32 v9, v21
-; GISEL-NEXT:    v_mov_b32_e32 v10, v22
-; GISEL-NEXT:    v_mov_b32_e32 v11, v23
-; GISEL-NEXT:    v_mov_b32_e32 v12, v24
-; GISEL-NEXT:    v_mov_b32_e32 v13, v25
-; GISEL-NEXT:    v_mov_b32_e32 v14, v26
-; GISEL-NEXT:    v_mov_b32_e32 v15, v27
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2650,25 +3161,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
@@ -2686,22 +3214,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, <
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    v_mov_b32_e32 v4, v16
-; GISEL-NEXT:    v_mov_b32_e32 v5, v17
-; GISEL-NEXT:    v_mov_b32_e32 v6, v18
-; GISEL-NEXT:    v_mov_b32_e32 v7, v19
-; GISEL-NEXT:    v_mov_b32_e32 v8, v20
-; GISEL-NEXT:    v_mov_b32_e32 v9, v21
-; GISEL-NEXT:    v_mov_b32_e32 v10, v22
-; GISEL-NEXT:    v_mov_b32_e32 v11, v23
-; GISEL-NEXT:    v_mov_b32_e32 v12, v24
-; GISEL-NEXT:    v_mov_b32_e32 v13, v25
-; GISEL-NEXT:    v_mov_b32_e32 v14, v26
-; GISEL-NEXT:    v_mov_b32_e32 v15, v27
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2713,94 +3233,108 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v36, s0
-; SDAG-NEXT:    v_mov_b32_e32 v37, s1
-; SDAG-NEXT:    v_mov_b32_e32 v38, s2
-; SDAG-NEXT:    v_mov_b32_e32 v39, s3
+; SDAG-NEXT:    v_mov_b32_e32 v28, s0
+; SDAG-NEXT:    v_mov_b32_e32 v29, s1
+; SDAG-NEXT:    v_mov_b32_e32 v30, s2
+; SDAG-NEXT:    v_mov_b32_e32 v31, s3
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
+; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v26, v8
+; SDAG-NEXT:    v_mov_b32_e32 v25, v7
+; SDAG-NEXT:    v_mov_b32_e32 v24, v6
+; SDAG-NEXT:    v_mov_b32_e32 v23, v5
+; SDAG-NEXT:    v_mov_b32_e32 v22, v4
+; SDAG-NEXT:    v_mov_b32_e32 v21, v3
+; SDAG-NEXT:    v_mov_b32_e32 v20, v2
+; SDAG-NEXT:    v_mov_b32_e32 v19, v1
+; SDAG-NEXT:    v_mov_b32_e32 v18, v0
 ; SDAG-NEXT:    v_mov_b32_e32 v13, s25
 ; SDAG-NEXT:    v_mov_b32_e32 v14, s26
 ; SDAG-NEXT:    v_mov_b32_e32 v15, s27
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s28
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_mov_b32_e32 v28, s16
-; SDAG-NEXT:    v_mov_b32_e32 v29, s17
-; SDAG-NEXT:    v_mov_b32_e32 v30, s18
-; SDAG-NEXT:    v_mov_b32_e32 v31, s19
-; SDAG-NEXT:    v_mov_b32_e32 v32, s20
-; SDAG-NEXT:    v_mov_b32_e32 v33, s21
-; SDAG-NEXT:    v_mov_b32_e32 v34, s22
-; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v0, s16
+; SDAG-NEXT:    v_mov_b32_e32 v1, s17
+; SDAG-NEXT:    v_mov_b32_e32 v2, s18
+; SDAG-NEXT:    v_mov_b32_e32 v3, s19
+; SDAG-NEXT:    v_mov_b32_e32 v4, s20
+; SDAG-NEXT:    v_mov_b32_e32 v5, s21
+; SDAG-NEXT:    v_mov_b32_e32 v6, s22
+; SDAG-NEXT:    v_mov_b32_e32 v7, s23
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[28:31], v[0:7], v10
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[2:3]
-; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v11, v0
-; GISEL-NEXT:    v_mov_b32_e32 v12, v1
-; GISEL-NEXT:    v_mov_b32_e32 v13, v2
-; GISEL-NEXT:    v_mov_b32_e32 v14, v3
-; GISEL-NEXT:    v_mov_b32_e32 v15, v4
-; GISEL-NEXT:    v_mov_b32_e32 v16, v5
-; GISEL-NEXT:    v_mov_b32_e32 v17, v6
-; GISEL-NEXT:    v_mov_b32_e32 v18, v7
-; GISEL-NEXT:    v_mov_b32_e32 v19, v8
-; GISEL-NEXT:    v_mov_b32_e32 v20, v9
-; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[22:23]
-; GISEL-NEXT:    v_mov_b32_e32 v21, v10
-; GISEL-NEXT:    v_mov_b32_e32 v0, s24
-; GISEL-NEXT:    v_mov_b32_e32 v1, s25
-; GISEL-NEXT:    v_mov_b32_e32 v2, s26
-; GISEL-NEXT:    v_mov_b32_e32 v3, s27
-; GISEL-NEXT:    v_mov_b32_e32 v4, s28
-; GISEL-NEXT:    v_mov_b32_e32 v5, s29
-; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[16:17]
-; GISEL-NEXT:    v_mov_b32_e32 v6, v11
-; GISEL-NEXT:    v_mov_b32_e32 v7, v12
-; GISEL-NEXT:    v_mov_b32_e32 v8, v13
-; GISEL-NEXT:    v_mov_b32_e32 v9, v14
-; GISEL-NEXT:    v_mov_b32_e32 v10, v15
-; GISEL-NEXT:    v_mov_b32_e32 v11, v16
-; GISEL-NEXT:    v_mov_b32_e32 v12, v17
-; GISEL-NEXT:    v_mov_b32_e32 v13, v18
-; GISEL-NEXT:    v_mov_b32_e32 v14, v19
-; GISEL-NEXT:    v_mov_b32_e32 v15, v20
+; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[2:3]
+; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v18, s24
+; GISEL-NEXT:    v_mov_b32_e32 v19, s25
+; GISEL-NEXT:    v_mov_b32_e32 v24, v0
+; GISEL-NEXT:    v_mov_b32_e32 v25, v1
+; GISEL-NEXT:    v_mov_b32_e32 v26, v2
+; GISEL-NEXT:    v_mov_b32_e32 v27, v3
+; GISEL-NEXT:    v_mov_b32_e32 v28, v4
+; GISEL-NEXT:    v_mov_b32_e32 v29, v5
+; GISEL-NEXT:    v_mov_b32_e32 v30, v6
+; GISEL-NEXT:    v_mov_b32_e32 v31, v7
+; GISEL-NEXT:    v_mov_b32_e32 v32, v8
+; GISEL-NEXT:    v_mov_b32_e32 v33, v9
+; GISEL-NEXT:    v_mov_b32_e32 v16, v10
+; GISEL-NEXT:    v_mov_b32_e32 v20, s26
+; GISEL-NEXT:    v_mov_b32_e32 v21, s27
+; GISEL-NEXT:    v_mov_b32_e32 v22, s28
+; GISEL-NEXT:    v_mov_b32_e32 v23, s29
+; GISEL-NEXT:    v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[26:27]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[28:29]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[30:31]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[32:33]
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[30:33], v[22:29], v21
+; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[34:37], v[48:55], v16
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
@@ -2897,25 +3431,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32>
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
@@ -2933,22 +3484,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32>
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    v_mov_b32_e32 v4, v16
-; GISEL-NEXT:    v_mov_b32_e32 v5, v17
-; GISEL-NEXT:    v_mov_b32_e32 v6, v18
-; GISEL-NEXT:    v_mov_b32_e32 v7, v19
-; GISEL-NEXT:    v_mov_b32_e32 v8, v20
-; GISEL-NEXT:    v_mov_b32_e32 v9, v21
-; GISEL-NEXT:    v_mov_b32_e32 v10, v22
-; GISEL-NEXT:    v_mov_b32_e32 v11, v23
-; GISEL-NEXT:    v_mov_b32_e32 v12, v24
-; GISEL-NEXT:    v_mov_b32_e32 v13, v25
-; GISEL-NEXT:    v_mov_b32_e32 v14, v26
-; GISEL-NEXT:    v_mov_b32_e32 v15, v27
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2960,25 +3503,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
@@ -2996,22 +3556,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, <
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    v_mov_b32_e32 v4, v16
-; GISEL-NEXT:    v_mov_b32_e32 v5, v17
-; GISEL-NEXT:    v_mov_b32_e32 v6, v18
-; GISEL-NEXT:    v_mov_b32_e32 v7, v19
-; GISEL-NEXT:    v_mov_b32_e32 v8, v20
-; GISEL-NEXT:    v_mov_b32_e32 v9, v21
-; GISEL-NEXT:    v_mov_b32_e32 v10, v22
-; GISEL-NEXT:    v_mov_b32_e32 v11, v23
-; GISEL-NEXT:    v_mov_b32_e32 v12, v24
-; GISEL-NEXT:    v_mov_b32_e32 v13, v25
-; GISEL-NEXT:    v_mov_b32_e32 v14, v26
-; GISEL-NEXT:    v_mov_b32_e32 v15, v27
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -3023,25 +3575,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
@@ -3059,22 +3628,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, <
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    v_mov_b32_e32 v4, v16
-; GISEL-NEXT:    v_mov_b32_e32 v5, v17
-; GISEL-NEXT:    v_mov_b32_e32 v6, v18
-; GISEL-NEXT:    v_mov_b32_e32 v7, v19
-; GISEL-NEXT:    v_mov_b32_e32 v8, v20
-; GISEL-NEXT:    v_mov_b32_e32 v9, v21
-; GISEL-NEXT:    v_mov_b32_e32 v10, v22
-; GISEL-NEXT:    v_mov_b32_e32 v11, v23
-; GISEL-NEXT:    v_mov_b32_e32 v12, v24
-; GISEL-NEXT:    v_mov_b32_e32 v13, v25
-; GISEL-NEXT:    v_mov_b32_e32 v14, v26
-; GISEL-NEXT:    v_mov_b32_e32 v15, v27
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -3086,94 +3647,108 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v36, s0
-; SDAG-NEXT:    v_mov_b32_e32 v37, s1
-; SDAG-NEXT:    v_mov_b32_e32 v38, s2
-; SDAG-NEXT:    v_mov_b32_e32 v39, s3
+; SDAG-NEXT:    v_mov_b32_e32 v28, s0
+; SDAG-NEXT:    v_mov_b32_e32 v29, s1
+; SDAG-NEXT:    v_mov_b32_e32 v30, s2
+; SDAG-NEXT:    v_mov_b32_e32 v31, s3
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
+; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v26, v8
+; SDAG-NEXT:    v_mov_b32_e32 v25, v7
+; SDAG-NEXT:    v_mov_b32_e32 v24, v6
+; SDAG-NEXT:    v_mov_b32_e32 v23, v5
+; SDAG-NEXT:    v_mov_b32_e32 v22, v4
+; SDAG-NEXT:    v_mov_b32_e32 v21, v3
+; SDAG-NEXT:    v_mov_b32_e32 v20, v2
+; SDAG-NEXT:    v_mov_b32_e32 v19, v1
+; SDAG-NEXT:    v_mov_b32_e32 v18, v0
 ; SDAG-NEXT:    v_mov_b32_e32 v13, s25
 ; SDAG-NEXT:    v_mov_b32_e32 v14, s26
 ; SDAG-NEXT:    v_mov_b32_e32 v15, s27
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s28
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_mov_b32_e32 v28, s16
-; SDAG-NEXT:    v_mov_b32_e32 v29, s17
-; SDAG-NEXT:    v_mov_b32_e32 v30, s18
-; SDAG-NEXT:    v_mov_b32_e32 v31, s19
-; SDAG-NEXT:    v_mov_b32_e32 v32, s20
-; SDAG-NEXT:    v_mov_b32_e32 v33, s21
-; SDAG-NEXT:    v_mov_b32_e32 v34, s22
-; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v0, s16
+; SDAG-NEXT:    v_mov_b32_e32 v1, s17
+; SDAG-NEXT:    v_mov_b32_e32 v2, s18
+; SDAG-NEXT:    v_mov_b32_e32 v3, s19
+; SDAG-NEXT:    v_mov_b32_e32 v4, s20
+; SDAG-NEXT:    v_mov_b32_e32 v5, s21
+; SDAG-NEXT:    v_mov_b32_e32 v6, s22
+; SDAG-NEXT:    v_mov_b32_e32 v7, s23
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[28:31], v[0:7], v10
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[2:3]
-; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v11, v0
-; GISEL-NEXT:    v_mov_b32_e32 v12, v1
-; GISEL-NEXT:    v_mov_b32_e32 v13, v2
-; GISEL-NEXT:    v_mov_b32_e32 v14, v3
-; GISEL-NEXT:    v_mov_b32_e32 v15, v4
-; GISEL-NEXT:    v_mov_b32_e32 v16, v5
-; GISEL-NEXT:    v_mov_b32_e32 v17, v6
-; GISEL-NEXT:    v_mov_b32_e32 v18, v7
-; GISEL-NEXT:    v_mov_b32_e32 v19, v8
-; GISEL-NEXT:    v_mov_b32_e32 v20, v9
-; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[22:23]
-; GISEL-NEXT:    v_mov_b32_e32 v21, v10
-; GISEL-NEXT:    v_mov_b32_e32 v0, s24
-; GISEL-NEXT:    v_mov_b32_e32 v1, s25
-; GISEL-NEXT:    v_mov_b32_e32 v2, s26
-; GISEL-NEXT:    v_mov_b32_e32 v3, s27
-; GISEL-NEXT:    v_mov_b32_e32 v4, s28
-; GISEL-NEXT:    v_mov_b32_e32 v5, s29
-; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[16:17]
-; GISEL-NEXT:    v_mov_b32_e32 v6, v11
-; GISEL-NEXT:    v_mov_b32_e32 v7, v12
-; GISEL-NEXT:    v_mov_b32_e32 v8, v13
-; GISEL-NEXT:    v_mov_b32_e32 v9, v14
-; GISEL-NEXT:    v_mov_b32_e32 v10, v15
-; GISEL-NEXT:    v_mov_b32_e32 v11, v16
-; GISEL-NEXT:    v_mov_b32_e32 v12, v17
-; GISEL-NEXT:    v_mov_b32_e32 v13, v18
-; GISEL-NEXT:    v_mov_b32_e32 v14, v19
-; GISEL-NEXT:    v_mov_b32_e32 v15, v20
+; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[2:3]
+; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v18, s24
+; GISEL-NEXT:    v_mov_b32_e32 v19, s25
+; GISEL-NEXT:    v_mov_b32_e32 v24, v0
+; GISEL-NEXT:    v_mov_b32_e32 v25, v1
+; GISEL-NEXT:    v_mov_b32_e32 v26, v2
+; GISEL-NEXT:    v_mov_b32_e32 v27, v3
+; GISEL-NEXT:    v_mov_b32_e32 v28, v4
+; GISEL-NEXT:    v_mov_b32_e32 v29, v5
+; GISEL-NEXT:    v_mov_b32_e32 v30, v6
+; GISEL-NEXT:    v_mov_b32_e32 v31, v7
+; GISEL-NEXT:    v_mov_b32_e32 v32, v8
+; GISEL-NEXT:    v_mov_b32_e32 v33, v9
+; GISEL-NEXT:    v_mov_b32_e32 v16, v10
+; GISEL-NEXT:    v_mov_b32_e32 v20, s26
+; GISEL-NEXT:    v_mov_b32_e32 v21, s27
+; GISEL-NEXT:    v_mov_b32_e32 v22, s28
+; GISEL-NEXT:    v_mov_b32_e32 v23, s29
+; GISEL-NEXT:    v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[26:27]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[28:29]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[30:31]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[32:33]
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[30:33], v[22:29], v21
+; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[34:37], v[48:55], v16
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
@@ -3270,25 +3845,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32>
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
@@ -3306,22 +3898,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32>
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    v_mov_b32_e32 v4, v16
-; GISEL-NEXT:    v_mov_b32_e32 v5, v17
-; GISEL-NEXT:    v_mov_b32_e32 v6, v18
-; GISEL-NEXT:    v_mov_b32_e32 v7, v19
-; GISEL-NEXT:    v_mov_b32_e32 v8, v20
-; GISEL-NEXT:    v_mov_b32_e32 v9, v21
-; GISEL-NEXT:    v_mov_b32_e32 v10, v22
-; GISEL-NEXT:    v_mov_b32_e32 v11, v23
-; GISEL-NEXT:    v_mov_b32_e32 v12, v24
-; GISEL-NEXT:    v_mov_b32_e32 v13, v25
-; GISEL-NEXT:    v_mov_b32_e32 v14, v26
-; GISEL-NEXT:    v_mov_b32_e32 v15, v27
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -3333,25 +3917,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
@@ -3369,22 +3970,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, <
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    v_mov_b32_e32 v4, v16
-; GISEL-NEXT:    v_mov_b32_e32 v5, v17
-; GISEL-NEXT:    v_mov_b32_e32 v6, v18
-; GISEL-NEXT:    v_mov_b32_e32 v7, v19
-; GISEL-NEXT:    v_mov_b32_e32 v8, v20
-; GISEL-NEXT:    v_mov_b32_e32 v9, v21
-; GISEL-NEXT:    v_mov_b32_e32 v10, v22
-; GISEL-NEXT:    v_mov_b32_e32 v11, v23
-; GISEL-NEXT:    v_mov_b32_e32 v12, v24
-; GISEL-NEXT:    v_mov_b32_e32 v13, v25
-; GISEL-NEXT:    v_mov_b32_e32 v14, v26
-; GISEL-NEXT:    v_mov_b32_e32 v15, v27
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -3396,25 +3989,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, <
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
@@ -3432,22 +4042,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, <
 ; GISEL-NEXT:    v_mov_b32_e32 v35, v9
 ; GISEL-NEXT:    v_mov_b32_e32 v36, v10
 ; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    v_mov_b32_e32 v4, v16
-; GISEL-NEXT:    v_mov_b32_e32 v5, v17
-; GISEL-NEXT:    v_mov_b32_e32 v6, v18
-; GISEL-NEXT:    v_mov_b32_e32 v7, v19
-; GISEL-NEXT:    v_mov_b32_e32 v8, v20
-; GISEL-NEXT:    v_mov_b32_e32 v9, v21
-; GISEL-NEXT:    v_mov_b32_e32 v10, v22
-; GISEL-NEXT:    v_mov_b32_e32 v11, v23
-; GISEL-NEXT:    v_mov_b32_e32 v12, v24
-; GISEL-NEXT:    v_mov_b32_e32 v13, v25
-; GISEL-NEXT:    v_mov_b32_e32 v14, v26
-; GISEL-NEXT:    v_mov_b32_e32 v15, v27
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -3459,94 +4061,108 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v36, s0
-; SDAG-NEXT:    v_mov_b32_e32 v37, s1
-; SDAG-NEXT:    v_mov_b32_e32 v38, s2
-; SDAG-NEXT:    v_mov_b32_e32 v39, s3
+; SDAG-NEXT:    v_mov_b32_e32 v28, s0
+; SDAG-NEXT:    v_mov_b32_e32 v29, s1
+; SDAG-NEXT:    v_mov_b32_e32 v30, s2
+; SDAG-NEXT:    v_mov_b32_e32 v31, s3
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
+; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_mov_b32_e32 v26, v8
+; SDAG-NEXT:    v_mov_b32_e32 v25, v7
+; SDAG-NEXT:    v_mov_b32_e32 v24, v6
+; SDAG-NEXT:    v_mov_b32_e32 v23, v5
+; SDAG-NEXT:    v_mov_b32_e32 v22, v4
+; SDAG-NEXT:    v_mov_b32_e32 v21, v3
+; SDAG-NEXT:    v_mov_b32_e32 v20, v2
+; SDAG-NEXT:    v_mov_b32_e32 v19, v1
+; SDAG-NEXT:    v_mov_b32_e32 v18, v0
 ; SDAG-NEXT:    v_mov_b32_e32 v13, s25
 ; SDAG-NEXT:    v_mov_b32_e32 v14, s26
 ; SDAG-NEXT:    v_mov_b32_e32 v15, s27
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s28
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_mov_b32_e32 v28, s16
-; SDAG-NEXT:    v_mov_b32_e32 v29, s17
-; SDAG-NEXT:    v_mov_b32_e32 v30, s18
-; SDAG-NEXT:    v_mov_b32_e32 v31, s19
-; SDAG-NEXT:    v_mov_b32_e32 v32, s20
-; SDAG-NEXT:    v_mov_b32_e32 v33, s21
-; SDAG-NEXT:    v_mov_b32_e32 v34, s22
-; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v0, s16
+; SDAG-NEXT:    v_mov_b32_e32 v1, s17
+; SDAG-NEXT:    v_mov_b32_e32 v2, s18
+; SDAG-NEXT:    v_mov_b32_e32 v3, s19
+; SDAG-NEXT:    v_mov_b32_e32 v4, s20
+; SDAG-NEXT:    v_mov_b32_e32 v5, s21
+; SDAG-NEXT:    v_mov_b32_e32 v6, s22
+; SDAG-NEXT:    v_mov_b32_e32 v7, s23
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[28:31], v[0:7], v10
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_mov_b32_e32 v0, v12
-; SDAG-NEXT:    v_mov_b32_e32 v1, v13
-; SDAG-NEXT:    v_mov_b32_e32 v2, v14
-; SDAG-NEXT:    v_mov_b32_e32 v3, v15
-; SDAG-NEXT:    v_mov_b32_e32 v4, v16
-; SDAG-NEXT:    v_mov_b32_e32 v5, v17
-; SDAG-NEXT:    v_mov_b32_e32 v6, v18
-; SDAG-NEXT:    v_mov_b32_e32 v7, v19
-; SDAG-NEXT:    v_mov_b32_e32 v8, v20
-; SDAG-NEXT:    v_mov_b32_e32 v9, v21
-; SDAG-NEXT:    v_mov_b32_e32 v10, v22
-; SDAG-NEXT:    v_mov_b32_e32 v11, v23
-; SDAG-NEXT:    v_mov_b32_e32 v12, v24
-; SDAG-NEXT:    v_mov_b32_e32 v13, v25
-; SDAG-NEXT:    v_mov_b32_e32 v14, v26
-; SDAG-NEXT:    v_mov_b32_e32 v15, v27
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[2:3]
-; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v11, v0
-; GISEL-NEXT:    v_mov_b32_e32 v12, v1
-; GISEL-NEXT:    v_mov_b32_e32 v13, v2
-; GISEL-NEXT:    v_mov_b32_e32 v14, v3
-; GISEL-NEXT:    v_mov_b32_e32 v15, v4
-; GISEL-NEXT:    v_mov_b32_e32 v16, v5
-; GISEL-NEXT:    v_mov_b32_e32 v17, v6
-; GISEL-NEXT:    v_mov_b32_e32 v18, v7
-; GISEL-NEXT:    v_mov_b32_e32 v19, v8
-; GISEL-NEXT:    v_mov_b32_e32 v20, v9
-; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[22:23]
-; GISEL-NEXT:    v_mov_b32_e32 v21, v10
-; GISEL-NEXT:    v_mov_b32_e32 v0, s24
-; GISEL-NEXT:    v_mov_b32_e32 v1, s25
-; GISEL-NEXT:    v_mov_b32_e32 v2, s26
-; GISEL-NEXT:    v_mov_b32_e32 v3, s27
-; GISEL-NEXT:    v_mov_b32_e32 v4, s28
-; GISEL-NEXT:    v_mov_b32_e32 v5, s29
-; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[16:17]
-; GISEL-NEXT:    v_mov_b32_e32 v6, v11
-; GISEL-NEXT:    v_mov_b32_e32 v7, v12
-; GISEL-NEXT:    v_mov_b32_e32 v8, v13
-; GISEL-NEXT:    v_mov_b32_e32 v9, v14
-; GISEL-NEXT:    v_mov_b32_e32 v10, v15
-; GISEL-NEXT:    v_mov_b32_e32 v11, v16
-; GISEL-NEXT:    v_mov_b32_e32 v12, v17
-; GISEL-NEXT:    v_mov_b32_e32 v13, v18
-; GISEL-NEXT:    v_mov_b32_e32 v14, v19
-; GISEL-NEXT:    v_mov_b32_e32 v15, v20
+; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[2:3]
+; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v18, s24
+; GISEL-NEXT:    v_mov_b32_e32 v19, s25
+; GISEL-NEXT:    v_mov_b32_e32 v24, v0
+; GISEL-NEXT:    v_mov_b32_e32 v25, v1
+; GISEL-NEXT:    v_mov_b32_e32 v26, v2
+; GISEL-NEXT:    v_mov_b32_e32 v27, v3
+; GISEL-NEXT:    v_mov_b32_e32 v28, v4
+; GISEL-NEXT:    v_mov_b32_e32 v29, v5
+; GISEL-NEXT:    v_mov_b32_e32 v30, v6
+; GISEL-NEXT:    v_mov_b32_e32 v31, v7
+; GISEL-NEXT:    v_mov_b32_e32 v32, v8
+; GISEL-NEXT:    v_mov_b32_e32 v33, v9
+; GISEL-NEXT:    v_mov_b32_e32 v16, v10
+; GISEL-NEXT:    v_mov_b32_e32 v20, s26
+; GISEL-NEXT:    v_mov_b32_e32 v21, s27
+; GISEL-NEXT:    v_mov_b32_e32 v22, s28
+; GISEL-NEXT:    v_mov_b32_e32 v23, s29
+; GISEL-NEXT:    v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[26:27]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[28:29]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[30:31]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[32:33]
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[30:33], v[22:29], v21
+; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[34:37], v[48:55], v16
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
index 21af2dde2c4bf..3fffb04de1a63 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
@@ -96,66 +96,66 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
 ; GREEDY908-NEXT:    s_nop 7
 ; GREEDY908-NEXT:    s_nop 7
 ; GREEDY908-NEXT:    s_nop 1
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a32
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v5, a61
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a61
 ; GREEDY908-NEXT:    v_accvgpr_read_b32 v6, a60
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a2, v1
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a33
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v7, a59
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v8, a58
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a3, v1
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a34
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v9, a57
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v10, a56
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a4, v1
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a35
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v11, a55
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v12, a54
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a5, v1
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a36
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v13, a53
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v14, a52
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a6, v1
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a37
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v15, a51
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v16, a50
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a7, v1
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a38
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v17, a49
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v18, a48
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a8, v1
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a39
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v19, a47
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v2, a46
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a9, v1
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a40
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a16, v2
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a17, v19
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a10, v1
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a41
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a18, v18
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a19, v17
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a11, v1
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a42
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a20, v16
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a21, v15
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a12, v1
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v2, a59
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a31, v1
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a30, v6
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a29, v2
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a58
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v6, a57
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v2, a56
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a28, v1
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a27, v6
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a26, v2
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a55
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v6, a54
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v2, a53
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a25, v1
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a24, v6
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a23, v2
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a52
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v6, a51
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v2, a50
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a22, v1
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a21, v6
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a20, v2
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a49
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v6, a48
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v2, a47
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a19, v1
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a18, v6
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a17, v2
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a46
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v6, a45
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v2, a44
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a16, v1
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a15, v6
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a14, v2
 ; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a43
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a22, v14
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a23, v13
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v6, a42
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v2, a41
 ; GREEDY908-NEXT:    v_accvgpr_write_b32 a13, v1
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a44
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a24, v12
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a25, v11
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a14, v1
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a45
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a26, v10
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a27, v9
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a15, v1
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a28, v8
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a29, v7
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a30, v6
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a31, v5
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a12, v6
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a11, v2
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a40
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v6, a39
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v2, a38
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a10, v1
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a9, v6
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a8, v2
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a37
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v6, a36
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v2, a35
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a7, v1
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a6, v6
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a5, v2
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a34
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v6, a33
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v2, a32
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a4, v1
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a3, v6
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a2, v2
 ; GREEDY908-NEXT:    s_nop 0
 ; GREEDY908-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
 ; GREEDY908-NEXT:    s_nop 7
@@ -266,36 +266,36 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
 ; GREEDY90A-NEXT:    s_nop 7
 ; GREEDY90A-NEXT:    s_nop 7
 ; GREEDY90A-NEXT:    s_nop 2
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a2, a32
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a3, a33
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a4, a34
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a5, a35
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a6, a36
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a7, a37
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a8, a38
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a9, a39
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a10, a40
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a11, a41
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a12, a42
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a13, a43
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a14, a44
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a15, a45
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a16, a46
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a17, a47
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a18, a48
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a19, a49
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a20, a50
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a21, a51
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a22, a52
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a23, a53
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a24, a54
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a25, a55
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a26, a56
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a27, a57
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a28, a58
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a29, a59
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a30, a60
 ; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a31, a61
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a30, a60
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a29, a59
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a28, a58
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a27, a57
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a26, a56
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a25, a55
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a24, a54
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a23, a53
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a22, a52
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a21, a51
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a20, a50
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a19, a49
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a18, a48
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a17, a47
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a16, a46
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a15, a45
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a14, a44
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a13, a43
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a12, a42
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a11, a41
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a10, a40
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a9, a39
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a8, a38
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a7, a37
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a6, a36
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a5, a35
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a4, a34
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a3, a33
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a2, a32
 ; GREEDY90A-NEXT:    s_nop 1
 ; GREEDY90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
 ; GREEDY90A-NEXT:    s_nop 7
@@ -359,36 +359,36 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
 ; GREEDY942-NEXT:    s_nop 7
 ; GREEDY942-NEXT:    s_nop 7
 ; GREEDY942-NEXT:    s_nop 1
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a2, a32
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a3, a33
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a4, a34
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a5, a35
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a6, a36
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a7, a37
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a8, a38
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a9, a39
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a10, a40
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a11, a41
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a12, a42
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a13, a43
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a14, a44
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a15, a45
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a16, a46
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a17, a47
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a18, a48
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a19, a49
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a20, a50
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a21, a51
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a22, a52
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a23, a53
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a24, a54
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a25, a55
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a26, a56
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a27, a57
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a28, a58
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a29, a59
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a30, a60
 ; GREEDY942-NEXT:    v_accvgpr_mov_b32 a31, a61
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a30, a60
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a29, a59
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a28, a58
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a27, a57
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a26, a56
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a25, a55
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a24, a54
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a23, a53
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a22, a52
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a21, a51
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a20, a50
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a19, a49
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a18, a48
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a17, a47
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a16, a46
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a15, a45
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a14, a44
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a13, a43
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a12, a42
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a11, a41
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a10, a40
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a9, a39
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a8, a38
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a7, a37
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a6, a36
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a5, a35
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a4, a34
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a3, a33
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a2, a32
 ; GREEDY942-NEXT:    s_nop 1
 ; GREEDY942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
 ; GREEDY942-NEXT:    s_nop 7
@@ -507,106 +507,76 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
 ; FAST90A-NEXT:    s_load_dwordx16 s[36:51], s[0:1], 0x0
 ; FAST90A-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x40
 ; FAST90A-NEXT:    s_waitcnt lgkmcnt(0)
-; FAST90A-NEXT:    v_accvgpr_write_b32 a32, s36
-; FAST90A-NEXT:    v_accvgpr_write_b32 a33, s37
-; FAST90A-NEXT:    v_accvgpr_write_b32 a34, s38
-; FAST90A-NEXT:    v_accvgpr_write_b32 a35, s39
-; FAST90A-NEXT:    v_accvgpr_write_b32 a36, s40
-; FAST90A-NEXT:    v_accvgpr_write_b32 a37, s41
-; FAST90A-NEXT:    v_accvgpr_write_b32 a38, s42
-; FAST90A-NEXT:    v_accvgpr_write_b32 a39, s43
-; FAST90A-NEXT:    v_accvgpr_write_b32 a40, s44
-; FAST90A-NEXT:    v_accvgpr_write_b32 a41, s45
-; FAST90A-NEXT:    v_accvgpr_write_b32 a42, s46
-; FAST90A-NEXT:    v_accvgpr_write_b32 a43, s47
-; FAST90A-NEXT:    v_accvgpr_write_b32 a44, s48
-; FAST90A-NEXT:    v_accvgpr_write_b32 a45, s49
-; FAST90A-NEXT:    v_accvgpr_write_b32 a46, s50
-; FAST90A-NEXT:    v_accvgpr_write_b32 a47, s51
-; FAST90A-NEXT:    v_accvgpr_write_b32 a48, s4
-; FAST90A-NEXT:    v_accvgpr_write_b32 a49, s5
-; FAST90A-NEXT:    v_accvgpr_write_b32 a50, s6
-; FAST90A-NEXT:    v_accvgpr_write_b32 a51, s7
-; FAST90A-NEXT:    v_accvgpr_write_b32 a52, s8
-; FAST90A-NEXT:    v_accvgpr_write_b32 a53, s9
-; FAST90A-NEXT:    v_accvgpr_write_b32 a54, s10
-; FAST90A-NEXT:    v_accvgpr_write_b32 a55, s11
-; FAST90A-NEXT:    v_accvgpr_write_b32 a56, s12
-; FAST90A-NEXT:    v_accvgpr_write_b32 a57, s13
-; FAST90A-NEXT:    v_accvgpr_write_b32 a58, s14
-; FAST90A-NEXT:    v_accvgpr_write_b32 a59, s15
-; FAST90A-NEXT:    v_accvgpr_write_b32 a60, s16
-; FAST90A-NEXT:    v_accvgpr_write_b32 a61, s17
-; FAST90A-NEXT:    v_accvgpr_write_b32 a62, s18
-; FAST90A-NEXT:    v_accvgpr_write_b32 a63, s19
+; FAST90A-NEXT:    v_accvgpr_write_b32 a64, s36
+; FAST90A-NEXT:    v_accvgpr_write_b32 a65, s37
+; FAST90A-NEXT:    v_accvgpr_write_b32 a66, s38
+; FAST90A-NEXT:    v_accvgpr_write_b32 a67, s39
+; FAST90A-NEXT:    v_accvgpr_write_b32 a68, s40
+; FAST90A-NEXT:    v_accvgpr_write_b32 a69, s41
+; FAST90A-NEXT:    v_accvgpr_write_b32 a70, s42
+; FAST90A-NEXT:    v_accvgpr_write_b32 a71, s43
+; FAST90A-NEXT:    v_accvgpr_write_b32 a72, s44
+; FAST90A-NEXT:    v_accvgpr_write_b32 a73, s45
+; FAST90A-NEXT:    v_accvgpr_write_b32 a74, s46
+; FAST90A-NEXT:    v_accvgpr_write_b32 a75, s47
+; FAST90A-NEXT:    v_accvgpr_write_b32 a76, s48
+; FAST90A-NEXT:    v_accvgpr_write_b32 a77, s49
+; FAST90A-NEXT:    v_accvgpr_write_b32 a78, s50
+; FAST90A-NEXT:    v_accvgpr_write_b32 a79, s51
+; FAST90A-NEXT:    v_accvgpr_write_b32 a80, s4
+; FAST90A-NEXT:    v_accvgpr_write_b32 a81, s5
+; FAST90A-NEXT:    v_accvgpr_write_b32 a82, s6
+; FAST90A-NEXT:    v_accvgpr_write_b32 a83, s7
+; FAST90A-NEXT:    v_accvgpr_write_b32 a84, s8
+; FAST90A-NEXT:    v_accvgpr_write_b32 a85, s9
+; FAST90A-NEXT:    v_accvgpr_write_b32 a86, s10
+; FAST90A-NEXT:    v_accvgpr_write_b32 a87, s11
+; FAST90A-NEXT:    v_accvgpr_write_b32 a88, s12
+; FAST90A-NEXT:    v_accvgpr_write_b32 a89, s13
+; FAST90A-NEXT:    v_accvgpr_write_b32 a90, s14
+; FAST90A-NEXT:    v_accvgpr_write_b32 a91, s15
+; FAST90A-NEXT:    v_accvgpr_write_b32 a92, s16
+; FAST90A-NEXT:    v_accvgpr_write_b32 a93, s17
+; FAST90A-NEXT:    v_accvgpr_write_b32 a94, s18
+; FAST90A-NEXT:    v_accvgpr_write_b32 a95, s19
 ; FAST90A-NEXT:    s_nop 1
-; FAST90A-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
-; FAST90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[32:63]
+; FAST90A-NEXT:    v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95]
+; FAST90A-NEXT:    v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[64:95]
 ; FAST90A-NEXT:    s_nop 7
 ; FAST90A-NEXT:    s_nop 7
-; FAST90A-NEXT:    s_nop 2
-; FAST90A-NEXT:    v_accvgpr_read_b32 v3, a29
-; FAST90A-NEXT:    v_accvgpr_read_b32 v4, a28
-; FAST90A-NEXT:    v_accvgpr_read_b32 v5, a27
-; FAST90A-NEXT:    v_accvgpr_read_b32 v6, a26
-; FAST90A-NEXT:    v_accvgpr_read_b32 v7, a25
-; FAST90A-NEXT:    v_accvgpr_read_b32 v8, a24
-; FAST90A-NEXT:    v_accvgpr_read_b32 v9, a23
-; FAST90A-NEXT:    v_accvgpr_read_b32 v10, a22
-; FAST90A-NEXT:    v_accvgpr_read_b32 v11, a21
-; FAST90A-NEXT:    v_accvgpr_read_b32 v12, a20
-; FAST90A-NEXT:    v_accvgpr_read_b32 v13, a19
-; FAST90A-NEXT:    v_accvgpr_read_b32 v14, a18
-; FAST90A-NEXT:    v_accvgpr_read_b32 v15, a17
-; FAST90A-NEXT:    v_accvgpr_read_b32 v16, a16
-; FAST90A-NEXT:    v_accvgpr_read_b32 v17, a15
-; FAST90A-NEXT:    v_accvgpr_read_b32 v18, a14
-; FAST90A-NEXT:    v_accvgpr_read_b32 v19, a13
-; FAST90A-NEXT:    v_accvgpr_read_b32 v20, a12
-; FAST90A-NEXT:    v_accvgpr_read_b32 v21, a11
-; FAST90A-NEXT:    v_accvgpr_read_b32 v22, a10
-; FAST90A-NEXT:    v_accvgpr_read_b32 v23, a9
-; FAST90A-NEXT:    v_accvgpr_read_b32 v24, a8
-; FAST90A-NEXT:    v_accvgpr_read_b32 v25, a7
-; FAST90A-NEXT:    v_accvgpr_read_b32 v26, a6
-; FAST90A-NEXT:    v_accvgpr_read_b32 v27, a5
-; FAST90A-NEXT:    v_accvgpr_read_b32 v28, a4
-; FAST90A-NEXT:    v_accvgpr_read_b32 v29, a3
-; FAST90A-NEXT:    v_accvgpr_read_b32 v30, a2
-; FAST90A-NEXT:    v_accvgpr_read_b32 v31, a1
-; FAST90A-NEXT:    v_accvgpr_read_b32 v32, a0
-; FAST90A-NEXT:    v_accvgpr_mov_b32 a0, a32
-; FAST90A-NEXT:    v_accvgpr_mov_b32 a1, a33
-; FAST90A-NEXT:    v_accvgpr_write_b32 a2, v32
-; FAST90A-NEXT:    v_accvgpr_write_b32 a3, v31
-; FAST90A-NEXT:    v_accvgpr_write_b32 a4, v30
-; FAST90A-NEXT:    v_accvgpr_write_b32 a5, v29
-; FAST90A-NEXT:    v_accvgpr_write_b32 a6, v28
-; FAST90A-NEXT:    v_accvgpr_write_b32 a7, v27
-; FAST90A-NEXT:    v_accvgpr_write_b32 a8, v26
-; FAST90A-NEXT:    v_accvgpr_write_b32 a9, v25
-; FAST90A-NEXT:    v_accvgpr_write_b32 a10, v24
-; FAST90A-NEXT:    v_accvgpr_write_b32 a11, v23
-; FAST90A-NEXT:    v_accvgpr_write_b32 a12, v22
-; FAST90A-NEXT:    v_accvgpr_write_b32 a13, v21
-; FAST90A-NEXT:    v_accvgpr_write_b32 a14, v20
-; FAST90A-NEXT:    v_accvgpr_write_b32 a15, v19
-; FAST90A-NEXT:    v_accvgpr_write_b32 a16, v18
-; FAST90A-NEXT:    v_accvgpr_write_b32 a17, v17
-; FAST90A-NEXT:    v_accvgpr_write_b32 a18, v16
-; FAST90A-NEXT:    v_accvgpr_write_b32 a19, v15
-; FAST90A-NEXT:    v_accvgpr_write_b32 a20, v14
-; FAST90A-NEXT:    v_accvgpr_write_b32 a21, v13
-; FAST90A-NEXT:    v_accvgpr_write_b32 a22, v12
-; FAST90A-NEXT:    v_accvgpr_write_b32 a23, v11
-; FAST90A-NEXT:    v_accvgpr_write_b32 a24, v10
-; FAST90A-NEXT:    v_accvgpr_write_b32 a25, v9
-; FAST90A-NEXT:    v_accvgpr_write_b32 a26, v8
-; FAST90A-NEXT:    v_accvgpr_write_b32 a27, v7
-; FAST90A-NEXT:    v_accvgpr_write_b32 a28, v6
-; FAST90A-NEXT:    v_accvgpr_write_b32 a29, v5
-; FAST90A-NEXT:    v_accvgpr_write_b32 a30, v4
-; FAST90A-NEXT:    v_accvgpr_write_b32 a31, v3
+; FAST90A-NEXT:    s_nop 1
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a1, a65
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a0, a64
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a31, a61
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a30, a60
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a29, a59
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a28, a58
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a27, a57
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a26, a56
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a25, a55
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a24, a54
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a23, a53
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a22, a52
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a21, a51
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a20, a50
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a19, a49
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a18, a48
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a17, a47
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a16, a46
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a15, a45
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a14, a44
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a13, a43
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a12, a42
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a11, a41
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a10, a40
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a9, a39
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a8, a38
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a7, a37
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a6, a36
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a5, a35
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a4, a34
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a3, a33
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a2, a32
 ; FAST90A-NEXT:    s_nop 1
 ; FAST90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
 ; FAST90A-NEXT:    s_nop 7
@@ -636,42 +606,42 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
 ; GREEDY908:       ; %bb.0: ; %bb
 ; GREEDY908-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x24
 ; GREEDY908-NEXT:    v_mov_b32_e32 v0, 1.0
-; GREEDY908-NEXT:    v_mov_b32_e32 v4, 0
+; GREEDY908-NEXT:    v_mov_b32_e32 v12, 0
 ; GREEDY908-NEXT:    s_waitcnt lgkmcnt(0)
 ; GREEDY908-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
 ; GREEDY908-NEXT:    s_waitcnt lgkmcnt(0)
-; GREEDY908-NEXT:    v_mov_b32_e32 v5, s15
+; GREEDY908-NEXT:    v_mov_b32_e32 v13, s15
 ; GREEDY908-NEXT:    v_mov_b32_e32 v2, s14
 ; GREEDY908-NEXT:    v_mov_b32_e32 v1, s13
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a33, v5
-; GREEDY908-NEXT:    v_mov_b32_e32 v5, s12
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a33, v13
+; GREEDY908-NEXT:    v_mov_b32_e32 v13, s12
 ; GREEDY908-NEXT:    v_accvgpr_write_b32 a32, v2
 ; GREEDY908-NEXT:    v_accvgpr_write_b32 a31, v1
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a30, v5
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a30, v13
 ; GREEDY908-NEXT:    v_mov_b32_e32 v2, s11
 ; GREEDY908-NEXT:    v_mov_b32_e32 v1, s10
-; GREEDY908-NEXT:    v_mov_b32_e32 v5, s9
+; GREEDY908-NEXT:    v_mov_b32_e32 v13, s9
 ; GREEDY908-NEXT:    v_accvgpr_write_b32 a29, v2
 ; GREEDY908-NEXT:    v_accvgpr_write_b32 a28, v1
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a27, v5
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a27, v13
 ; GREEDY908-NEXT:    v_mov_b32_e32 v2, s8
 ; GREEDY908-NEXT:    v_mov_b32_e32 v1, s7
-; GREEDY908-NEXT:    v_mov_b32_e32 v5, s6
+; GREEDY908-NEXT:    v_mov_b32_e32 v13, s6
 ; GREEDY908-NEXT:    v_accvgpr_write_b32 a26, v2
 ; GREEDY908-NEXT:    v_accvgpr_write_b32 a25, v1
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a24, v5
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a24, v13
 ; GREEDY908-NEXT:    v_mov_b32_e32 v2, s5
 ; GREEDY908-NEXT:    v_mov_b32_e32 v1, s4
-; GREEDY908-NEXT:    v_mov_b32_e32 v5, s3
+; GREEDY908-NEXT:    v_mov_b32_e32 v13, s3
 ; GREEDY908-NEXT:    v_accvgpr_write_b32 a23, v2
 ; GREEDY908-NEXT:    v_accvgpr_write_b32 a22, v1
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a21, v5
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a21, v13
 ; GREEDY908-NEXT:    v_mov_b32_e32 v2, s2
 ; GREEDY908-NEXT:    v_mov_b32_e32 v1, s1
-; GREEDY908-NEXT:    v_mov_b32_e32 v5, s0
+; GREEDY908-NEXT:    v_mov_b32_e32 v13, s0
 ; GREEDY908-NEXT:    v_accvgpr_write_b32 a20, v2
 ; GREEDY908-NEXT:    v_accvgpr_write_b32 a19, v1
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a18, v5
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a18, v13
 ; GREEDY908-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GREEDY908-NEXT:    s_nop 1
 ; GREEDY908-NEXT:    v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33]
@@ -679,10 +649,10 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
 ; GREEDY908-NEXT:    s_nop 7
 ; GREEDY908-NEXT:    s_nop 0
 ; GREEDY908-NEXT:    v_accvgpr_read_b32 v2, a19
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v3, a18
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v13, a18
 ; GREEDY908-NEXT:    s_nop 0
 ; GREEDY908-NEXT:    v_accvgpr_write_b32 a1, v2
-; GREEDY908-NEXT:    v_accvgpr_write_b32 a0, v3
+; GREEDY908-NEXT:    v_accvgpr_write_b32 a0, v13
 ; GREEDY908-NEXT:    s_nop 0
 ; GREEDY908-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
 ; GREEDY908-NEXT:    s_nop 7
@@ -691,113 +661,167 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
 ; GREEDY908-NEXT:    v_accvgpr_read_b32 v2, a14
 ; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a13
 ; GREEDY908-NEXT:    v_accvgpr_read_b32 v0, a12
-; GREEDY908-NEXT:    s_nop 1
-; GREEDY908-NEXT:    global_store_dwordx4 v4, v[0:3], s[16:17] offset:48
-; GREEDY908-NEXT:    s_nop 0
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v3, a11
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v2, a10
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a9
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v0, a8
-; GREEDY908-NEXT:    s_nop 1
-; GREEDY908-NEXT:    global_store_dwordx4 v4, v[0:3], s[16:17] offset:32
-; GREEDY908-NEXT:    s_nop 0
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v3, a7
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v2, a6
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a5
-; GREEDY908-NEXT:    v_accvgpr_read_b32 v0, a4
-; GREEDY908-NEXT:    s_nop 1
-; GREEDY908-NEXT:    global_store_dwordx4 v4, v[0:3], s[16:17] offset:16
-; GREEDY908-NEXT:    s_nop 0
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v7, a11
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v6, a10
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v5, a9
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v4, a8
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v11, a7
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v10, a6
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v9, a5
+; GREEDY908-NEXT:    v_accvgpr_read_b32 v8, a4
+; GREEDY908-NEXT:    global_store_dwordx4 v12, v[0:3], s[16:17] offset:48
+; GREEDY908-NEXT:    global_store_dwordx4 v12, v[4:7], s[16:17] offset:32
+; GREEDY908-NEXT:    global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
 ; GREEDY908-NEXT:    v_accvgpr_read_b32 v3, a3
 ; GREEDY908-NEXT:    v_accvgpr_read_b32 v2, a2
 ; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a1
 ; GREEDY908-NEXT:    v_accvgpr_read_b32 v0, a0
 ; GREEDY908-NEXT:    s_nop 1
-; GREEDY908-NEXT:    global_store_dwordx4 v4, v[0:3], s[16:17]
+; GREEDY908-NEXT:    global_store_dwordx4 v12, v[0:3], s[16:17]
 ; GREEDY908-NEXT:    s_endpgm
 ;
 ; GREEDY90A-LABEL: test_mfma_f32_16x16x1f32:
 ; GREEDY90A:       ; %bb.0: ; %bb
 ; GREEDY90A-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GREEDY90A-NEXT:    v_mov_b32_e32 v0, 1.0
-; GREEDY90A-NEXT:    v_mov_b32_e32 v1, 2.0
-; GREEDY90A-NEXT:    v_mov_b32_e32 v2, 0
+; GREEDY90A-NEXT:    v_mov_b32_e32 v16, 1.0
+; GREEDY90A-NEXT:    v_mov_b32_e32 v17, 2.0
 ; GREEDY90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GREEDY90A-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
 ; GREEDY90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a33, s15
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a32, s14
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a31, s13
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a30, s12
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a29, s11
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a28, s10
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a27, s9
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a26, s8
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a25, s7
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a24, s6
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a23, s5
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a22, s4
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a21, s3
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a20, s2
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a19, s1
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a18, s0
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a31, s15
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a30, s14
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a29, s13
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a28, s12
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a27, s11
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a26, s10
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a25, s9
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a24, s8
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a23, s7
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a22, s6
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a21, s5
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a20, s4
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a19, s3
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a18, s2
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a17, s1
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a16, s0
 ; GREEDY90A-NEXT:    s_nop 1
-; GREEDY90A-NEXT:    v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33]
-; GREEDY90A-NEXT:    v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33]
+; GREEDY90A-NEXT:    v_mfma_f32_16x16x1f32 a[16:31], v16, v17, a[16:31]
+; GREEDY90A-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v16, v17, a[16:31]
 ; GREEDY90A-NEXT:    s_nop 7
 ; GREEDY90A-NEXT:    s_nop 1
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a0, a18
-; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a1, a19
-; GREEDY90A-NEXT:    s_nop 1
-; GREEDY90A-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
+; GREEDY90A-NEXT:    v_accvgpr_read_b32 v0, a16
+; GREEDY90A-NEXT:    v_accvgpr_read_b32 v1, a17
+; GREEDY90A-NEXT:    v_accvgpr_read_b32 v15, a13
+; GREEDY90A-NEXT:    v_accvgpr_read_b32 v14, a12
+; GREEDY90A-NEXT:    v_accvgpr_read_b32 v13, a11
+; GREEDY90A-NEXT:    v_accvgpr_read_b32 v12, a10
+; GREEDY90A-NEXT:    v_accvgpr_read_b32 v11, a9
+; GREEDY90A-NEXT:    v_accvgpr_read_b32 v10, a8
+; GREEDY90A-NEXT:    v_accvgpr_read_b32 v9, a7
+; GREEDY90A-NEXT:    v_accvgpr_read_b32 v8, a6
+; GREEDY90A-NEXT:    v_accvgpr_read_b32 v7, a5
+; GREEDY90A-NEXT:    v_accvgpr_read_b32 v6, a4
+; GREEDY90A-NEXT:    v_accvgpr_read_b32 v5, a3
+; GREEDY90A-NEXT:    v_accvgpr_read_b32 v4, a2
+; GREEDY90A-NEXT:    v_accvgpr_read_b32 v3, a1
+; GREEDY90A-NEXT:    v_accvgpr_read_b32 v2, a0
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a0, v0
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a1, v1
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a2, v2
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a3, v3
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a4, v4
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a5, v5
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a6, v6
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a7, v7
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a8, v8
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a9, v9
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a10, v10
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a11, v11
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a12, v12
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a13, v13
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a14, v14
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a15, v15
+; GREEDY90A-NEXT:    v_mov_b32_e32 v0, 0
+; GREEDY90A-NEXT:    s_nop 0
+; GREEDY90A-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v16, v17, a[0:15]
 ; GREEDY90A-NEXT:    s_nop 7
 ; GREEDY90A-NEXT:    s_nop 2
-; GREEDY90A-NEXT:    global_store_dwordx4 v2, a[12:15], s[16:17] offset:48
-; GREEDY90A-NEXT:    global_store_dwordx4 v2, a[8:11], s[16:17] offset:32
-; GREEDY90A-NEXT:    global_store_dwordx4 v2, a[4:7], s[16:17] offset:16
-; GREEDY90A-NEXT:    global_store_dwordx4 v2, a[0:3], s[16:17]
+; GREEDY90A-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GREEDY90A-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GREEDY90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GREEDY90A-NEXT:    global_store_dwordx4 v0, a[0:3], s[16:17]
 ; GREEDY90A-NEXT:    s_endpgm
 ;
 ; GREEDY942-LABEL: test_mfma_f32_16x16x1f32:
 ; GREEDY942:       ; %bb.0: ; %bb
 ; GREEDY942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GREEDY942-NEXT:    v_mov_b32_e32 v0, 1.0
-; GREEDY942-NEXT:    v_mov_b32_e32 v1, 2.0
-; GREEDY942-NEXT:    v_mov_b32_e32 v2, 0
+; GREEDY942-NEXT:    v_mov_b32_e32 v16, 1.0
+; GREEDY942-NEXT:    v_mov_b32_e32 v17, 2.0
 ; GREEDY942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GREEDY942-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
 ; GREEDY942-NEXT:    s_waitcnt lgkmcnt(0)
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a33, s15
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a32, s14
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a31, s13
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a30, s12
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a29, s11
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a28, s10
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a27, s9
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a26, s8
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a25, s7
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a24, s6
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a23, s5
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a22, s4
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a21, s3
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a20, s2
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a19, s1
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a18, s0
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a31, s15
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a30, s14
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a29, s13
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a28, s12
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a27, s11
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a26, s10
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a25, s9
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a24, s8
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a23, s7
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a22, s6
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a21, s5
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a20, s4
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a19, s3
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a18, s2
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a17, s1
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a16, s0
 ; GREEDY942-NEXT:    s_nop 1
-; GREEDY942-NEXT:    v_mfma_f32_16x16x1_4b_f32 a[18:33], v0, v1, a[18:33]
-; GREEDY942-NEXT:    v_mfma_f32_16x16x1_4b_f32 a[2:17], v0, v1, a[18:33]
+; GREEDY942-NEXT:    v_mfma_f32_16x16x1_4b_f32 a[16:31], v16, v17, a[16:31]
+; GREEDY942-NEXT:    v_mfma_f32_16x16x1_4b_f32 a[0:15], v16, v17, a[16:31]
 ; GREEDY942-NEXT:    s_nop 7
 ; GREEDY942-NEXT:    s_nop 0
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a0, a18
-; GREEDY942-NEXT:    v_accvgpr_mov_b32 a1, a19
-; GREEDY942-NEXT:    s_nop 1
-; GREEDY942-NEXT:    v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15]
+; GREEDY942-NEXT:    v_accvgpr_read_b32 v0, a16
+; GREEDY942-NEXT:    v_accvgpr_read_b32 v1, a17
+; GREEDY942-NEXT:    v_accvgpr_read_b32 v15, a13
+; GREEDY942-NEXT:    v_accvgpr_read_b32 v14, a12
+; GREEDY942-NEXT:    v_accvgpr_read_b32 v13, a11
+; GREEDY942-NEXT:    v_accvgpr_read_b32 v12, a10
+; GREEDY942-NEXT:    v_accvgpr_read_b32 v11, a9
+; GREEDY942-NEXT:    v_accvgpr_read_b32 v10, a8
+; GREEDY942-NEXT:    v_accvgpr_read_b32 v9, a7
+; GREEDY942-NEXT:    v_accvgpr_read_b32 v8, a6
+; GREEDY942-NEXT:    v_accvgpr_read_b32 v7, a5
+; GREEDY942-NEXT:    v_accvgpr_read_b32 v6, a4
+; GREEDY942-NEXT:    v_accvgpr_read_b32 v5, a3
+; GREEDY942-NEXT:    v_accvgpr_read_b32 v4, a2
+; GREEDY942-NEXT:    v_accvgpr_read_b32 v3, a1
+; GREEDY942-NEXT:    v_accvgpr_read_b32 v2, a0
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a0, v0
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a1, v1
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a2, v2
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a3, v3
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a4, v4
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a5, v5
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a6, v6
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a7, v7
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a8, v8
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a9, v9
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a10, v10
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a11, v11
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a12, v12
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a13, v13
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a14, v14
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a15, v15
+; GREEDY942-NEXT:    v_mov_b32_e32 v0, 0
+; GREEDY942-NEXT:    s_nop 0
+; GREEDY942-NEXT:    v_mfma_f32_16x16x1_4b_f32 a[0:15], v16, v17, a[0:15]
 ; GREEDY942-NEXT:    s_nop 7
 ; GREEDY942-NEXT:    s_nop 1
-; GREEDY942-NEXT:    global_store_dwordx4 v2, a[12:15], s[16:17] offset:48
-; GREEDY942-NEXT:    global_store_dwordx4 v2, a[8:11], s[16:17] offset:32
-; GREEDY942-NEXT:    global_store_dwordx4 v2, a[4:7], s[16:17] offset:16
-; GREEDY942-NEXT:    global_store_dwordx4 v2, a[0:3], s[16:17]
+; GREEDY942-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GREEDY942-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GREEDY942-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GREEDY942-NEXT:    global_store_dwordx4 v0, a[0:3], s[16:17]
 ; GREEDY942-NEXT:    s_endpgm
 ;
 ; GREEDY90A-GISEL-LABEL: test_mfma_f32_16x16x1f32:
@@ -857,51 +881,53 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
 ; FAST90A-LABEL: test_mfma_f32_16x16x1f32:
 ; FAST90A:       ; %bb.0: ; %bb
 ; FAST90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; FAST90A-NEXT:    v_mov_b32_e32 v1, 1.0
-; FAST90A-NEXT:    v_mov_b32_e32 v2, 2.0
-; FAST90A-NEXT:    v_mov_b32_e32 v0, 0
+; FAST90A-NEXT:    v_mov_b32_e32 v0, 1.0
+; FAST90A-NEXT:    v_mov_b32_e32 v1, 2.0
 ; FAST90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; FAST90A-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x0
 ; FAST90A-NEXT:    s_waitcnt lgkmcnt(0)
-; FAST90A-NEXT:    v_accvgpr_write_b32 a0, s4
-; FAST90A-NEXT:    v_accvgpr_write_b32 a1, s5
-; FAST90A-NEXT:    v_accvgpr_write_b32 a2, s6
-; FAST90A-NEXT:    v_accvgpr_write_b32 a3, s7
-; FAST90A-NEXT:    v_accvgpr_write_b32 a4, s8
-; FAST90A-NEXT:    v_accvgpr_write_b32 a5, s9
-; FAST90A-NEXT:    v_accvgpr_write_b32 a6, s10
-; FAST90A-NEXT:    v_accvgpr_write_b32 a7, s11
-; FAST90A-NEXT:    v_accvgpr_write_b32 a8, s12
-; FAST90A-NEXT:    v_accvgpr_write_b32 a9, s13
-; FAST90A-NEXT:    v_accvgpr_write_b32 a10, s14
-; FAST90A-NEXT:    v_accvgpr_write_b32 a11, s15
-; FAST90A-NEXT:    v_accvgpr_write_b32 a12, s16
-; FAST90A-NEXT:    v_accvgpr_write_b32 a13, s17
-; FAST90A-NEXT:    v_accvgpr_write_b32 a14, s18
-; FAST90A-NEXT:    v_accvgpr_write_b32 a15, s19
+; FAST90A-NEXT:    v_accvgpr_write_b32 a47, s19
+; FAST90A-NEXT:    v_accvgpr_write_b32 a46, s18
+; FAST90A-NEXT:    v_accvgpr_write_b32 a45, s17
+; FAST90A-NEXT:    v_accvgpr_write_b32 a44, s16
+; FAST90A-NEXT:    v_accvgpr_write_b32 a43, s15
+; FAST90A-NEXT:    v_accvgpr_write_b32 a42, s14
+; FAST90A-NEXT:    v_accvgpr_write_b32 a41, s13
+; FAST90A-NEXT:    v_accvgpr_write_b32 a40, s12
+; FAST90A-NEXT:    v_accvgpr_write_b32 a39, s11
+; FAST90A-NEXT:    v_accvgpr_write_b32 a38, s10
+; FAST90A-NEXT:    v_accvgpr_write_b32 a37, s9
+; FAST90A-NEXT:    v_accvgpr_write_b32 a36, s8
+; FAST90A-NEXT:    v_accvgpr_write_b32 a35, s7
+; FAST90A-NEXT:    v_accvgpr_write_b32 a34, s6
+; FAST90A-NEXT:    v_accvgpr_write_b32 a33, s5
+; FAST90A-NEXT:    v_accvgpr_write_b32 a32, s4
 ; FAST90A-NEXT:    s_nop 1
-; FAST90A-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15]
-; FAST90A-NEXT:    v_mfma_f32_16x16x1f32 a[16:31], v1, v2, a[0:15]
+; FAST90A-NEXT:    v_mfma_f32_16x16x1f32 a[32:47], v0, v1, a[32:47]
+; FAST90A-NEXT:    v_mfma_f32_16x16x1f32 a[16:31], v0, v1, a[32:47]
 ; FAST90A-NEXT:    s_nop 7
-; FAST90A-NEXT:    s_nop 2
-; FAST90A-NEXT:    v_accvgpr_mov_b32 a2, a16
-; FAST90A-NEXT:    v_accvgpr_mov_b32 a3, a17
-; FAST90A-NEXT:    v_accvgpr_mov_b32 a4, a18
-; FAST90A-NEXT:    v_accvgpr_mov_b32 a5, a19
-; FAST90A-NEXT:    v_accvgpr_mov_b32 a6, a20
-; FAST90A-NEXT:    v_accvgpr_mov_b32 a7, a21
-; FAST90A-NEXT:    v_accvgpr_mov_b32 a8, a22
-; FAST90A-NEXT:    v_accvgpr_mov_b32 a9, a23
-; FAST90A-NEXT:    v_accvgpr_mov_b32 a10, a24
-; FAST90A-NEXT:    v_accvgpr_mov_b32 a11, a25
-; FAST90A-NEXT:    v_accvgpr_mov_b32 a12, a26
-; FAST90A-NEXT:    v_accvgpr_mov_b32 a13, a27
-; FAST90A-NEXT:    v_accvgpr_mov_b32 a14, a28
+; FAST90A-NEXT:    s_nop 1
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a1, a33
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a0, a32
 ; FAST90A-NEXT:    v_accvgpr_mov_b32 a15, a29
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a14, a28
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a13, a27
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a12, a26
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a11, a25
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a10, a24
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a9, a23
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a8, a22
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a7, a21
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a6, a20
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a5, a19
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a4, a18
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a3, a17
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a2, a16
 ; FAST90A-NEXT:    s_nop 1
-; FAST90A-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15]
+; FAST90A-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
+; FAST90A-NEXT:    v_mov_b32_e32 v0, 0
 ; FAST90A-NEXT:    s_nop 7
-; FAST90A-NEXT:    s_nop 2
+; FAST90A-NEXT:    s_nop 1
 ; FAST90A-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
 ; FAST90A-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
 ; FAST90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16

>From 83abf51aed3ad0aab7ca1fd97a51880b61f9de90 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 7 Apr 2025 17:23:19 -0700
Subject: [PATCH 2/3] Inflate src reg for subreg inserts

Change-Id: I8562c6fae3b4aefd6ddbf3f6dbad18ffa0cf6331
---
 llvm/lib/CodeGen/RegisterCoalescer.cpp        |    7 +-
 .../coalesce-copy-to-agpr-to-av-registers.mir |  156 +-
 .../AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll    |   60 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll |  680 +-
 ....amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll | 3746 ++++++----
 ...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll | 6378 +++++++++++------
 .../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll       | 3270 +++------
 .../AMDGPU/mfma-no-register-aliasing.ll       |  170 +-
 .../CodeGen/AMDGPU/no-fold-accvgpr-mov.ll     |   42 +-
 .../CodeGen/AMDGPU/no-fold-accvgpr-mov.mir    |   14 +-
 10 files changed, 8247 insertions(+), 6276 deletions(-)

diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 15cb68012d0c9..74416cae378ab 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -518,9 +518,10 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) {
       SrcIdx = DstSub;
       NewRC = TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub);
       if (!NewRC) {
-        auto SuperDstRC = MRI.getLargestConstrainedSuperClass(Dst);
-        if (SuperDstRC != DstRC)
-          NewRC = TRI.getMatchingSuperRegClass(SuperDstRC, SrcRC, DstSub);
+        auto SuperSrcRC = MRI.getLargestConstrainedSuperClass(Src);
+        if (SuperSrcRC != SrcRC) {
+          NewRC = TRI.getMatchingSuperRegClass(DstRC, SuperSrcRC, DstSub);
+        }
       }
     } else if (SrcSub) {
       // DstReg will be merged with a sub-register of SrcReg.
diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
index 3f1965107b361..5043c7b8093b1 100644
--- a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
@@ -16,11 +16,9 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr32_to_areg64_coalesce_with_av64
     ; CHECK: liveins: $vgpr0, $vgpr1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64 = COPY [[COPY1]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, [[COPY2]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:areg_64 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:areg_64 = COPY $vgpr1
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     %1:vgpr_32 = COPY $vgpr1
@@ -41,11 +39,9 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2
     ; CHECK: liveins: $vgpr0, $vgpr1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY2]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:areg_64_align2 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:areg_64_align2 = COPY $vgpr1
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     %1:vgpr_32 = COPY $vgpr1
@@ -66,13 +62,10 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96
     ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96 = COPY [[COPY1]]
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96 = COPY [[COPY2]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY3]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:areg_96 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:areg_96 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:areg_96 = COPY $vgpr2
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     %1:vgpr_32 = COPY $vgpr1
@@ -95,13 +88,10 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_align2
     ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY1]]
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY2]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, [[COPY3]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:areg_96_align2 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:areg_96_align2 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:areg_96_align2 = COPY $vgpr2
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     %1:vgpr_32 = COPY $vgpr1
@@ -124,11 +114,9 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr64_to_areg64_coalesce_with_av128
     ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY1]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_128 */, [[COPY2]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:areg_128 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:areg_128 = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_128 */, [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vreg_64 = COPY $vgpr0_vgpr1
     %1:vreg_64 = COPY $vgpr2_vgpr3
@@ -149,11 +137,9 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2
     ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
-    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY1]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6291465 /* reguse:AReg_128_Align2 */, [[COPY2]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY $vgpr2_vgpr3
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6291465 /* reguse:AReg_128_Align2 */, [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vreg_64 = COPY $vgpr0_vgpr1
     %1:vreg_64 = COPY $vgpr2_vgpr3
@@ -199,11 +185,9 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96
     ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
-    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY1]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY2]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:areg_96 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:areg_96 = COPY $vgpr1_vgpr2
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     %1:vreg_64 = COPY $vgpr1_vgpr2
@@ -224,11 +208,9 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2
     ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
-    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY1]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY2]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:areg_96_align2 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY $vgpr1_vgpr2
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     %1:vreg_64 = COPY $vgpr1_vgpr2
@@ -249,11 +231,9 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96
     ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96 = COPY [[COPY1]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY2]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:areg_96 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:areg_96 = COPY $vgpr2
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vreg_64 = COPY $vgpr0_vgpr1
     %1:vgpr_32 = COPY $vgpr2
@@ -274,11 +254,9 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2
     ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY1]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY2]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:areg_96_align2 = COPY $vgpr2
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vreg_64 = COPY $vgpr0_vgpr1
     %1:vgpr_32 = COPY $vgpr2
@@ -299,10 +277,9 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr32_x2_to_areg64
     ; CHECK: liveins: $vgpr0
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, [[COPY1]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:areg_64 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     undef %2.sub0:areg_64 = COPY %0
@@ -322,11 +299,9 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr32_x2_to_areg64_coalesce_with_av64_align2
     ; CHECK: liveins: $vgpr0, $vgpr1
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY2]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:areg_64_align2 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:areg_64_align2 = COPY $vgpr1
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     %1:vgpr_32 = COPY $vgpr1
@@ -347,10 +322,9 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96
     ; CHECK: liveins: $vgpr0
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY1]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:areg_96 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     undef %1.sub0:areg_96 = COPY %0
@@ -370,10 +344,9 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96_align2
     ; CHECK: liveins: $vgpr0
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:areg_96_align2 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     undef %1.sub0:areg_96_align2 = COPY %0
@@ -393,12 +366,11 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128
     ; CHECK: liveins: $vgpr0
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_128 */, [[COPY1]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:areg_128 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:areg_128 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:areg_128 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub3:areg_128 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6029321 /* reguse:AReg_128 */, [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     undef %1.sub0:areg_128 = COPY %0
@@ -420,12 +392,11 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128_align2
     ; CHECK: liveins: $vgpr0
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6291465 /* reguse:AReg_128_Align2 */, [[COPY1]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:areg_128_align2 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6291465 /* reguse:AReg_128_Align2 */, [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     undef %1.sub0:areg_128_align2 = COPY %0
@@ -448,10 +419,9 @@ body:             |
     ; CHECK: liveins: $vgpr0, $vgpr1
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64 = COPY [[COPY]]
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64 = COPY [[COPY1]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, [[COPY2]]
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub1:areg_64 = COPY $vgpr1
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, [[COPY1]]
     ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
@@ -499,12 +469,12 @@ body:             |
     ; CHECK-LABEL: name: copy_vgpr32_x2_to_areg64_and_vreg64_coalesce_with_av64_other_v_use
     ; CHECK: liveins: $vgpr0
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
-    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub0
-    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY [[COPY]].sub0
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, [[COPY1]]
-    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3538953 /* reguse:VReg_64 */, [[COPY]]
+    ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:areg_64 = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:vreg_64 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:vreg_64 = COPY [[COPY]].sub0
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, [[COPY]]
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3538953 /* reguse:VReg_64 */, [[COPY1]]
     ; CHECK-NEXT: SI_RETURN
     %0:vgpr_32 = COPY $vgpr0
     undef %1.sub0:areg_64 = COPY %0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
index 452033f332659..ef27cb554d000 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
@@ -160,22 +160,22 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac(<8 x bfloat> %arg0, <8 x b
 ; GCN-LABEL: test_mfma_f32_32x32x16_bf16__mac:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v22
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v22
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v21
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v20
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v19
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v18
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v17
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v16
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v15
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
 ; GCN-NEXT:    s_nop 7
@@ -205,22 +205,22 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac__flags(<8 x bfloat> %arg0,
 ; GCN-LABEL: test_mfma_f32_32x32x16_bf16__mac__flags:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v22
 ; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v22
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v21
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v20
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v19
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v18
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v17
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v16
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v15
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v14
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v13
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v12
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
 ; GCN-NEXT:    s_nop 7
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 4628a9c15391b..cc738e74df750 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -10,41 +10,73 @@ declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>,
 ; --------------------------------------------------------------------
 
 define <4 x float> @test_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_f32_16x16x32_f16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_f32_16x16x32_f16:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_f16:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_mfma_f32_16x16x32_f16__flags(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_f32_16x16x32_f16__flags:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_f32_16x16x32_f16__flags:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_f32_16x16x32_f16__flags:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 1, i32 1, i32 1)
   ret <4 x float> %result
 }
@@ -408,91 +440,173 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
 }
 
 define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_f32_32x32x16_f16__mac:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_f32_32x32x16_f16__mac:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_f32_32x32x16_f16__mac:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v23
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
   ret <16 x float> %result
 }
 
 define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_f32_32x32x16_f16__mac__flags:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_f32_32x32x16_f16__mac__flags:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_f32_32x32x16_f16__mac__flags:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v23
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 1, i32 1, i32 1)
   ret <16 x float> %result
 }
@@ -910,41 +1024,73 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
 declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32>, <4 x i32>, <4 x i32>, i32 immarg, i32 immarg, i32 immarg)
 
 define <4 x i32> @test_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2) {
-; GCN-LABEL: test_mfma_i32_16x16x64_i8:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_i32_16x16x64_i8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_i32_16x16x64_i8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 0, i32 0, i32 0)
   ret <4 x i32> %result
 }
 
 define <4 x i32> @test_mfma_i32_16x16x64_i8__flags(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2) {
-; GCN-LABEL: test_mfma_i32_16x16x64_i8__flags:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_i32_16x16x64_i8__flags:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_i32_16x16x64_i8__flags:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 1, i32 1, i32 1)
   ret <4 x i32> %result
 }
@@ -1330,91 +1476,173 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
 }
 
 define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2) {
-; GCN-LABEL: test_mfma_i32_32x32x32_i8__mac:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_i32_32x32x32_i8__mac:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_i32_32x32x32_i8__mac:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v23
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 0, i32 0, i32 0)
   ret <16 x i32> %result
 }
 
 define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2) {
-; GCN-LABEL: test_mfma_i32_32x32x32_i8__mac__flags:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_i32_32x32x32_i8__mac__flags:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_i32_32x32x32_i8__mac__flags:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v23
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 1, i32 1, i32 1)
   ret <16 x i32> %result
 }
@@ -1859,10 +2087,10 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat>
 ; GCN-LABEL: test_mfma_f32_16x16x32_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
 ; GCN-NEXT:    s_nop 7
@@ -1879,10 +2107,10 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x
 ; GCN-LABEL: test_mfma_f32_16x16x32_bf16__flags:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
 ; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
 ; GCN-NEXT:    s_nop 1
 ; GCN-NEXT:    v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
 ; GCN-NEXT:    s_nop 7
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
index 7cc726a3bd79c..1a367fd0afb92 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
@@ -14,22 +14,39 @@
 
 ; fp8 x fp8
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
                                                                                       i32 0, ; blgp
@@ -38,22 +55,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x
 }
 
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
                                                                                       i32 0, ; blgp
@@ -62,22 +96,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x
 }
 
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
                                                                                       i32 0, ; blgp
@@ -86,22 +137,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x
 }
 
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
                                                                                       i32 0, ; blgp
@@ -110,22 +178,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x
 }
 
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
                                                                                       i32 0, ; blgp
@@ -134,22 +219,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x
 }
 
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
                                                                                       i32 0, ; blgp
@@ -158,46 +260,80 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x
 }
 
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
-                                                                                      i32 0, ; cbsz
-                                                                                      i32 0, ; blgp
-                                                                                      i32 2, i32 %scale0, i32 3, i32 %scale1)
-  ret <4 x float> %result
-}
-
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+                                                                                      i32 0, ; cbsz
+                                                                                      i32 0, ; blgp
+                                                                                      i32 2, i32 %scale0, i32 3, i32 %scale1)
+  ret <4 x float> %result
+}
+
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
                                                                                       i32 0, ; blgp
@@ -207,22 +343,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x
 
 ; This should be optimized to avoid the scale
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
                                                                                       i32 0, ; blgp
@@ -232,22 +385,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__cons
 
 ; fp8 x bf8
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
                                                                                       i32 1, ; blgp
@@ -257,22 +427,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x
 
 ; This should be optimized to avoid the scale
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
                                                                                       i32 1, ; blgp
@@ -282,22 +469,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__cons
 
 ; fp8 x fp6
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
                                                                                       i32 2, ; blgp
@@ -307,22 +511,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x
 
 ; This should be optimized to avoid the scale
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
                                                                                       i32 2, ; blgp
@@ -332,22 +553,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__cons
 
 ; fp8 x bf6
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
                                                                                       i32 3, ; blgp
@@ -357,22 +595,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x
 
 ; This should be optimized to avoid the scale
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
                                                                                       i32 3, ; blgp
@@ -382,22 +637,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__cons
 
 ; fp8 x fp4
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
                                                                                       i32 4, ; blgp
@@ -407,22 +679,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x
 
 ; This should be optimized to avoid the scale
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
                                                                                       i32 4, ; blgp
@@ -432,22 +721,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__cons
 
 ; bf8 x fp8
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 1, ; cbsz
                                                                                       i32 0, ; blgp
@@ -457,22 +763,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x
 
 ; This should be optimized to avoid the scale
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 1, ; cbsz
                                                                                       i32 0, ; blgp
@@ -482,22 +805,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__cons
 
 ; bf8 x bf8
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 1, ; cbsz
                                                                                       i32 1, ; blgp
@@ -508,47 +848,81 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x
 
 ; This should be optimized to avoid the scale
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
-                                                                                      i32 1, ; cbsz
-                                                                                      i32 1, ; blgp
-                                                                                      i32 0, i32 0, i32 0, i32 0)
-  ret <4 x float> %result
-}
-
-; bf8 x fp6
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+                                                                                      i32 1, ; cbsz
+                                                                                      i32 1, ; blgp
+                                                                                      i32 0, i32 0, i32 0, i32 0)
+  ret <4 x float> %result
+}
+
+; bf8 x fp6
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 1, ; cbsz
                                                                                       i32 2, ; blgp
@@ -557,22 +931,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x
 }
 
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 1, ; cbsz
                                                                                       i32 2, ; blgp
@@ -582,22 +973,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__cons
 
 ; bf8 x bf6
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 1, ; cbsz
                                                                                       i32 3, ; blgp
@@ -607,22 +1015,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x
 
 ; This should be optimized to avoid the scale
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 1, ; cbsz
                                                                                       i32 3, ; blgp
@@ -632,22 +1057,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__cons
 
 ; bf8 x fp4
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 1, ; cbsz
                                                                                       i32 4, ; blgp
@@ -657,22 +1099,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x
 
 ; This should be optimized to avoid the scale
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 1, ; cbsz
                                                                                       i32 4, ; blgp
@@ -682,22 +1141,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__cons
 
 ; fp6 x fp8
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 2, ; cbsz
                                                                                       i32 0, ; blgp
@@ -707,22 +1183,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x
 
 ; This should be optimized to avoid the scale
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 2, ; cbsz
                                                                                       i32 0, ; blgp
@@ -732,22 +1225,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__cons
 
 ; fp6 x bf8
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 2, ; cbsz
                                                                                       i32 1, ; blgp
@@ -757,22 +1267,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x
 
 ; This should be optimized to avoid the scale
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 2, ; cbsz
                                                                                       i32 1, ; blgp
@@ -782,21 +1309,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__cons
 
 ; fp6 x fp6
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 2, ; cbsz
                                                                                       i32 2, ; blgp
@@ -806,21 +1349,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x
 
 ; This should be optimized to avoid the scale
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:2
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:2
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 2, ; cbsz
                                                                                       i32 2, ; blgp
@@ -830,21 +1389,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__cons
 
 ; fp6 x bf6
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 2, ; cbsz
                                                                                       i32 3, ; blgp
@@ -854,21 +1429,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3(<6 x
 
 ; This should be optimized to avoid the scale
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 2, ; cbsz
                                                                                       i32 3, ; blgp
@@ -879,22 +1470,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__cons
 
 ; bf6 x fp8
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 3, ; cbsz
                                                                                       i32 0, ; blgp
@@ -904,47 +1512,81 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x
 
 ; This should be optimized to avoid the scale
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
-                                                                                      i32 3, ; cbsz
-                                                                                      i32 0, ; blgp
-                                                                                      i32 0, i32 0, i32 0, i32 0)
-  ret <4 x float> %result
-}
-
-; bf6 x bf8
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
+                                                                                      i32 3, ; cbsz
+                                                                                      i32 0, ; blgp
+                                                                                      i32 0, i32 0, i32 0, i32 0)
+  ret <4 x float> %result
+}
+
+; bf6 x bf8
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 3, ; cbsz
                                                                                       i32 1, ; blgp
@@ -954,22 +1596,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x
 
 ; This should be optimized to avoid the scale
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 3, ; cbsz
                                                                                       i32 1, ; blgp
@@ -979,21 +1638,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__cons
 
 ; bf6 x fp6
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 3, ; cbsz
                                                                                       i32 2, ; blgp
@@ -1003,21 +1678,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2(<6 x
 
 ; This should be optimized to avoid the scale
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:2
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:2
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 3, ; cbsz
                                                                                       i32 2, ; blgp
@@ -1027,21 +1718,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__cons
 
 ; bf6 x fp4
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v11
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v13
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 3, ; cbsz
                                                                                       i32 4, ; blgp
@@ -1051,21 +1758,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4(<6 x
 
 ; This should be optimized to avoid the scale
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v11
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v13
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:3 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:3 blgp:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:3 blgp:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 3, ; cbsz
                                                                                       i32 4, ; blgp
@@ -1075,21 +1798,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__cons
 
 ; bf6 x bf6
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 3, ; cbsz
                                                                                       i32 3, ; blgp
@@ -1099,45 +1838,77 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3(<6 x
 
 ; This should be optimized to avoid the scale
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
-                                                                                      i32 3, ; cbsz
-                                                                                      i32 3, ; blgp
-                                                                                      i32 0, i32 0, i32 0, i32 0)
-  ret <4 x float> %result
-}
-
-; fp6 x fp4
-define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v11
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v13
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
+                                                                                      i32 3, ; cbsz
+                                                                                      i32 3, ; blgp
+                                                                                      i32 0, i32 0, i32 0, i32 0)
+  ret <4 x float> %result
+}
+
+; fp6 x fp4
+define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 2, ; cbsz
                                                                                       i32 4, ; blgp
@@ -1147,21 +1918,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x
 
 ; This should be optimized to avoid the scale
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v11
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v13
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:2 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:2 blgp:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:2 blgp:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 2, ; cbsz
                                                                                       i32 4, ; blgp
@@ -1171,22 +1958,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__cons
 
 ; fp4 x fp8
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 4, ; cbsz
                                                                                       i32 0, ; blgp
@@ -1196,22 +2000,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x
 
 ; This should be optimized to avoid the scale
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 4, ; cbsz
                                                                                       i32 0, ; blgp
@@ -1221,22 +2042,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__cons
 
 ; fp4 x bf8
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 4, ; cbsz
                                                                                       i32 1, ; blgp
@@ -1246,22 +2084,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x
 
 ; This should be optimized to avoid the scale
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 4, ; cbsz
                                                                                       i32 1, ; blgp
@@ -1271,21 +2126,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__cons
 
 ; fp4 x fp6
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v11
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v13
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 4, ; cbsz
                                                                                       i32 2, ; blgp
@@ -1295,21 +2166,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2(<4 x
 
 ; This should be optimized to avoid the scale
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v11
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v13
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:2
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:2
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 4, ; cbsz
                                                                                       i32 2, ; blgp
@@ -1319,21 +2206,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__cons
 
 ; fp4 x bf6
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v11
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v13
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 4, ; cbsz
                                                                                       i32 3, ; blgp
@@ -1343,21 +2246,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3(<4 x
 
 ; This should be optimized to avoid the scale
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v11
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v13
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 4, ; cbsz
                                                                                       i32 3, ; blgp
@@ -1367,21 +2286,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__cons
 
 ; fp4 x fp4
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 4, ; cbsz
                                                                                       i32 4, ; blgp
@@ -1391,21 +2326,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x
 
 ; This should be optimized to avoid the scale
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:4 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:4 blgp:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:4 blgp:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 4, ; cbsz
                                                                                       i32 4, ; blgp
@@ -1418,65 +2369,117 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__cons
 ; --------------------------------------------------------------------
 
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 inreg %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    v_mov_b32_e32 v16, s1
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    v_mov_b32_e32 v16, s1
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    v_mov_b32_e32 v16, s1
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 inreg %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v20 op_sel_hi:[0,0,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v20 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v20 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 inreg %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, s0 op_sel_hi:[0,0,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, s0 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, s0 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
   ret <4 x float> %result
 }
@@ -1493,8 +2496,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v18, s18
 ; SDAG-NEXT:    v_mov_b32_e32 v19, s19
-; SDAG-NEXT:    v_mov_b32_e32 v20, s28
-; SDAG-NEXT:    v_mov_b32_e32 v21, s29
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v1
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v0
 ; SDAG-NEXT:    v_mov_b32_e32 v4, s20
 ; SDAG-NEXT:    v_mov_b32_e32 v5, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v6, s22
@@ -1503,10 +2506,8 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
 ; SDAG-NEXT:    v_mov_b32_e32 v9, s25
 ; SDAG-NEXT:    v_mov_b32_e32 v10, s26
 ; SDAG-NEXT:    v_mov_b32_e32 v11, s27
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v0
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v1
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, s28
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, s29
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[12:19], v[4:11], a[0:3], v2, v3 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
@@ -1528,16 +2529,14 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
 ; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
 ; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[16:17]
 ; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[18:19]
-; GISEL-NEXT:    v_mov_b32_e32 v20, s28
-; GISEL-NEXT:    v_mov_b32_e32 v21, s29
 ; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v0
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v1
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, s28
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, s29
 ; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[22:23]
 ; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[24:25]
 ; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[26:27]
-; GISEL-NEXT:    v_accvgpr_write_b32 a0, v20
-; GISEL-NEXT:    v_accvgpr_write_b32 a1, v21
-; GISEL-NEXT:    v_accvgpr_write_b32 a2, v0
-; GISEL-NEXT:    v_accvgpr_write_b32 a3, v1
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v3 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
@@ -1563,10 +2562,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
 ; SDAG-NEXT:    v_mov_b32_e32 v19, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v20, s18
 ; SDAG-NEXT:    v_mov_b32_e32 v21, s19
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], s20, v12 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
@@ -1617,10 +2616,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
 ; SDAG-NEXT:    v_mov_b32_e32 v19, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v20, s18
 ; SDAG-NEXT:    v_mov_b32_e32 v21, s19
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, s20 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
@@ -1663,6 +2662,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
 ; SDAG-NEXT:    v_mov_b32_e32 v14, s0
 ; SDAG-NEXT:    v_mov_b32_e32 v15, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s2
@@ -1671,10 +2674,6 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
 ; SDAG-NEXT:    v_mov_b32_e32 v19, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v20, s18
 ; SDAG-NEXT:    v_mov_b32_e32 v21, s19
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, s20 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
@@ -1693,13 +2692,13 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
 ; GISEL-NEXT:    s_mov_b32 s14, s2
 ; GISEL-NEXT:    s_mov_b32 s15, s3
 ; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[12:13]
 ; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
 ; GISEL-NEXT:    v_accvgpr_write_b32 a1, v9
 ; GISEL-NEXT:    v_accvgpr_write_b32 a2, v10
 ; GISEL-NEXT:    v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[12:13]
 ; GISEL-NEXT:    s_nop 1
 ; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, s20 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
@@ -1789,22 +2788,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
 }
 
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 33, -2 op_sel_hi:[0,0,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 33, -2 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 33, -2 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2)
   ret <4 x float> %result
 }
@@ -1813,12 +2829,12 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_movk_i32 s0, 0x41
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_movk_i32 s0, 0x41
+; SDAG-NEXT:    s_nop 0
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, -2 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
@@ -1853,11 +2869,11 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
 ; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    s_movk_i32 s0, 0x41
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
 ; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_movk_i32 s0, 0x41
 ; SDAG-NEXT:    v_mov_b32_e32 v16, 0x4d
 ; SDAG-NEXT:    s_nop 1
 ; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], s0, v16 op_sel_hi:[0,0,0]
@@ -2026,86 +3042,154 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
 
 ; This should be optimized to avoid the scale
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
   ret <4 x float> %result
 }
 
 ; This should be optimized to avoid the scale, with non-0 op_sel arguments.
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 1 op_sel_hi:[0,0,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 1 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 1 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1, 0 op_sel_hi:[0,0,0]
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1, 0 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 1, 0 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0)
   ret <4 x float> %result
 }
@@ -2115,22 +3199,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(
 ; --------------------------------------------------------------------
 
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
                                                                                       i32 2, ; blgp
@@ -2139,22 +3240,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6(
 }
 
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 2, ; cbsz
                                                                                       i32 0, ; blgp
@@ -2163,21 +3281,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8(
 }
 
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 2, ; cbsz
                                                                                       i32 2, ; blgp
@@ -2186,21 +3320,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6(
 }
 
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:2 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:2 blgp:2
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:2 blgp:2
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 2, ; cbsz
                                                                                       i32 2, ; blgp
@@ -2209,22 +3359,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6_
 }
 
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
                                                                                       i32 4, ; blgp
@@ -2233,22 +3400,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4(
 }
 
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 4, ; cbsz
                                                                                       i32 0, ; blgp
@@ -2257,22 +3441,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8(
 }
 
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 0, ; cbsz
                                                                                       i32 4, ; blgp
@@ -2281,22 +3482,39 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4(
 }
 
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 4, ; cbsz
                                                                                       i32 0, ; blgp
@@ -2305,21 +3523,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8(
 }
 
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 4, ; cbsz
                                                                                       i32 4, ; blgp
@@ -2328,21 +3562,37 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4(
 }
 
 define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:4 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:4 blgp:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:4 blgp:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
                                                                                       i32 4, ; cbsz
                                                                                       i32 4, ; blgp
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index 1ab7de7cda935..a4b646b49bef9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -990,331 +990,29 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__cons
 
 ; fp8 x fp6
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    scratch_load_dword v31, off, s32
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v27
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v28
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v29
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
-                                                                                      i32 0, ; cbsz
-                                                                                      i32 2, ; blgp
-                                                                                      i32 0, i32 %scale0, i32 0, i32 %scale1)
-  ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v27
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v28
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v29
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
-                                                                                      i32 0, ; cbsz
-                                                                                      i32 2, ; blgp
-                                                                                      i32 0, i32 0, i32 0, i32 0)
-  ret <16 x float> %result
-}
-
-; fp8 x bf6
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    scratch_load_dword v31, off, s32
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v27
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v28
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v29
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
-                                                                                      i32 0, ; cbsz
-                                                                                      i32 3, ; blgp
-                                                                                      i32 0, i32 %scale0, i32 0, i32 %scale1)
-  ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v27
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v28
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v29
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
-                                                                                      i32 0, ; cbsz
-                                                                                      i32 3, ; blgp
-                                                                                      i32 0, i32 0, i32 0, i32 0)
-  ret <16 x float> %result
-}
-
-; fp8 x fp4
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
-                                                                                      i32 0, ; cbsz
-                                                                                      i32 4, ; blgp
-                                                                                      i32 0, i32 %scale0, i32 0, i32 %scale1)
-  ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
-                                                                                      i32 0, ; cbsz
-                                                                                      i32 4, ; blgp
-                                                                                      i32 0, i32 0, i32 0, i32 0)
-  ret <16 x float> %result
-}
-
-; bf8 x fp8
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0:
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    scratch_load_dword a15, off, s32
-; SDAG-NEXT:    scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT:    scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    scratch_load_dword v14, off, s32
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:2
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
@@ -1336,30 +1034,29 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x
 ; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    scratch_load_dword a15, off, s32
-; GISEL-NEXT:    scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT:    scratch_load_dword v32, off, s32 offset:8
-; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT:    v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT:    v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT:    v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT:    v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT:    v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT:    v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT:    v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT:    v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT:    v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT:    v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT:    v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    scratch_load_dword v14, off, s32
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v29
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:2
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 3
@@ -1380,89 +1077,129 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x
 ; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
 ; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
-  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
-                                                                                      i32 1, ; cbsz
-                                                                                      i32 0, ; blgp
+  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+                                                                                      i32 0, ; cbsz
+                                                                                      i32 2, ; blgp
                                                                                       i32 0, i32 %scale0, i32 0, i32 %scale1)
   ret <16 x float> %result
 }
 
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    scratch_load_dword a15, off, s32
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v27
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v28
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v29
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v30
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
-                                                                                      i32 1, ; cbsz
-                                                                                      i32 0, ; blgp
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:2
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:2
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+                                                                                      i32 0, ; cbsz
+                                                                                      i32 2, ; blgp
                                                                                       i32 0, i32 0, i32 0, i32 0)
   ret <16 x float> %result
 }
 
-; bf8 x bf8
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1:
+; fp8 x bf6
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    scratch_load_dword a15, off, s32
-; SDAG-NEXT:    scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT:    scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    scratch_load_dword v14, off, s32
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_nop 0
-; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:1
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:3
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
@@ -1484,30 +1221,29 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x
 ; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1:
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    scratch_load_dword a15, off, s32
-; GISEL-NEXT:    scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT:    scratch_load_dword v32, off, s32 offset:8
-; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT:    v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT:    v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT:    v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT:    v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT:    v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT:    v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT:    v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT:    v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT:    v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT:    v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT:    v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    scratch_load_dword v14, off, s32
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v29
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_nop 0
-; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 blgp:1
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:3
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 3
@@ -1528,190 +1264,408 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x
 ; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
 ; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
-  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
-                                                                                      i32 1, ; cbsz
-                                                                                      i32 1, ; blgp
+  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+                                                                                      i32 0, ; cbsz
+                                                                                      i32 3, ; blgp
                                                                                       i32 0, i32 %scale0, i32 0, i32 %scale1)
   ret <16 x float> %result
 }
 
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    scratch_load_dword a15, off, s32
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v27
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v28
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v29
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v30
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
-                                                                                      i32 1, ; cbsz
-                                                                                      i32 1, ; blgp
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+                                                                                      i32 0, ; cbsz
+                                                                                      i32 3, ; blgp
                                                                                       i32 0, i32 0, i32 0, i32 0)
   ret <16 x float> %result
 }
 
-; bf8 x fp6
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    scratch_load_dword v31, off, s32
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v27
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v28
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v29
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
-                                                                                      i32 1, ; cbsz
-                                                                                      i32 2, ; blgp
-                                                                                      i32 0, i32 %scale0, i32 0, i32 %scale1)
+; fp8 x fp4
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v27
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
+                                                                                      i32 0, ; cbsz
+                                                                                      i32 4, ; blgp
+                                                                                      i32 0, i32 %scale0, i32 0, i32 %scale1)
   ret <16 x float> %result
 }
 
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v27
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v28
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v29
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
-                                                                                      i32 1, ; cbsz
-                                                                                      i32 2, ; blgp
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] blgp:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v27
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] blgp:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
+                                                                                      i32 0, ; cbsz
+                                                                                      i32 4, ; blgp
                                                                                       i32 0, i32 0, i32 0, i32 0)
   ret <16 x float> %result
 }
 
-; bf8 x bf6
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3:
+; bf8 x fp8
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    scratch_load_dword a15, off, s32
+; SDAG-NEXT:    scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT:    scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    scratch_load_dword a15, off, s32
+; GISEL-NEXT:    scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT:    scratch_load_dword v32, off, s32 offset:8
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_nop 0
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+                                                                                      i32 1, ; cbsz
+                                                                                      i32 0, ; blgp
+                                                                                      i32 0, i32 %scale0, i32 0, i32 %scale1)
+  ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    scratch_load_dword v31, off, s32
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v27
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v28
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v29
+; GCN-NEXT:    scratch_load_dword a15, off, s32
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v20
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v21
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v22
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v23
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v24
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v25
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v26
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v27
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v28
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v29
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v30
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:3
+; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 3
@@ -1732,235 +1686,135 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x
 ; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
 ; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 1, ; cbsz
-                                                                                      i32 3, ; blgp
-                                                                                      i32 0, i32 %scale0, i32 0, i32 %scale1)
+                                                                                      i32 0, ; blgp
+                                                                                      i32 0, i32 0, i32 0, i32 0)
   ret <16 x float> %result
 }
 
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v27
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v28
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v29
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
-                                                                                      i32 1, ; cbsz
-                                                                                      i32 3, ; blgp
-                                                                                      i32 0, i32 0, i32 0, i32 0)
-  ret <16 x float> %result
-}
-
-; bf8 x fp4
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
+; bf8 x bf8
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    scratch_load_dword a15, off, s32
+; SDAG-NEXT:    scratch_load_dword v31, off, s32 offset:8
+; SDAG-NEXT:    scratch_load_dword v32, off, s32 offset:4
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    scratch_load_dword a15, off, s32
+; GISEL-NEXT:    scratch_load_dword v31, off, s32 offset:4
+; GISEL-NEXT:    scratch_load_dword v32, off, s32 offset:8
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_nop 0
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 blgp:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 1, ; cbsz
-                                                                                      i32 4, ; blgp
+                                                                                      i32 1, ; blgp
                                                                                       i32 0, i32 %scale0, i32 0, i32 %scale1)
   ret <16 x float> %result
 }
 
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] cbsz:1 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
-                                                                                      i32 1, ; cbsz
-                                                                                      i32 4, ; blgp
-                                                                                      i32 0, i32 0, i32 0, i32 0)
-  ret <16 x float> %result
-}
 
-; fp6 x fp8
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0:
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    scratch_load_dword v31, off, s32
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v27
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v28
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v29
+; GCN-NEXT:    scratch_load_dword a15, off, s32
+; GCN-NEXT:    v_accvgpr_write_b32 a0, v16
+; GCN-NEXT:    v_accvgpr_write_b32 a1, v17
+; GCN-NEXT:    v_accvgpr_write_b32 a2, v18
+; GCN-NEXT:    v_accvgpr_write_b32 a3, v19
+; GCN-NEXT:    v_accvgpr_write_b32 a4, v20
+; GCN-NEXT:    v_accvgpr_write_b32 a5, v21
+; GCN-NEXT:    v_accvgpr_write_b32 a6, v22
+; GCN-NEXT:    v_accvgpr_write_b32 a7, v23
+; GCN-NEXT:    v_accvgpr_write_b32 a8, v24
+; GCN-NEXT:    v_accvgpr_write_b32 a9, v25
+; GCN-NEXT:    v_accvgpr_write_b32 a10, v26
+; GCN-NEXT:    v_accvgpr_write_b32 a11, v27
+; GCN-NEXT:    v_accvgpr_write_b32 a12, v28
+; GCN-NEXT:    v_accvgpr_write_b32 a13, v29
+; GCN-NEXT:    v_accvgpr_write_b32 a14, v30
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
-                                                                                      i32 2, ; cbsz
-                                                                                      i32 0, ; blgp
-                                                                                      i32 0, i32 %scale0, i32 0, i32 %scale1)
-  ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v27
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v28
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v29
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2
+; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 blgp:1
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 3
@@ -1981,58 +1835,846 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__cons
 ; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
 ; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
-                                                                                      i32 2, ; cbsz
-                                                                                      i32 0, ; blgp
+  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+                                                                                      i32 1, ; cbsz
+                                                                                      i32 1, ; blgp
                                                                                       i32 0, i32 0, i32 0, i32 0)
   ret <16 x float> %result
 }
 
-; fp6 x bf8
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    scratch_load_dword v31, off, s32
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v27
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v28
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v29
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; bf8 x fp6
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    scratch_load_dword v14, off, s32
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:1 blgp:2
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    scratch_load_dword v14, off, s32
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_nop 0
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:1 blgp:2
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+                                                                                      i32 1, ; cbsz
+                                                                                      i32 2, ; blgp
+                                                                                      i32 0, i32 %scale0, i32 0, i32 %scale1)
+  ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:2
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:2
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+                                                                                      i32 1, ; cbsz
+                                                                                      i32 2, ; blgp
+                                                                                      i32 0, i32 0, i32 0, i32 0)
+  ret <16 x float> %result
+}
+
+; bf8 x bf6
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    scratch_load_dword v14, off, s32
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:1 blgp:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    scratch_load_dword v14, off, s32
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_nop 0
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:1 blgp:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+                                                                                      i32 1, ; cbsz
+                                                                                      i32 3, ; blgp
+                                                                                      i32 0, i32 %scale0, i32 0, i32 %scale1)
+  ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
+                                                                                      i32 1, ; cbsz
+                                                                                      i32 3, ; blgp
+                                                                                      i32 0, i32 0, i32 0, i32 0)
+  ret <16 x float> %result
+}
+
+; bf8 x fp4
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v27
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
+                                                                                      i32 1, ; cbsz
+                                                                                      i32 4, ; blgp
+                                                                                      i32 0, i32 %scale0, i32 0, i32 %scale1)
+  ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] cbsz:1 blgp:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v27
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] cbsz:1 blgp:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
+                                                                                      i32 1, ; cbsz
+                                                                                      i32 4, ; blgp
+                                                                                      i32 0, i32 0, i32 0, i32 0)
+  ret <16 x float> %result
+}
+
+; fp6 x fp8
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    scratch_load_dword v14, off, s32
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:2
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    scratch_load_dword v14, off, s32
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_nop 0
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:2
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+                                                                                      i32 2, ; cbsz
+                                                                                      i32 0, ; blgp
+                                                                                      i32 0, i32 %scale0, i32 0, i32 %scale1)
+  ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+                                                                                      i32 2, ; cbsz
+                                                                                      i32 0, ; blgp
+                                                                                      i32 0, i32 0, i32 0, i32 0)
+  ret <16 x float> %result
+}
+
+; fp6 x bf8
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    scratch_load_dword v14, off, s32
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:2 blgp:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    scratch_load_dword v14, off, s32
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_nop 0
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:2 blgp:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 2, ; cbsz
                                                                                       i32 1, ; blgp
@@ -2041,47 +2683,89 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x
 }
 
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v27
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v28
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v29
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 blgp:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 blgp:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 2, ; cbsz
                                                                                       i32 1, ; blgp
@@ -2091,46 +2775,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__cons
 
 ; fp6 x fp6
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v27
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 2, ; cbsz
                                                                                       i32 2, ; blgp
@@ -2139,46 +2864,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x
 }
 
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v27
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 2, ; cbsz
                                                                                       i32 2, ; blgp
@@ -2188,46 +2954,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__cons
 
 ; fp6 x bf6
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v27
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 2, ; cbsz
                                                                                       i32 3, ; blgp
@@ -2236,46 +3043,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3(<6 x
 }
 
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v27
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 2, ; cbsz
                                                                                       i32 3, ; blgp
@@ -2286,98 +3134,184 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__cons
 
 ; bf6 x fp8
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    scratch_load_dword v31, off, s32
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v27
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v28
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v29
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
-                                                                                      i32 3, ; cbsz
-                                                                                      i32 0, ; blgp
-                                                                                      i32 0, i32 %scale0, i32 0, i32 %scale1)
-  ret <16 x float> %result
-}
-
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v27
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v28
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v29
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    scratch_load_dword v14, off, s32
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    scratch_load_dword v14, off, s32
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_nop 0
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
+                                                                                      i32 3, ; cbsz
+                                                                                      i32 0, ; blgp
+                                                                                      i32 0, i32 %scale0, i32 0, i32 %scale1)
+  ret <16 x float> %result
+}
+
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 3, ; cbsz
                                                                                       i32 0, ; blgp
@@ -2387,49 +3321,93 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__cons
 
 ; bf6 x bf8
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    scratch_load_dword v31, off, s32
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v27
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v28
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v29
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    scratch_load_dword v14, off, s32
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:3 blgp:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    scratch_load_dword v14, off, s32
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_nop 0
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:3 blgp:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 3, ; cbsz
                                                                                       i32 1, ; blgp
@@ -2438,47 +3416,89 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1(<6 x
 }
 
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v27
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v28
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v29
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 blgp:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 blgp:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 3, ; cbsz
                                                                                       i32 1, ; blgp
@@ -2488,46 +3508,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__cons
 
 ; bf6 x fp6
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v27
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 3, ; cbsz
                                                                                       i32 2, ; blgp
@@ -2536,46 +3597,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2(<6 x
 }
 
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:2
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v27
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:2
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 3, ; cbsz
                                                                                       i32 2, ; blgp
@@ -2585,46 +3687,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__cons
 
 ; bf6 x fp4
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v11
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v25
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v25
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 3, ; cbsz
                                                                                       i32 4, ; blgp
@@ -2633,46 +3776,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4(<6 x
 }
 
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v11
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v25
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:3 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:3 blgp:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v25
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:3 blgp:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 3, ; cbsz
                                                                                       i32 4, ; blgp
@@ -2682,46 +3866,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__cons
 
 ; bf6 x bf6
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v27
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 3, ; cbsz
                                                                                       i32 3, ; blgp
@@ -2730,46 +3955,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3(<6 x
 }
 
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v27
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 3, ; cbsz
                                                                                       i32 3, ; blgp
@@ -2779,46 +4045,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__cons
 
 ; fp6 x fp4
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v11
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v25
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v25
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 2, ; cbsz
                                                                                       i32 4, ; blgp
@@ -2827,96 +4134,179 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4(<6 x
 }
 
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v11
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v25
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:2 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:2 blgp:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v25
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:2 blgp:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 2, ; cbsz
-                                                                                      i32 4, ; blgp
-                                                                                      i32 0, i32 0, i32 0, i32 0)
-  ret <16 x float> %result
-}
-
-; fp4 x fp8
-define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+                                                                                      i32 4, ; blgp
+                                                                                      i32 0, i32 0, i32 0, i32 0)
+  ret <16 x float> %result
+}
+
+; fp4 x fp8
+define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v27
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 4, ; cbsz
                                                                                       i32 0, ; blgp
@@ -2925,47 +4315,89 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x
 }
 
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v27
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 4, ; cbsz
                                                                                       i32 0, ; blgp
@@ -2975,47 +4407,89 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__cons
 
 ; fp4 x bf8
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v27
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 4, ; cbsz
                                                                                       i32 1, ; blgp
@@ -3024,47 +4498,89 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1(<4 x
 }
 
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 blgp:1
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 blgp:1
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v27
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 blgp:1
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 4, ; cbsz
                                                                                       i32 1, ; blgp
@@ -3074,46 +4590,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__cons
 
 ; fp4 x fp6
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v11
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v25
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v25
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 4, ; cbsz
                                                                                       i32 2, ; blgp
@@ -3122,46 +4679,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2(<4 x
 }
 
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v11
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v25
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:2
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:2
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v25
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:2
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 4, ; cbsz
                                                                                       i32 2, ; blgp
@@ -3171,46 +4769,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__cons
 
 ; fp4 x bf6
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v11
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v25
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v25
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 4, ; cbsz
                                                                                       i32 3, ; blgp
@@ -3219,46 +4858,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x
 }
 
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v11
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v25
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:3
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v10
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:3
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v11
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v25
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:3
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 4, ; cbsz
                                                                                       i32 3, ; blgp
@@ -3268,46 +4948,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__cons
 
 ; fp4 x fp4
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v23
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 4, ; cbsz
                                                                                       i32 4, ; blgp
@@ -3316,46 +5037,87 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x
 }
 
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v8
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v9
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v10
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v11
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v23
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT:    s_nop 1
+; SDAG-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v9
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v11
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v14
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v23
+; GISEL-NEXT:    s_nop 1
+; GISEL-NEXT:    v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 4, ; cbsz
                                                                                       i32 4, ; blgp
@@ -3515,31 +5277,30 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
 ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v32, s0
-; SDAG-NEXT:    v_mov_b32_e32 v33, s1
-; SDAG-NEXT:    v_mov_b32_e32 v34, s2
-; SDAG-NEXT:    v_mov_b32_e32 v35, s3
-; SDAG-NEXT:    v_mov_b32_e32 v36, s16
-; SDAG-NEXT:    v_mov_b32_e32 v37, s17
-; SDAG-NEXT:    v_mov_b32_e32 v38, s18
-; SDAG-NEXT:    v_mov_b32_e32 v39, s19
-; SDAG-NEXT:    v_mov_b32_e32 v16, s28
-; SDAG-NEXT:    v_mov_b32_e32 v31, v13
-; SDAG-NEXT:    v_mov_b32_e32 v30, v12
-; SDAG-NEXT:    v_mov_b32_e32 v29, v11
-; SDAG-NEXT:    v_mov_b32_e32 v28, v10
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; SDAG-NEXT:    v_mov_b32_e32 v24, v6
-; SDAG-NEXT:    v_mov_b32_e32 v23, v5
-; SDAG-NEXT:    v_mov_b32_e32 v22, v4
-; SDAG-NEXT:    v_mov_b32_e32 v21, v3
-; SDAG-NEXT:    v_mov_b32_e32 v20, v2
-; SDAG-NEXT:    v_mov_b32_e32 v19, v1
-; SDAG-NEXT:    v_mov_b32_e32 v18, v0
-; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT:    v_mov_b32_e32 v16, s0
+; SDAG-NEXT:    v_mov_b32_e32 v17, s1
+; SDAG-NEXT:    v_mov_b32_e32 v18, s2
+; SDAG-NEXT:    v_mov_b32_e32 v19, s3
+; SDAG-NEXT:    v_mov_b32_e32 v20, s16
+; SDAG-NEXT:    v_mov_b32_e32 v21, s17
+; SDAG-NEXT:    v_mov_b32_e32 v22, s18
+; SDAG-NEXT:    v_mov_b32_e32 v23, s19
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v10
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v8
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v7
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v6
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v5
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v4
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v3
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v2
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v1
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v0
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, s28
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, s29
 ; SDAG-NEXT:    v_mov_b32_e32 v0, s20
 ; SDAG-NEXT:    v_mov_b32_e32 v1, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v2, s22
@@ -3548,23 +5309,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
 ; SDAG-NEXT:    v_mov_b32_e32 v5, s25
 ; SDAG-NEXT:    v_mov_b32_e32 v6, s26
 ; SDAG-NEXT:    v_mov_b32_e32 v7, s27
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v31
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
@@ -3593,48 +5339,32 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
 ; GISEL-NEXT:    s_mov_b32 s13, s1
 ; GISEL-NEXT:    s_mov_b32 s14, s2
 ; GISEL-NEXT:    s_mov_b32 s15, s3
-; GISEL-NEXT:    v_mov_b32_e32 v18, v0
-; GISEL-NEXT:    v_mov_b32_e32 v19, v1
-; GISEL-NEXT:    v_mov_b32_e32 v20, v2
-; GISEL-NEXT:    v_mov_b32_e32 v21, v3
-; GISEL-NEXT:    v_mov_b32_e32 v22, v4
-; GISEL-NEXT:    v_mov_b32_e32 v23, v5
-; GISEL-NEXT:    v_mov_b32_e32 v24, v6
-; GISEL-NEXT:    v_mov_b32_e32 v25, v7
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v0
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v1
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v2
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v3
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v4
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v5
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v6
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v7
 ; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
 ; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[14:15]
 ; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[16:17]
 ; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[18:19]
-; GISEL-NEXT:    v_mov_b32_e32 v16, s28
-; GISEL-NEXT:    v_mov_b32_e32 v26, v8
-; GISEL-NEXT:    v_mov_b32_e32 v27, v9
-; GISEL-NEXT:    v_mov_b32_e32 v28, v10
-; GISEL-NEXT:    v_mov_b32_e32 v29, v11
-; GISEL-NEXT:    v_mov_b32_e32 v30, v12
-; GISEL-NEXT:    v_mov_b32_e32 v31, v13
-; GISEL-NEXT:    v_mov_b32_e32 v17, s29
-; GISEL-NEXT:    v_mov_b64_e32 v[38:39], s[26:27]
-; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[20:21]
-; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT:    v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT:    v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT:    v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT:    v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT:    v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT:    v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT:    v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT:    v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT:    v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT:    v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT:    v_accvgpr_write_b32 a14, v30
-; GISEL-NEXT:    v_accvgpr_write_b32 a15, v31
+; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[20:21]
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v8
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v9
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v11
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, s28
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, s29
+; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[22:23]
+; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[24:25]
+; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[26:27]
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[32:39], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[16:23], a[0:15], v14, v15 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 3
@@ -3662,33 +5392,33 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr(<8 x i32> inreg %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 inreg %scale0, i32 %scale1) {
 ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v26, s0
-; SDAG-NEXT:    v_mov_b32_e32 v27, s1
-; SDAG-NEXT:    v_mov_b32_e32 v28, s2
-; SDAG-NEXT:    v_mov_b32_e32 v29, s3
-; SDAG-NEXT:    v_mov_b32_e32 v30, s16
-; SDAG-NEXT:    v_mov_b32_e32 v31, s17
-; SDAG-NEXT:    v_mov_b32_e32 v32, s18
-; SDAG-NEXT:    v_mov_b32_e32 v33, s19
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v14
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    v_accvgpr_write_b32 a7, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT:    v_mov_b32_e32 v8, s0
+; SDAG-NEXT:    v_mov_b32_e32 v9, s1
+; SDAG-NEXT:    v_mov_b32_e32 v10, s2
+; SDAG-NEXT:    v_mov_b32_e32 v11, s3
+; SDAG-NEXT:    v_mov_b32_e32 v12, s16
+; SDAG-NEXT:    v_mov_b32_e32 v13, s17
+; SDAG-NEXT:    v_mov_b32_e32 v14, s18
+; SDAG-NEXT:    v_mov_b32_e32 v15, s19
 ; SDAG-NEXT:    v_accvgpr_write_b32 a15, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v16
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], s20, v24 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[8:15], v[0:7], a[0:15], s20, v24 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
@@ -3717,10 +5447,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
 ; GISEL-NEXT:    s_mov_b32 s13, s1
 ; GISEL-NEXT:    s_mov_b32 s14, s2
 ; GISEL-NEXT:    s_mov_b32 s15, s3
-; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[12:13]
 ; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
 ; GISEL-NEXT:    v_accvgpr_write_b32 a1, v9
 ; GISEL-NEXT:    v_accvgpr_write_b32 a2, v10
@@ -3729,6 +5455,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
 ; GISEL-NEXT:    v_accvgpr_write_b32 a5, v13
 ; GISEL-NEXT:    v_accvgpr_write_b32 a6, v14
 ; GISEL-NEXT:    v_accvgpr_write_b32 a7, v15
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[18:19]
 ; GISEL-NEXT:    v_accvgpr_write_b32 a8, v16
 ; GISEL-NEXT:    v_accvgpr_write_b32 a9, v17
 ; GISEL-NEXT:    v_accvgpr_write_b32 a10, v18
@@ -3738,7 +5468,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
 ; GISEL-NEXT:    v_accvgpr_write_b32 a14, v22
 ; GISEL-NEXT:    v_accvgpr_write_b32 a15, v23
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], s20, v24 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[8:15], v[0:7], a[0:15], s20, v24 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 3
@@ -3767,32 +5497,32 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
 ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v26, s0
-; SDAG-NEXT:    v_mov_b32_e32 v27, s1
-; SDAG-NEXT:    v_mov_b32_e32 v28, s2
-; SDAG-NEXT:    v_mov_b32_e32 v29, s3
-; SDAG-NEXT:    v_mov_b32_e32 v30, s16
-; SDAG-NEXT:    v_mov_b32_e32 v31, s17
-; SDAG-NEXT:    v_mov_b32_e32 v32, s18
-; SDAG-NEXT:    v_mov_b32_e32 v33, s19
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v14
 ; SDAG-NEXT:    v_accvgpr_write_b32 a7, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT:    v_mov_b32_e32 v8, s0
+; SDAG-NEXT:    v_mov_b32_e32 v9, s1
+; SDAG-NEXT:    v_mov_b32_e32 v10, s2
+; SDAG-NEXT:    v_mov_b32_e32 v11, s3
+; SDAG-NEXT:    v_mov_b32_e32 v12, s16
+; SDAG-NEXT:    v_mov_b32_e32 v13, s17
+; SDAG-NEXT:    v_mov_b32_e32 v14, s18
+; SDAG-NEXT:    v_mov_b32_e32 v15, s19
 ; SDAG-NEXT:    v_accvgpr_write_b32 a15, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v16
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, s20 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[8:15], v[0:7], a[0:15], v24, s20 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
@@ -3821,10 +5551,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
 ; GISEL-NEXT:    s_mov_b32 s13, s1
 ; GISEL-NEXT:    s_mov_b32 s14, s2
 ; GISEL-NEXT:    s_mov_b32 s15, s3
-; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[12:13]
 ; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
 ; GISEL-NEXT:    v_accvgpr_write_b32 a1, v9
 ; GISEL-NEXT:    v_accvgpr_write_b32 a2, v10
@@ -3833,6 +5559,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
 ; GISEL-NEXT:    v_accvgpr_write_b32 a5, v13
 ; GISEL-NEXT:    v_accvgpr_write_b32 a6, v14
 ; GISEL-NEXT:    v_accvgpr_write_b32 a7, v15
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[12:13]
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[18:19]
 ; GISEL-NEXT:    v_accvgpr_write_b32 a8, v16
 ; GISEL-NEXT:    v_accvgpr_write_b32 a9, v17
 ; GISEL-NEXT:    v_accvgpr_write_b32 a10, v18
@@ -3842,7 +5572,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
 ; GISEL-NEXT:    v_accvgpr_write_b32 a14, v22
 ; GISEL-NEXT:    v_accvgpr_write_b32 a15, v23
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, s20 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[8:15], v[0:7], a[0:15], v24, s20 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 3
@@ -3871,32 +5601,32 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
 ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT:    v_mov_b32_e32 v26, s0
-; SDAG-NEXT:    v_mov_b32_e32 v27, s1
-; SDAG-NEXT:    v_mov_b32_e32 v28, s2
-; SDAG-NEXT:    v_mov_b32_e32 v29, s3
-; SDAG-NEXT:    v_mov_b32_e32 v30, s16
-; SDAG-NEXT:    v_mov_b32_e32 v31, s17
-; SDAG-NEXT:    v_mov_b32_e32 v32, s18
-; SDAG-NEXT:    v_mov_b32_e32 v33, s19
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v22
 ; SDAG-NEXT:    v_accvgpr_write_b32 a15, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v15
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v14
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v10
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v8
+; SDAG-NEXT:    v_mov_b32_e32 v8, s0
+; SDAG-NEXT:    v_mov_b32_e32 v9, s1
+; SDAG-NEXT:    v_mov_b32_e32 v10, s2
+; SDAG-NEXT:    v_mov_b32_e32 v11, s3
+; SDAG-NEXT:    v_mov_b32_e32 v12, s16
+; SDAG-NEXT:    v_mov_b32_e32 v13, s17
+; SDAG-NEXT:    v_mov_b32_e32 v14, s18
+; SDAG-NEXT:    v_mov_b32_e32 v15, s19
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, s20 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v24, s20 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
@@ -3925,11 +5655,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
 ; GISEL-NEXT:    s_mov_b32 s13, s1
 ; GISEL-NEXT:    s_mov_b32 s14, s2
 ; GISEL-NEXT:    s_mov_b32 s15, s3
-; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[18:19]
 ; GISEL-NEXT:    v_accvgpr_write_b32 a0, v8
-; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[26:27], s[12:13]
 ; GISEL-NEXT:    v_accvgpr_write_b32 a1, v9
 ; GISEL-NEXT:    v_accvgpr_write_b32 a2, v10
 ; GISEL-NEXT:    v_accvgpr_write_b32 a3, v11
@@ -3937,6 +5663,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
 ; GISEL-NEXT:    v_accvgpr_write_b32 a5, v13
 ; GISEL-NEXT:    v_accvgpr_write_b32 a6, v14
 ; GISEL-NEXT:    v_accvgpr_write_b32 a7, v15
+; GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[12:13]
 ; GISEL-NEXT:    v_accvgpr_write_b32 a8, v16
 ; GISEL-NEXT:    v_accvgpr_write_b32 a9, v17
 ; GISEL-NEXT:    v_accvgpr_write_b32 a10, v18
@@ -3945,8 +5672,11 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
 ; GISEL-NEXT:    v_accvgpr_write_b32 a13, v21
 ; GISEL-NEXT:    v_accvgpr_write_b32 a14, v22
 ; GISEL-NEXT:    v_accvgpr_write_b32 a15, v23
+; GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[18:19]
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, s20 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v24, s20 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 3
@@ -4021,48 +5751,32 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
 ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v32, s0
-; SDAG-NEXT:    v_mov_b32_e32 v33, s1
-; SDAG-NEXT:    v_mov_b32_e32 v34, s2
-; SDAG-NEXT:    v_mov_b32_e32 v35, s3
-; SDAG-NEXT:    v_mov_b32_e32 v36, s16
-; SDAG-NEXT:    v_mov_b32_e32 v37, s17
-; SDAG-NEXT:    v_mov_b32_e32 v38, s18
-; SDAG-NEXT:    v_mov_b32_e32 v39, s19
-; SDAG-NEXT:    v_mov_b32_e32 v16, s20
-; SDAG-NEXT:    v_mov_b32_e32 v31, v13
-; SDAG-NEXT:    v_mov_b32_e32 v30, v12
-; SDAG-NEXT:    v_mov_b32_e32 v29, v11
-; SDAG-NEXT:    v_mov_b32_e32 v28, v10
-; SDAG-NEXT:    v_mov_b32_e32 v27, v9
-; SDAG-NEXT:    v_mov_b32_e32 v26, v8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s21
-; SDAG-NEXT:    v_mov_b32_e32 v18, s22
-; SDAG-NEXT:    v_mov_b32_e32 v19, s23
-; SDAG-NEXT:    v_mov_b32_e32 v20, s24
-; SDAG-NEXT:    v_mov_b32_e32 v21, s25
-; SDAG-NEXT:    v_mov_b32_e32 v22, s26
-; SDAG-NEXT:    v_mov_b32_e32 v23, s27
-; SDAG-NEXT:    v_mov_b32_e32 v24, s28
-; SDAG-NEXT:    v_mov_b32_e32 v25, s29
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v30
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v31
+; SDAG-NEXT:    v_mov_b32_e32 v16, s0
+; SDAG-NEXT:    v_mov_b32_e32 v17, s1
+; SDAG-NEXT:    v_mov_b32_e32 v18, s2
+; SDAG-NEXT:    v_mov_b32_e32 v19, s3
+; SDAG-NEXT:    v_mov_b32_e32 v20, s16
+; SDAG-NEXT:    v_mov_b32_e32 v21, s17
+; SDAG-NEXT:    v_mov_b32_e32 v22, s18
+; SDAG-NEXT:    v_mov_b32_e32 v23, s19
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v13
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v12
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v11
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v10
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v9
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v8
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, s20
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, s21
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, s22
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, s23
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, s24
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, s25
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, s26
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, s27
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, s28
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, s29
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
@@ -4091,44 +5805,28 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
 ; GISEL-NEXT:    s_mov_b32 s13, s1
 ; GISEL-NEXT:    s_mov_b32 s14, s2
 ; GISEL-NEXT:    s_mov_b32 s15, s3
-; GISEL-NEXT:    v_mov_b64_e32 v[38:39], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[32:33], s[12:13]
-; GISEL-NEXT:    v_mov_b32_e32 v16, s20
-; GISEL-NEXT:    v_mov_b32_e32 v26, v8
-; GISEL-NEXT:    v_mov_b32_e32 v27, v9
-; GISEL-NEXT:    v_mov_b32_e32 v28, v10
-; GISEL-NEXT:    v_mov_b32_e32 v29, v11
-; GISEL-NEXT:    v_mov_b32_e32 v30, v12
-; GISEL-NEXT:    v_mov_b32_e32 v31, v13
-; GISEL-NEXT:    v_mov_b32_e32 v17, s21
-; GISEL-NEXT:    v_mov_b32_e32 v18, s22
-; GISEL-NEXT:    v_mov_b32_e32 v19, s23
-; GISEL-NEXT:    v_mov_b32_e32 v20, s24
-; GISEL-NEXT:    v_mov_b32_e32 v21, s25
-; GISEL-NEXT:    v_mov_b32_e32 v22, s26
-; GISEL-NEXT:    v_mov_b32_e32 v23, s27
-; GISEL-NEXT:    v_mov_b32_e32 v24, s28
-; GISEL-NEXT:    v_mov_b32_e32 v25, s29
-; GISEL-NEXT:    v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT:    v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT:    v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT:    v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT:    v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT:    v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT:    v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT:    v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT:    v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT:    v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT:    v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT:    v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT:    v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT:    v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT:    v_accvgpr_write_b32 a14, v30
-; GISEL-NEXT:    v_accvgpr_write_b32 a15, v31
+; GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[16:17]
+; GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[14:15]
+; GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[12:13]
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v8
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v9
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v10
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v11
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v12
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v13
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, s20
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, s21
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, s22
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, s23
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, s24
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, s25
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, s26
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, s27
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, s28
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, s29
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 7
 ; GISEL-NEXT:    s_nop 3
@@ -5901,49 +7599,93 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8(
 }
 
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    scratch_load_dword v31, off, s32
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v27
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v28
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v29
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    scratch_load_dword v14, off, s32
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    scratch_load_dword v14, off, s32
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_nop 0
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] blgp:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 0, ; cbsz
                                                                                       i32 4, ; blgp
@@ -5952,49 +7694,93 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4(
 }
 
 define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    scratch_load_dword v31, off, s32
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v27
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v28
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v29
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_nop 0
-; GCN-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:4
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_accvgpr_write_b32 a0, v14
+; SDAG-NEXT:    scratch_load_dword v14, off, s32
+; SDAG-NEXT:    v_accvgpr_write_b32 a15, v29
+; SDAG-NEXT:    v_accvgpr_write_b32 a14, v28
+; SDAG-NEXT:    v_accvgpr_write_b32 a13, v27
+; SDAG-NEXT:    v_accvgpr_write_b32 a12, v26
+; SDAG-NEXT:    v_accvgpr_write_b32 a11, v25
+; SDAG-NEXT:    v_accvgpr_write_b32 a10, v24
+; SDAG-NEXT:    v_accvgpr_write_b32 a9, v23
+; SDAG-NEXT:    v_accvgpr_write_b32 a8, v22
+; SDAG-NEXT:    v_accvgpr_write_b32 a7, v21
+; SDAG-NEXT:    v_accvgpr_write_b32 a6, v20
+; SDAG-NEXT:    v_accvgpr_write_b32 a5, v19
+; SDAG-NEXT:    v_accvgpr_write_b32 a4, v18
+; SDAG-NEXT:    v_accvgpr_write_b32 a3, v17
+; SDAG-NEXT:    v_accvgpr_write_b32 a2, v16
+; SDAG-NEXT:    v_accvgpr_write_b32 a1, v15
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    s_nop 0
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:4
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 7
+; SDAG-NEXT:    s_nop 3
+; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_accvgpr_write_b32 a0, v14
+; GISEL-NEXT:    scratch_load_dword v14, off, s32
+; GISEL-NEXT:    v_accvgpr_write_b32 a1, v15
+; GISEL-NEXT:    v_accvgpr_write_b32 a2, v16
+; GISEL-NEXT:    v_accvgpr_write_b32 a3, v17
+; GISEL-NEXT:    v_accvgpr_write_b32 a4, v18
+; GISEL-NEXT:    v_accvgpr_write_b32 a5, v19
+; GISEL-NEXT:    v_accvgpr_write_b32 a6, v20
+; GISEL-NEXT:    v_accvgpr_write_b32 a7, v21
+; GISEL-NEXT:    v_accvgpr_write_b32 a8, v22
+; GISEL-NEXT:    v_accvgpr_write_b32 a9, v23
+; GISEL-NEXT:    v_accvgpr_write_b32 a10, v24
+; GISEL-NEXT:    v_accvgpr_write_b32 a11, v25
+; GISEL-NEXT:    v_accvgpr_write_b32 a12, v26
+; GISEL-NEXT:    v_accvgpr_write_b32 a13, v27
+; GISEL-NEXT:    v_accvgpr_write_b32 a14, v28
+; GISEL-NEXT:    v_accvgpr_write_b32 a15, v29
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_nop 0
+; GISEL-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v14 op_sel_hi:[0,0,0] cbsz:4
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_accvgpr_read_b32 v0, a0
+; GISEL-NEXT:    v_accvgpr_read_b32 v1, a1
+; GISEL-NEXT:    v_accvgpr_read_b32 v2, a2
+; GISEL-NEXT:    v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT:    v_accvgpr_read_b32 v4, a4
+; GISEL-NEXT:    v_accvgpr_read_b32 v5, a5
+; GISEL-NEXT:    v_accvgpr_read_b32 v6, a6
+; GISEL-NEXT:    v_accvgpr_read_b32 v7, a7
+; GISEL-NEXT:    v_accvgpr_read_b32 v8, a8
+; GISEL-NEXT:    v_accvgpr_read_b32 v9, a9
+; GISEL-NEXT:    v_accvgpr_read_b32 v10, a10
+; GISEL-NEXT:    v_accvgpr_read_b32 v11, a11
+; GISEL-NEXT:    v_accvgpr_read_b32 v12, a12
+; GISEL-NEXT:    v_accvgpr_read_b32 v13, a13
+; GISEL-NEXT:    v_accvgpr_read_b32 v14, a14
+; GISEL-NEXT:    v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
                                                                                       i32 4, ; cbsz
                                                                                       i32 0, ; blgp
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 08167cd226a4c..ea4560a6c1c85 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -72,94 +72,46 @@ bb:
 }
 
 define <4 x float> @test_smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x64_f16:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x64_f16:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x64_f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x64_f16__flags0(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__flags0:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__flags0:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x64_f16__flags0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x64_f16__flags1(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__flags1:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__flags1:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x64_f16__flags1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <4 x float> %result
 }
@@ -297,217 +249,85 @@ bb:
 }
 
 define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x32_f16:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x32_f16:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v48, v0
-; GISEL-NEXT:    v_mov_b32_e32 v49, v1
-; GISEL-NEXT:    v_mov_b32_e32 v50, v2
-; GISEL-NEXT:    v_mov_b32_e32 v51, v3
-; GISEL-NEXT:    v_mov_b32_e32 v30, v4
-; GISEL-NEXT:    v_mov_b32_e32 v31, v5
-; GISEL-NEXT:    v_mov_b32_e32 v32, v6
-; GISEL-NEXT:    v_mov_b32_e32 v33, v7
-; GISEL-NEXT:    v_mov_b32_e32 v34, v8
-; GISEL-NEXT:    v_mov_b32_e32 v35, v9
-; GISEL-NEXT:    v_mov_b32_e32 v36, v10
-; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_32x32x32_f16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
 }
 
 define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v48, v0
-; GISEL-NEXT:    v_mov_b32_e32 v49, v1
-; GISEL-NEXT:    v_mov_b32_e32 v50, v2
-; GISEL-NEXT:    v_mov_b32_e32 v51, v3
-; GISEL-NEXT:    v_mov_b32_e32 v30, v4
-; GISEL-NEXT:    v_mov_b32_e32 v31, v5
-; GISEL-NEXT:    v_mov_b32_e32 v32, v6
-; GISEL-NEXT:    v_mov_b32_e32 v33, v7
-; GISEL-NEXT:    v_mov_b32_e32 v34, v8
-; GISEL-NEXT:    v_mov_b32_e32 v35, v9
-; GISEL-NEXT:    v_mov_b32_e32 v36, v10
-; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <16 x float> %result
 }
 
 define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v48, v0
-; GISEL-NEXT:    v_mov_b32_e32 v49, v1
-; GISEL-NEXT:    v_mov_b32_e32 v50, v2
-; GISEL-NEXT:    v_mov_b32_e32 v51, v3
-; GISEL-NEXT:    v_mov_b32_e32 v30, v4
-; GISEL-NEXT:    v_mov_b32_e32 v31, v5
-; GISEL-NEXT:    v_mov_b32_e32 v32, v6
-; GISEL-NEXT:    v_mov_b32_e32 v33, v7
-; GISEL-NEXT:    v_mov_b32_e32 v34, v8
-; GISEL-NEXT:    v_mov_b32_e32 v35, v9
-; GISEL-NEXT:    v_mov_b32_e32 v36, v10
-; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <16 x float> %result
 }
@@ -520,7 +340,6 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0,
 ; SDAG-NEXT:    v_mov_b32_e32 v29, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v30, s2
 ; SDAG-NEXT:    v_mov_b32_e32 v31, s3
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
 ; SDAG-NEXT:    v_mov_b32_e32 v27, v9
 ; SDAG-NEXT:    v_mov_b32_e32 v26, v8
 ; SDAG-NEXT:    v_mov_b32_e32 v25, v7
@@ -531,12 +350,12 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0,
 ; SDAG-NEXT:    v_mov_b32_e32 v20, v2
 ; SDAG-NEXT:    v_mov_b32_e32 v19, v1
 ; SDAG-NEXT:    v_mov_b32_e32 v18, v0
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
 ; SDAG-NEXT:    v_mov_b32_e32 v13, s25
 ; SDAG-NEXT:    v_mov_b32_e32 v14, s26
 ; SDAG-NEXT:    v_mov_b32_e32 v15, s27
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s28
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v0, s16
 ; SDAG-NEXT:    v_mov_b32_e32 v1, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v2, s18
@@ -545,79 +364,73 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0,
 ; SDAG-NEXT:    v_mov_b32_e32 v5, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v6, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v7, s23
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 a[0:15], v[28:31], v[0:7], v10
+; SDAG-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[28:31], v[0:7], v10
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[2:3]
-; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v18, s24
-; GISEL-NEXT:    v_mov_b32_e32 v19, s25
-; GISEL-NEXT:    v_mov_b32_e32 v24, v0
-; GISEL-NEXT:    v_mov_b32_e32 v25, v1
-; GISEL-NEXT:    v_mov_b32_e32 v26, v2
-; GISEL-NEXT:    v_mov_b32_e32 v27, v3
-; GISEL-NEXT:    v_mov_b32_e32 v28, v4
-; GISEL-NEXT:    v_mov_b32_e32 v29, v5
-; GISEL-NEXT:    v_mov_b32_e32 v30, v6
-; GISEL-NEXT:    v_mov_b32_e32 v31, v7
-; GISEL-NEXT:    v_mov_b32_e32 v32, v8
-; GISEL-NEXT:    v_mov_b32_e32 v33, v9
-; GISEL-NEXT:    v_mov_b32_e32 v16, v10
-; GISEL-NEXT:    v_mov_b32_e32 v20, s26
-; GISEL-NEXT:    v_mov_b32_e32 v21, s27
-; GISEL-NEXT:    v_mov_b32_e32 v22, s28
-; GISEL-NEXT:    v_mov_b32_e32 v23, s29
-; GISEL-NEXT:    v_mov_b64_e32 v[54:55], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[52:53], s[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[50:51], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[48:49], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[26:27]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[28:29]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[30:31]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[32:33]
+; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[2:3]
+; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v18, v0
+; GISEL-NEXT:    v_mov_b32_e32 v19, v1
+; GISEL-NEXT:    v_mov_b32_e32 v20, v2
+; GISEL-NEXT:    v_mov_b32_e32 v21, v3
+; GISEL-NEXT:    v_mov_b32_e32 v22, v4
+; GISEL-NEXT:    v_mov_b32_e32 v23, v5
+; GISEL-NEXT:    v_mov_b32_e32 v24, v6
+; GISEL-NEXT:    v_mov_b32_e32 v25, v7
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[16:17]
+; GISEL-NEXT:    v_mov_b32_e32 v26, v8
+; GISEL-NEXT:    v_mov_b32_e32 v27, v9
+; GISEL-NEXT:    v_mov_b32_e32 v12, s24
+; GISEL-NEXT:    v_mov_b32_e32 v13, s25
+; GISEL-NEXT:    v_mov_b32_e32 v14, s26
+; GISEL-NEXT:    v_mov_b32_e32 v15, s27
+; GISEL-NEXT:    v_mov_b32_e32 v16, s28
+; GISEL-NEXT:    v_mov_b32_e32 v17, s29
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[22:23]
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x32_f16 v[0:15], v[34:37], v[48:55], v16
+; GISEL-NEXT:    v_smfmac_f32_32x32x32_f16 v[12:27], v[28:31], v[0:7], v10
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    v_mov_b32_e32 v4, v16
+; GISEL-NEXT:    v_mov_b32_e32 v5, v17
+; GISEL-NEXT:    v_mov_b32_e32 v6, v18
+; GISEL-NEXT:    v_mov_b32_e32 v7, v19
+; GISEL-NEXT:    v_mov_b32_e32 v8, v20
+; GISEL-NEXT:    v_mov_b32_e32 v9, v21
+; GISEL-NEXT:    v_mov_b32_e32 v10, v22
+; GISEL-NEXT:    v_mov_b32_e32 v11, v23
+; GISEL-NEXT:    v_mov_b32_e32 v12, v24
+; GISEL-NEXT:    v_mov_b32_e32 v13, v25
+; GISEL-NEXT:    v_mov_b32_e32 v14, v26
+; GISEL-NEXT:    v_mov_b32_e32 v15, v27
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
@@ -668,17 +481,12 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16(<8 x bfloat> %arg0, <16 x bflo
 ; GCN-LABEL: test_smfmac_f32_16x16x64_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16
+; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <4 x float> %result
@@ -688,17 +496,12 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags0(<8 x bfloat> %arg0, <1
 ; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__flags0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <4 x float> %result
@@ -708,17 +511,12 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags1(<8 x bfloat> %arg0, <1
 ; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__flags1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GCN-NEXT:    v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
 ; GCN-NEXT:    s_nop 7
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <4 x float> %result
@@ -809,42 +607,25 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfl
 ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
@@ -854,42 +635,25 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, <
 ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags0:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <16 x float> %result
@@ -899,42 +663,25 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, <
 ; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags1:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
-; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <16 x float> %result
@@ -948,7 +695,6 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg
 ; GCN-NEXT:    v_mov_b32_e32 v29, s1
 ; GCN-NEXT:    v_mov_b32_e32 v30, s2
 ; GCN-NEXT:    v_mov_b32_e32 v31, s3
-; GCN-NEXT:    v_mov_b32_e32 v12, s24
 ; GCN-NEXT:    v_mov_b32_e32 v27, v9
 ; GCN-NEXT:    v_mov_b32_e32 v26, v8
 ; GCN-NEXT:    v_mov_b32_e32 v25, v7
@@ -959,12 +705,12 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg
 ; GCN-NEXT:    v_mov_b32_e32 v20, v2
 ; GCN-NEXT:    v_mov_b32_e32 v19, v1
 ; GCN-NEXT:    v_mov_b32_e32 v18, v0
+; GCN-NEXT:    v_mov_b32_e32 v12, s24
 ; GCN-NEXT:    v_mov_b32_e32 v13, s25
 ; GCN-NEXT:    v_mov_b32_e32 v14, s26
 ; GCN-NEXT:    v_mov_b32_e32 v15, s27
 ; GCN-NEXT:    v_mov_b32_e32 v16, s28
 ; GCN-NEXT:    v_mov_b32_e32 v17, s29
-; GCN-NEXT:    v_accvgpr_write_b32 a0, v12
 ; GCN-NEXT:    v_mov_b32_e32 v0, s16
 ; GCN-NEXT:    v_mov_b32_e32 v1, s17
 ; GCN-NEXT:    v_mov_b32_e32 v2, s18
@@ -973,41 +719,26 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg
 ; GCN-NEXT:    v_mov_b32_e32 v5, s21
 ; GCN-NEXT:    v_mov_b32_e32 v6, s22
 ; GCN-NEXT:    v_mov_b32_e32 v7, s23
-; GCN-NEXT:    v_accvgpr_write_b32 a1, v13
-; GCN-NEXT:    v_accvgpr_write_b32 a2, v14
-; GCN-NEXT:    v_accvgpr_write_b32 a3, v15
-; GCN-NEXT:    v_accvgpr_write_b32 a4, v16
-; GCN-NEXT:    v_accvgpr_write_b32 a5, v17
-; GCN-NEXT:    v_accvgpr_write_b32 a6, v18
-; GCN-NEXT:    v_accvgpr_write_b32 a7, v19
-; GCN-NEXT:    v_accvgpr_write_b32 a8, v20
-; GCN-NEXT:    v_accvgpr_write_b32 a9, v21
-; GCN-NEXT:    v_accvgpr_write_b32 a10, v22
-; GCN-NEXT:    v_accvgpr_write_b32 a11, v23
-; GCN-NEXT:    v_accvgpr_write_b32 a12, v24
-; GCN-NEXT:    v_accvgpr_write_b32 a13, v25
-; GCN-NEXT:    v_accvgpr_write_b32 a14, v26
-; GCN-NEXT:    v_accvgpr_write_b32 a15, v27
 ; GCN-NEXT:    s_nop 1
-; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 a[0:15], v[28:31], v[0:7], v10
+; GCN-NEXT:    v_smfmac_f32_32x32x32_bf16 v[12:27], v[28:31], v[0:7], v10
 ; GCN-NEXT:    s_nop 7
 ; GCN-NEXT:    s_nop 3
-; GCN-NEXT:    v_accvgpr_read_b32 v0, a0
-; GCN-NEXT:    v_accvgpr_read_b32 v1, a1
-; GCN-NEXT:    v_accvgpr_read_b32 v2, a2
-; GCN-NEXT:    v_accvgpr_read_b32 v3, a3
-; GCN-NEXT:    v_accvgpr_read_b32 v4, a4
-; GCN-NEXT:    v_accvgpr_read_b32 v5, a5
-; GCN-NEXT:    v_accvgpr_read_b32 v6, a6
-; GCN-NEXT:    v_accvgpr_read_b32 v7, a7
-; GCN-NEXT:    v_accvgpr_read_b32 v8, a8
-; GCN-NEXT:    v_accvgpr_read_b32 v9, a9
-; GCN-NEXT:    v_accvgpr_read_b32 v10, a10
-; GCN-NEXT:    v_accvgpr_read_b32 v11, a11
-; GCN-NEXT:    v_accvgpr_read_b32 v12, a12
-; GCN-NEXT:    v_accvgpr_read_b32 v13, a13
-; GCN-NEXT:    v_accvgpr_read_b32 v14, a14
-; GCN-NEXT:    v_accvgpr_read_b32 v15, a15
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
@@ -1087,94 +818,46 @@ bb:
 }
 
 define <4 x i32> @test_smfmac_i32_16x16x128_i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_i32_16x16x128_i8:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_i32_16x16x128_i8:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_i32_16x16x128_i8:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <4 x i32> %result
 }
 
 define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__flags0:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__flags0:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_i32_16x16x128_i8__flags0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <4 x i32> %result
 }
 
 define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__flags1:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__flags1:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_i32_16x16x128_i8__flags1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <4 x i32> %result
 }
@@ -1318,217 +1001,85 @@ bb:
 }
 
 define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_i32_32x32x64_i8:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_i32_32x32x64_i8:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v48, v0
-; GISEL-NEXT:    v_mov_b32_e32 v49, v1
-; GISEL-NEXT:    v_mov_b32_e32 v50, v2
-; GISEL-NEXT:    v_mov_b32_e32 v51, v3
-; GISEL-NEXT:    v_mov_b32_e32 v30, v4
-; GISEL-NEXT:    v_mov_b32_e32 v31, v5
-; GISEL-NEXT:    v_mov_b32_e32 v32, v6
-; GISEL-NEXT:    v_mov_b32_e32 v33, v7
-; GISEL-NEXT:    v_mov_b32_e32 v34, v8
-; GISEL-NEXT:    v_mov_b32_e32 v35, v9
-; GISEL-NEXT:    v_mov_b32_e32 v36, v10
-; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
-  %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
+; GCN-LABEL: test_smfmac_i32_32x32x64_i8:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x i32> %result
 }
 
 define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v48, v0
-; GISEL-NEXT:    v_mov_b32_e32 v49, v1
-; GISEL-NEXT:    v_mov_b32_e32 v50, v2
-; GISEL-NEXT:    v_mov_b32_e32 v51, v3
-; GISEL-NEXT:    v_mov_b32_e32 v30, v4
-; GISEL-NEXT:    v_mov_b32_e32 v31, v5
-; GISEL-NEXT:    v_mov_b32_e32 v32, v6
-; GISEL-NEXT:    v_mov_b32_e32 v33, v7
-; GISEL-NEXT:    v_mov_b32_e32 v34, v8
-; GISEL-NEXT:    v_mov_b32_e32 v35, v9
-; GISEL-NEXT:    v_mov_b32_e32 v36, v10
-; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <16 x i32> %result
 }
 
 define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v48, v0
-; GISEL-NEXT:    v_mov_b32_e32 v49, v1
-; GISEL-NEXT:    v_mov_b32_e32 v50, v2
-; GISEL-NEXT:    v_mov_b32_e32 v51, v3
-; GISEL-NEXT:    v_mov_b32_e32 v30, v4
-; GISEL-NEXT:    v_mov_b32_e32 v31, v5
-; GISEL-NEXT:    v_mov_b32_e32 v32, v6
-; GISEL-NEXT:    v_mov_b32_e32 v33, v7
-; GISEL-NEXT:    v_mov_b32_e32 v34, v8
-; GISEL-NEXT:    v_mov_b32_e32 v35, v9
-; GISEL-NEXT:    v_mov_b32_e32 v36, v10
-; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <16 x i32> %result
 }
@@ -1541,7 +1092,6 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x
 ; SDAG-NEXT:    v_mov_b32_e32 v29, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v30, s2
 ; SDAG-NEXT:    v_mov_b32_e32 v31, s3
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
 ; SDAG-NEXT:    v_mov_b32_e32 v27, v9
 ; SDAG-NEXT:    v_mov_b32_e32 v26, v8
 ; SDAG-NEXT:    v_mov_b32_e32 v25, v7
@@ -1552,12 +1102,12 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x
 ; SDAG-NEXT:    v_mov_b32_e32 v20, v2
 ; SDAG-NEXT:    v_mov_b32_e32 v19, v1
 ; SDAG-NEXT:    v_mov_b32_e32 v18, v0
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
 ; SDAG-NEXT:    v_mov_b32_e32 v13, s25
 ; SDAG-NEXT:    v_mov_b32_e32 v14, s26
 ; SDAG-NEXT:    v_mov_b32_e32 v15, s27
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s28
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v0, s16
 ; SDAG-NEXT:    v_mov_b32_e32 v1, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v2, s18
@@ -1566,79 +1116,73 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x
 ; SDAG-NEXT:    v_mov_b32_e32 v5, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v6, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v7, s23
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 a[0:15], v[28:31], v[0:7], v10
+; SDAG-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[28:31], v[0:7], v10
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[2:3]
-; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v18, s24
-; GISEL-NEXT:    v_mov_b32_e32 v19, s25
-; GISEL-NEXT:    v_mov_b32_e32 v24, v0
-; GISEL-NEXT:    v_mov_b32_e32 v25, v1
-; GISEL-NEXT:    v_mov_b32_e32 v26, v2
-; GISEL-NEXT:    v_mov_b32_e32 v27, v3
-; GISEL-NEXT:    v_mov_b32_e32 v28, v4
-; GISEL-NEXT:    v_mov_b32_e32 v29, v5
-; GISEL-NEXT:    v_mov_b32_e32 v30, v6
-; GISEL-NEXT:    v_mov_b32_e32 v31, v7
-; GISEL-NEXT:    v_mov_b32_e32 v32, v8
-; GISEL-NEXT:    v_mov_b32_e32 v33, v9
-; GISEL-NEXT:    v_mov_b32_e32 v16, v10
-; GISEL-NEXT:    v_mov_b32_e32 v20, s26
-; GISEL-NEXT:    v_mov_b32_e32 v21, s27
-; GISEL-NEXT:    v_mov_b32_e32 v22, s28
-; GISEL-NEXT:    v_mov_b32_e32 v23, s29
-; GISEL-NEXT:    v_mov_b64_e32 v[54:55], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[52:53], s[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[50:51], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[48:49], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[26:27]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[28:29]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[30:31]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[32:33]
+; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[2:3]
+; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v18, v0
+; GISEL-NEXT:    v_mov_b32_e32 v19, v1
+; GISEL-NEXT:    v_mov_b32_e32 v20, v2
+; GISEL-NEXT:    v_mov_b32_e32 v21, v3
+; GISEL-NEXT:    v_mov_b32_e32 v22, v4
+; GISEL-NEXT:    v_mov_b32_e32 v23, v5
+; GISEL-NEXT:    v_mov_b32_e32 v24, v6
+; GISEL-NEXT:    v_mov_b32_e32 v25, v7
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[16:17]
+; GISEL-NEXT:    v_mov_b32_e32 v26, v8
+; GISEL-NEXT:    v_mov_b32_e32 v27, v9
+; GISEL-NEXT:    v_mov_b32_e32 v12, s24
+; GISEL-NEXT:    v_mov_b32_e32 v13, s25
+; GISEL-NEXT:    v_mov_b32_e32 v14, s26
+; GISEL-NEXT:    v_mov_b32_e32 v15, s27
+; GISEL-NEXT:    v_mov_b32_e32 v16, s28
+; GISEL-NEXT:    v_mov_b32_e32 v17, s29
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[22:23]
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_i32_32x32x64_i8 v[0:15], v[34:37], v[48:55], v16
+; GISEL-NEXT:    v_smfmac_i32_32x32x64_i8 v[12:27], v[28:31], v[0:7], v10
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    v_mov_b32_e32 v4, v16
+; GISEL-NEXT:    v_mov_b32_e32 v5, v17
+; GISEL-NEXT:    v_mov_b32_e32 v6, v18
+; GISEL-NEXT:    v_mov_b32_e32 v7, v19
+; GISEL-NEXT:    v_mov_b32_e32 v8, v20
+; GISEL-NEXT:    v_mov_b32_e32 v9, v21
+; GISEL-NEXT:    v_mov_b32_e32 v10, v22
+; GISEL-NEXT:    v_mov_b32_e32 v11, v23
+; GISEL-NEXT:    v_mov_b32_e32 v12, v24
+; GISEL-NEXT:    v_mov_b32_e32 v13, v25
+; GISEL-NEXT:    v_mov_b32_e32 v14, v26
+; GISEL-NEXT:    v_mov_b32_e32 v15, v27
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x i32> %result
@@ -1718,94 +1262,46 @@ bb:
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <4 x float> %result
 }
@@ -1935,94 +1431,46 @@ bb:
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <4 x float> %result
 }
@@ -2152,94 +1600,46 @@ bb:
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <4 x float> %result
 }
@@ -2369,94 +1769,46 @@ bb:
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <4 x float> %result
 }
 
 define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GISEL-NEXT:    s_nop 7
-; GISEL-NEXT:    v_mov_b32_e32 v0, v12
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v14
-; GISEL-NEXT:    v_mov_b32_e32 v3, v15
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <4 x float> %result
 }
@@ -2600,217 +1952,85 @@ bb:
 }
 
 define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v48, v0
-; GISEL-NEXT:    v_mov_b32_e32 v49, v1
-; GISEL-NEXT:    v_mov_b32_e32 v50, v2
-; GISEL-NEXT:    v_mov_b32_e32 v51, v3
-; GISEL-NEXT:    v_mov_b32_e32 v30, v4
-; GISEL-NEXT:    v_mov_b32_e32 v31, v5
-; GISEL-NEXT:    v_mov_b32_e32 v32, v6
-; GISEL-NEXT:    v_mov_b32_e32 v33, v7
-; GISEL-NEXT:    v_mov_b32_e32 v34, v8
-; GISEL-NEXT:    v_mov_b32_e32 v35, v9
-; GISEL-NEXT:    v_mov_b32_e32 v36, v10
-; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
 }
 
 define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v48, v0
-; GISEL-NEXT:    v_mov_b32_e32 v49, v1
-; GISEL-NEXT:    v_mov_b32_e32 v50, v2
-; GISEL-NEXT:    v_mov_b32_e32 v51, v3
-; GISEL-NEXT:    v_mov_b32_e32 v30, v4
-; GISEL-NEXT:    v_mov_b32_e32 v31, v5
-; GISEL-NEXT:    v_mov_b32_e32 v32, v6
-; GISEL-NEXT:    v_mov_b32_e32 v33, v7
-; GISEL-NEXT:    v_mov_b32_e32 v34, v8
-; GISEL-NEXT:    v_mov_b32_e32 v35, v9
-; GISEL-NEXT:    v_mov_b32_e32 v36, v10
-; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <16 x float> %result
 }
 
 define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v48, v0
-; GISEL-NEXT:    v_mov_b32_e32 v49, v1
-; GISEL-NEXT:    v_mov_b32_e32 v50, v2
-; GISEL-NEXT:    v_mov_b32_e32 v51, v3
-; GISEL-NEXT:    v_mov_b32_e32 v30, v4
-; GISEL-NEXT:    v_mov_b32_e32 v31, v5
-; GISEL-NEXT:    v_mov_b32_e32 v32, v6
-; GISEL-NEXT:    v_mov_b32_e32 v33, v7
-; GISEL-NEXT:    v_mov_b32_e32 v34, v8
-; GISEL-NEXT:    v_mov_b32_e32 v35, v9
-; GISEL-NEXT:    v_mov_b32_e32 v36, v10
-; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <16 x float> %result
 }
@@ -2823,7 +2043,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v29, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v30, s2
 ; SDAG-NEXT:    v_mov_b32_e32 v31, s3
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
 ; SDAG-NEXT:    v_mov_b32_e32 v27, v9
 ; SDAG-NEXT:    v_mov_b32_e32 v26, v8
 ; SDAG-NEXT:    v_mov_b32_e32 v25, v7
@@ -2834,12 +2053,12 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v20, v2
 ; SDAG-NEXT:    v_mov_b32_e32 v19, v1
 ; SDAG-NEXT:    v_mov_b32_e32 v18, v0
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
 ; SDAG-NEXT:    v_mov_b32_e32 v13, s25
 ; SDAG-NEXT:    v_mov_b32_e32 v14, s26
 ; SDAG-NEXT:    v_mov_b32_e32 v15, s27
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s28
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v0, s16
 ; SDAG-NEXT:    v_mov_b32_e32 v1, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v2, s18
@@ -2848,79 +2067,73 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v5, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v6, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v7, s23
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[28:31], v[0:7], v10
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[28:31], v[0:7], v10
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[2:3]
-; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v18, s24
-; GISEL-NEXT:    v_mov_b32_e32 v19, s25
-; GISEL-NEXT:    v_mov_b32_e32 v24, v0
-; GISEL-NEXT:    v_mov_b32_e32 v25, v1
-; GISEL-NEXT:    v_mov_b32_e32 v26, v2
-; GISEL-NEXT:    v_mov_b32_e32 v27, v3
-; GISEL-NEXT:    v_mov_b32_e32 v28, v4
-; GISEL-NEXT:    v_mov_b32_e32 v29, v5
-; GISEL-NEXT:    v_mov_b32_e32 v30, v6
-; GISEL-NEXT:    v_mov_b32_e32 v31, v7
-; GISEL-NEXT:    v_mov_b32_e32 v32, v8
-; GISEL-NEXT:    v_mov_b32_e32 v33, v9
-; GISEL-NEXT:    v_mov_b32_e32 v16, v10
-; GISEL-NEXT:    v_mov_b32_e32 v20, s26
-; GISEL-NEXT:    v_mov_b32_e32 v21, s27
-; GISEL-NEXT:    v_mov_b32_e32 v22, s28
-; GISEL-NEXT:    v_mov_b32_e32 v23, s29
-; GISEL-NEXT:    v_mov_b64_e32 v[54:55], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[52:53], s[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[50:51], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[48:49], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[26:27]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[28:29]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[30:31]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[32:33]
+; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[2:3]
+; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v18, v0
+; GISEL-NEXT:    v_mov_b32_e32 v19, v1
+; GISEL-NEXT:    v_mov_b32_e32 v20, v2
+; GISEL-NEXT:    v_mov_b32_e32 v21, v3
+; GISEL-NEXT:    v_mov_b32_e32 v22, v4
+; GISEL-NEXT:    v_mov_b32_e32 v23, v5
+; GISEL-NEXT:    v_mov_b32_e32 v24, v6
+; GISEL-NEXT:    v_mov_b32_e32 v25, v7
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[16:17]
+; GISEL-NEXT:    v_mov_b32_e32 v26, v8
+; GISEL-NEXT:    v_mov_b32_e32 v27, v9
+; GISEL-NEXT:    v_mov_b32_e32 v12, s24
+; GISEL-NEXT:    v_mov_b32_e32 v13, s25
+; GISEL-NEXT:    v_mov_b32_e32 v14, s26
+; GISEL-NEXT:    v_mov_b32_e32 v15, s27
+; GISEL-NEXT:    v_mov_b32_e32 v16, s28
+; GISEL-NEXT:    v_mov_b32_e32 v17, s29
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[22:23]
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[34:37], v[48:55], v16
+; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[28:31], v[0:7], v10
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    v_mov_b32_e32 v4, v16
+; GISEL-NEXT:    v_mov_b32_e32 v5, v17
+; GISEL-NEXT:    v_mov_b32_e32 v6, v18
+; GISEL-NEXT:    v_mov_b32_e32 v7, v19
+; GISEL-NEXT:    v_mov_b32_e32 v8, v20
+; GISEL-NEXT:    v_mov_b32_e32 v9, v21
+; GISEL-NEXT:    v_mov_b32_e32 v10, v22
+; GISEL-NEXT:    v_mov_b32_e32 v11, v23
+; GISEL-NEXT:    v_mov_b32_e32 v12, v24
+; GISEL-NEXT:    v_mov_b32_e32 v13, v25
+; GISEL-NEXT:    v_mov_b32_e32 v14, v26
+; GISEL-NEXT:    v_mov_b32_e32 v15, v27
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
@@ -3014,217 +2227,85 @@ bb:
 }
 
 define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v48, v0
-; GISEL-NEXT:    v_mov_b32_e32 v49, v1
-; GISEL-NEXT:    v_mov_b32_e32 v50, v2
-; GISEL-NEXT:    v_mov_b32_e32 v51, v3
-; GISEL-NEXT:    v_mov_b32_e32 v30, v4
-; GISEL-NEXT:    v_mov_b32_e32 v31, v5
-; GISEL-NEXT:    v_mov_b32_e32 v32, v6
-; GISEL-NEXT:    v_mov_b32_e32 v33, v7
-; GISEL-NEXT:    v_mov_b32_e32 v34, v8
-; GISEL-NEXT:    v_mov_b32_e32 v35, v9
-; GISEL-NEXT:    v_mov_b32_e32 v36, v10
-; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
 }
 
 define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v48, v0
-; GISEL-NEXT:    v_mov_b32_e32 v49, v1
-; GISEL-NEXT:    v_mov_b32_e32 v50, v2
-; GISEL-NEXT:    v_mov_b32_e32 v51, v3
-; GISEL-NEXT:    v_mov_b32_e32 v30, v4
-; GISEL-NEXT:    v_mov_b32_e32 v31, v5
-; GISEL-NEXT:    v_mov_b32_e32 v32, v6
-; GISEL-NEXT:    v_mov_b32_e32 v33, v7
-; GISEL-NEXT:    v_mov_b32_e32 v34, v8
-; GISEL-NEXT:    v_mov_b32_e32 v35, v9
-; GISEL-NEXT:    v_mov_b32_e32 v36, v10
-; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <16 x float> %result
 }
 
 define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v48, v0
-; GISEL-NEXT:    v_mov_b32_e32 v49, v1
-; GISEL-NEXT:    v_mov_b32_e32 v50, v2
-; GISEL-NEXT:    v_mov_b32_e32 v51, v3
-; GISEL-NEXT:    v_mov_b32_e32 v30, v4
-; GISEL-NEXT:    v_mov_b32_e32 v31, v5
-; GISEL-NEXT:    v_mov_b32_e32 v32, v6
-; GISEL-NEXT:    v_mov_b32_e32 v33, v7
-; GISEL-NEXT:    v_mov_b32_e32 v34, v8
-; GISEL-NEXT:    v_mov_b32_e32 v35, v9
-; GISEL-NEXT:    v_mov_b32_e32 v36, v10
-; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <16 x float> %result
 }
@@ -3237,7 +2318,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v29, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v30, s2
 ; SDAG-NEXT:    v_mov_b32_e32 v31, s3
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
 ; SDAG-NEXT:    v_mov_b32_e32 v27, v9
 ; SDAG-NEXT:    v_mov_b32_e32 v26, v8
 ; SDAG-NEXT:    v_mov_b32_e32 v25, v7
@@ -3248,12 +2328,12 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v20, v2
 ; SDAG-NEXT:    v_mov_b32_e32 v19, v1
 ; SDAG-NEXT:    v_mov_b32_e32 v18, v0
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
 ; SDAG-NEXT:    v_mov_b32_e32 v13, s25
 ; SDAG-NEXT:    v_mov_b32_e32 v14, s26
 ; SDAG-NEXT:    v_mov_b32_e32 v15, s27
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s28
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v0, s16
 ; SDAG-NEXT:    v_mov_b32_e32 v1, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v2, s18
@@ -3262,79 +2342,73 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v5, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v6, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v7, s23
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[28:31], v[0:7], v10
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[28:31], v[0:7], v10
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[2:3]
-; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v18, s24
-; GISEL-NEXT:    v_mov_b32_e32 v19, s25
-; GISEL-NEXT:    v_mov_b32_e32 v24, v0
-; GISEL-NEXT:    v_mov_b32_e32 v25, v1
-; GISEL-NEXT:    v_mov_b32_e32 v26, v2
-; GISEL-NEXT:    v_mov_b32_e32 v27, v3
-; GISEL-NEXT:    v_mov_b32_e32 v28, v4
-; GISEL-NEXT:    v_mov_b32_e32 v29, v5
-; GISEL-NEXT:    v_mov_b32_e32 v30, v6
-; GISEL-NEXT:    v_mov_b32_e32 v31, v7
-; GISEL-NEXT:    v_mov_b32_e32 v32, v8
-; GISEL-NEXT:    v_mov_b32_e32 v33, v9
-; GISEL-NEXT:    v_mov_b32_e32 v16, v10
-; GISEL-NEXT:    v_mov_b32_e32 v20, s26
-; GISEL-NEXT:    v_mov_b32_e32 v21, s27
-; GISEL-NEXT:    v_mov_b32_e32 v22, s28
-; GISEL-NEXT:    v_mov_b32_e32 v23, s29
-; GISEL-NEXT:    v_mov_b64_e32 v[54:55], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[52:53], s[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[50:51], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[48:49], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[26:27]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[28:29]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[30:31]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[32:33]
+; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[2:3]
+; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v18, v0
+; GISEL-NEXT:    v_mov_b32_e32 v19, v1
+; GISEL-NEXT:    v_mov_b32_e32 v20, v2
+; GISEL-NEXT:    v_mov_b32_e32 v21, v3
+; GISEL-NEXT:    v_mov_b32_e32 v22, v4
+; GISEL-NEXT:    v_mov_b32_e32 v23, v5
+; GISEL-NEXT:    v_mov_b32_e32 v24, v6
+; GISEL-NEXT:    v_mov_b32_e32 v25, v7
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[16:17]
+; GISEL-NEXT:    v_mov_b32_e32 v26, v8
+; GISEL-NEXT:    v_mov_b32_e32 v27, v9
+; GISEL-NEXT:    v_mov_b32_e32 v12, s24
+; GISEL-NEXT:    v_mov_b32_e32 v13, s25
+; GISEL-NEXT:    v_mov_b32_e32 v14, s26
+; GISEL-NEXT:    v_mov_b32_e32 v15, s27
+; GISEL-NEXT:    v_mov_b32_e32 v16, s28
+; GISEL-NEXT:    v_mov_b32_e32 v17, s29
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[22:23]
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[34:37], v[48:55], v16
+; GISEL-NEXT:    v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[28:31], v[0:7], v10
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    v_mov_b32_e32 v4, v16
+; GISEL-NEXT:    v_mov_b32_e32 v5, v17
+; GISEL-NEXT:    v_mov_b32_e32 v6, v18
+; GISEL-NEXT:    v_mov_b32_e32 v7, v19
+; GISEL-NEXT:    v_mov_b32_e32 v8, v20
+; GISEL-NEXT:    v_mov_b32_e32 v9, v21
+; GISEL-NEXT:    v_mov_b32_e32 v10, v22
+; GISEL-NEXT:    v_mov_b32_e32 v11, v23
+; GISEL-NEXT:    v_mov_b32_e32 v12, v24
+; GISEL-NEXT:    v_mov_b32_e32 v13, v25
+; GISEL-NEXT:    v_mov_b32_e32 v14, v26
+; GISEL-NEXT:    v_mov_b32_e32 v15, v27
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
@@ -3428,217 +2502,85 @@ bb:
 }
 
 define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v48, v0
-; GISEL-NEXT:    v_mov_b32_e32 v49, v1
-; GISEL-NEXT:    v_mov_b32_e32 v50, v2
-; GISEL-NEXT:    v_mov_b32_e32 v51, v3
-; GISEL-NEXT:    v_mov_b32_e32 v30, v4
-; GISEL-NEXT:    v_mov_b32_e32 v31, v5
-; GISEL-NEXT:    v_mov_b32_e32 v32, v6
-; GISEL-NEXT:    v_mov_b32_e32 v33, v7
-; GISEL-NEXT:    v_mov_b32_e32 v34, v8
-; GISEL-NEXT:    v_mov_b32_e32 v35, v9
-; GISEL-NEXT:    v_mov_b32_e32 v36, v10
-; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
 }
 
 define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v48, v0
-; GISEL-NEXT:    v_mov_b32_e32 v49, v1
-; GISEL-NEXT:    v_mov_b32_e32 v50, v2
-; GISEL-NEXT:    v_mov_b32_e32 v51, v3
-; GISEL-NEXT:    v_mov_b32_e32 v30, v4
-; GISEL-NEXT:    v_mov_b32_e32 v31, v5
-; GISEL-NEXT:    v_mov_b32_e32 v32, v6
-; GISEL-NEXT:    v_mov_b32_e32 v33, v7
-; GISEL-NEXT:    v_mov_b32_e32 v34, v8
-; GISEL-NEXT:    v_mov_b32_e32 v35, v9
-; GISEL-NEXT:    v_mov_b32_e32 v36, v10
-; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <16 x float> %result
 }
 
 define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v48, v0
-; GISEL-NEXT:    v_mov_b32_e32 v49, v1
-; GISEL-NEXT:    v_mov_b32_e32 v50, v2
-; GISEL-NEXT:    v_mov_b32_e32 v51, v3
-; GISEL-NEXT:    v_mov_b32_e32 v30, v4
-; GISEL-NEXT:    v_mov_b32_e32 v31, v5
-; GISEL-NEXT:    v_mov_b32_e32 v32, v6
-; GISEL-NEXT:    v_mov_b32_e32 v33, v7
-; GISEL-NEXT:    v_mov_b32_e32 v34, v8
-; GISEL-NEXT:    v_mov_b32_e32 v35, v9
-; GISEL-NEXT:    v_mov_b32_e32 v36, v10
-; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <16 x float> %result
 }
@@ -3651,7 +2593,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v29, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v30, s2
 ; SDAG-NEXT:    v_mov_b32_e32 v31, s3
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
 ; SDAG-NEXT:    v_mov_b32_e32 v27, v9
 ; SDAG-NEXT:    v_mov_b32_e32 v26, v8
 ; SDAG-NEXT:    v_mov_b32_e32 v25, v7
@@ -3662,12 +2603,12 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v20, v2
 ; SDAG-NEXT:    v_mov_b32_e32 v19, v1
 ; SDAG-NEXT:    v_mov_b32_e32 v18, v0
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
 ; SDAG-NEXT:    v_mov_b32_e32 v13, s25
 ; SDAG-NEXT:    v_mov_b32_e32 v14, s26
 ; SDAG-NEXT:    v_mov_b32_e32 v15, s27
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s28
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v0, s16
 ; SDAG-NEXT:    v_mov_b32_e32 v1, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v2, s18
@@ -3676,79 +2617,73 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v5, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v6, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v7, s23
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[28:31], v[0:7], v10
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[28:31], v[0:7], v10
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[2:3]
-; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v18, s24
-; GISEL-NEXT:    v_mov_b32_e32 v19, s25
-; GISEL-NEXT:    v_mov_b32_e32 v24, v0
-; GISEL-NEXT:    v_mov_b32_e32 v25, v1
-; GISEL-NEXT:    v_mov_b32_e32 v26, v2
-; GISEL-NEXT:    v_mov_b32_e32 v27, v3
-; GISEL-NEXT:    v_mov_b32_e32 v28, v4
-; GISEL-NEXT:    v_mov_b32_e32 v29, v5
-; GISEL-NEXT:    v_mov_b32_e32 v30, v6
-; GISEL-NEXT:    v_mov_b32_e32 v31, v7
-; GISEL-NEXT:    v_mov_b32_e32 v32, v8
-; GISEL-NEXT:    v_mov_b32_e32 v33, v9
-; GISEL-NEXT:    v_mov_b32_e32 v16, v10
-; GISEL-NEXT:    v_mov_b32_e32 v20, s26
-; GISEL-NEXT:    v_mov_b32_e32 v21, s27
-; GISEL-NEXT:    v_mov_b32_e32 v22, s28
-; GISEL-NEXT:    v_mov_b32_e32 v23, s29
-; GISEL-NEXT:    v_mov_b64_e32 v[54:55], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[52:53], s[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[50:51], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[48:49], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[26:27]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[28:29]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[30:31]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[32:33]
+; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[2:3]
+; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v18, v0
+; GISEL-NEXT:    v_mov_b32_e32 v19, v1
+; GISEL-NEXT:    v_mov_b32_e32 v20, v2
+; GISEL-NEXT:    v_mov_b32_e32 v21, v3
+; GISEL-NEXT:    v_mov_b32_e32 v22, v4
+; GISEL-NEXT:    v_mov_b32_e32 v23, v5
+; GISEL-NEXT:    v_mov_b32_e32 v24, v6
+; GISEL-NEXT:    v_mov_b32_e32 v25, v7
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[16:17]
+; GISEL-NEXT:    v_mov_b32_e32 v26, v8
+; GISEL-NEXT:    v_mov_b32_e32 v27, v9
+; GISEL-NEXT:    v_mov_b32_e32 v12, s24
+; GISEL-NEXT:    v_mov_b32_e32 v13, s25
+; GISEL-NEXT:    v_mov_b32_e32 v14, s26
+; GISEL-NEXT:    v_mov_b32_e32 v15, s27
+; GISEL-NEXT:    v_mov_b32_e32 v16, s28
+; GISEL-NEXT:    v_mov_b32_e32 v17, s29
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[22:23]
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[34:37], v[48:55], v16
+; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[28:31], v[0:7], v10
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    v_mov_b32_e32 v4, v16
+; GISEL-NEXT:    v_mov_b32_e32 v5, v17
+; GISEL-NEXT:    v_mov_b32_e32 v6, v18
+; GISEL-NEXT:    v_mov_b32_e32 v7, v19
+; GISEL-NEXT:    v_mov_b32_e32 v8, v20
+; GISEL-NEXT:    v_mov_b32_e32 v9, v21
+; GISEL-NEXT:    v_mov_b32_e32 v10, v22
+; GISEL-NEXT:    v_mov_b32_e32 v11, v23
+; GISEL-NEXT:    v_mov_b32_e32 v12, v24
+; GISEL-NEXT:    v_mov_b32_e32 v13, v25
+; GISEL-NEXT:    v_mov_b32_e32 v14, v26
+; GISEL-NEXT:    v_mov_b32_e32 v15, v27
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
@@ -3842,217 +2777,85 @@ bb:
 }
 
 define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v48, v0
-; GISEL-NEXT:    v_mov_b32_e32 v49, v1
-; GISEL-NEXT:    v_mov_b32_e32 v50, v2
-; GISEL-NEXT:    v_mov_b32_e32 v51, v3
-; GISEL-NEXT:    v_mov_b32_e32 v30, v4
-; GISEL-NEXT:    v_mov_b32_e32 v31, v5
-; GISEL-NEXT:    v_mov_b32_e32 v32, v6
-; GISEL-NEXT:    v_mov_b32_e32 v33, v7
-; GISEL-NEXT:    v_mov_b32_e32 v34, v8
-; GISEL-NEXT:    v_mov_b32_e32 v35, v9
-; GISEL-NEXT:    v_mov_b32_e32 v36, v10
-; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
 }
 
 define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v48, v0
-; GISEL-NEXT:    v_mov_b32_e32 v49, v1
-; GISEL-NEXT:    v_mov_b32_e32 v50, v2
-; GISEL-NEXT:    v_mov_b32_e32 v51, v3
-; GISEL-NEXT:    v_mov_b32_e32 v30, v4
-; GISEL-NEXT:    v_mov_b32_e32 v31, v5
-; GISEL-NEXT:    v_mov_b32_e32 v32, v6
-; GISEL-NEXT:    v_mov_b32_e32 v33, v7
-; GISEL-NEXT:    v_mov_b32_e32 v34, v8
-; GISEL-NEXT:    v_mov_b32_e32 v35, v9
-; GISEL-NEXT:    v_mov_b32_e32 v36, v10
-; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
   ret <16 x float> %result
 }
 
 define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3) {
-; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
-; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
-; SDAG-NEXT:    s_nop 7
-; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
-; SDAG-NEXT:    s_setpc_b64 s[30:31]
-;
-; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b32_e32 v48, v0
-; GISEL-NEXT:    v_mov_b32_e32 v49, v1
-; GISEL-NEXT:    v_mov_b32_e32 v50, v2
-; GISEL-NEXT:    v_mov_b32_e32 v51, v3
-; GISEL-NEXT:    v_mov_b32_e32 v30, v4
-; GISEL-NEXT:    v_mov_b32_e32 v31, v5
-; GISEL-NEXT:    v_mov_b32_e32 v32, v6
-; GISEL-NEXT:    v_mov_b32_e32 v33, v7
-; GISEL-NEXT:    v_mov_b32_e32 v34, v8
-; GISEL-NEXT:    v_mov_b32_e32 v35, v9
-; GISEL-NEXT:    v_mov_b32_e32 v36, v10
-; GISEL-NEXT:    v_mov_b32_e32 v37, v11
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[12:13]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[26:27]
-; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
-; GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GCN-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; GCN-NEXT:    s_nop 7
+; GCN-NEXT:    s_nop 3
+; GCN-NEXT:    v_mov_b32_e32 v0, v12
+; GCN-NEXT:    v_mov_b32_e32 v1, v13
+; GCN-NEXT:    v_mov_b32_e32 v2, v14
+; GCN-NEXT:    v_mov_b32_e32 v3, v15
+; GCN-NEXT:    v_mov_b32_e32 v4, v16
+; GCN-NEXT:    v_mov_b32_e32 v5, v17
+; GCN-NEXT:    v_mov_b32_e32 v6, v18
+; GCN-NEXT:    v_mov_b32_e32 v7, v19
+; GCN-NEXT:    v_mov_b32_e32 v8, v20
+; GCN-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NEXT:    v_mov_b32_e32 v10, v22
+; GCN-NEXT:    v_mov_b32_e32 v11, v23
+; GCN-NEXT:    v_mov_b32_e32 v12, v24
+; GCN-NEXT:    v_mov_b32_e32 v13, v25
+; GCN-NEXT:    v_mov_b32_e32 v14, v26
+; GCN-NEXT:    v_mov_b32_e32 v15, v27
+; GCN-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
   ret <16 x float> %result
 }
@@ -4065,7 +2868,6 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v29, s1
 ; SDAG-NEXT:    v_mov_b32_e32 v30, s2
 ; SDAG-NEXT:    v_mov_b32_e32 v31, s3
-; SDAG-NEXT:    v_mov_b32_e32 v12, s24
 ; SDAG-NEXT:    v_mov_b32_e32 v27, v9
 ; SDAG-NEXT:    v_mov_b32_e32 v26, v8
 ; SDAG-NEXT:    v_mov_b32_e32 v25, v7
@@ -4076,12 +2878,12 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v20, v2
 ; SDAG-NEXT:    v_mov_b32_e32 v19, v1
 ; SDAG-NEXT:    v_mov_b32_e32 v18, v0
+; SDAG-NEXT:    v_mov_b32_e32 v12, s24
 ; SDAG-NEXT:    v_mov_b32_e32 v13, s25
 ; SDAG-NEXT:    v_mov_b32_e32 v14, s26
 ; SDAG-NEXT:    v_mov_b32_e32 v15, s27
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s28
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s29
-; SDAG-NEXT:    v_accvgpr_write_b32 a0, v12
 ; SDAG-NEXT:    v_mov_b32_e32 v0, s16
 ; SDAG-NEXT:    v_mov_b32_e32 v1, s17
 ; SDAG-NEXT:    v_mov_b32_e32 v2, s18
@@ -4090,79 +2892,73 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
 ; SDAG-NEXT:    v_mov_b32_e32 v5, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v6, s22
 ; SDAG-NEXT:    v_mov_b32_e32 v7, s23
-; SDAG-NEXT:    v_accvgpr_write_b32 a1, v13
-; SDAG-NEXT:    v_accvgpr_write_b32 a2, v14
-; SDAG-NEXT:    v_accvgpr_write_b32 a3, v15
-; SDAG-NEXT:    v_accvgpr_write_b32 a4, v16
-; SDAG-NEXT:    v_accvgpr_write_b32 a5, v17
-; SDAG-NEXT:    v_accvgpr_write_b32 a6, v18
-; SDAG-NEXT:    v_accvgpr_write_b32 a7, v19
-; SDAG-NEXT:    v_accvgpr_write_b32 a8, v20
-; SDAG-NEXT:    v_accvgpr_write_b32 a9, v21
-; SDAG-NEXT:    v_accvgpr_write_b32 a10, v22
-; SDAG-NEXT:    v_accvgpr_write_b32 a11, v23
-; SDAG-NEXT:    v_accvgpr_write_b32 a12, v24
-; SDAG-NEXT:    v_accvgpr_write_b32 a13, v25
-; SDAG-NEXT:    v_accvgpr_write_b32 a14, v26
-; SDAG-NEXT:    v_accvgpr_write_b32 a15, v27
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[28:31], v[0:7], v10
+; SDAG-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[28:31], v[0:7], v10
 ; SDAG-NEXT:    s_nop 7
 ; SDAG-NEXT:    s_nop 3
-; SDAG-NEXT:    v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT:    v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT:    v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT:    v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT:    v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT:    v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT:    v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT:    v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT:    v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT:    v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT:    v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT:    v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT:    v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT:    v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT:    v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT:    v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT:    v_mov_b32_e32 v0, v12
+; SDAG-NEXT:    v_mov_b32_e32 v1, v13
+; SDAG-NEXT:    v_mov_b32_e32 v2, v14
+; SDAG-NEXT:    v_mov_b32_e32 v3, v15
+; SDAG-NEXT:    v_mov_b32_e32 v4, v16
+; SDAG-NEXT:    v_mov_b32_e32 v5, v17
+; SDAG-NEXT:    v_mov_b32_e32 v6, v18
+; SDAG-NEXT:    v_mov_b32_e32 v7, v19
+; SDAG-NEXT:    v_mov_b32_e32 v8, v20
+; SDAG-NEXT:    v_mov_b32_e32 v9, v21
+; SDAG-NEXT:    v_mov_b32_e32 v10, v22
+; SDAG-NEXT:    v_mov_b32_e32 v11, v23
+; SDAG-NEXT:    v_mov_b32_e32 v12, v24
+; SDAG-NEXT:    v_mov_b32_e32 v13, v25
+; SDAG-NEXT:    v_mov_b32_e32 v14, v26
+; SDAG-NEXT:    v_mov_b32_e32 v15, v27
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[2:3]
-; GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[0:1]
-; GISEL-NEXT:    v_mov_b32_e32 v18, s24
-; GISEL-NEXT:    v_mov_b32_e32 v19, s25
-; GISEL-NEXT:    v_mov_b32_e32 v24, v0
-; GISEL-NEXT:    v_mov_b32_e32 v25, v1
-; GISEL-NEXT:    v_mov_b32_e32 v26, v2
-; GISEL-NEXT:    v_mov_b32_e32 v27, v3
-; GISEL-NEXT:    v_mov_b32_e32 v28, v4
-; GISEL-NEXT:    v_mov_b32_e32 v29, v5
-; GISEL-NEXT:    v_mov_b32_e32 v30, v6
-; GISEL-NEXT:    v_mov_b32_e32 v31, v7
-; GISEL-NEXT:    v_mov_b32_e32 v32, v8
-; GISEL-NEXT:    v_mov_b32_e32 v33, v9
-; GISEL-NEXT:    v_mov_b32_e32 v16, v10
-; GISEL-NEXT:    v_mov_b32_e32 v20, s26
-; GISEL-NEXT:    v_mov_b32_e32 v21, s27
-; GISEL-NEXT:    v_mov_b32_e32 v22, s28
-; GISEL-NEXT:    v_mov_b32_e32 v23, s29
-; GISEL-NEXT:    v_mov_b64_e32 v[54:55], s[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[0:1], v[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[52:53], s[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[50:51], s[18:19]
-; GISEL-NEXT:    v_mov_b64_e32 v[48:49], s[16:17]
-; GISEL-NEXT:    v_mov_b64_e32 v[2:3], v[20:21]
-; GISEL-NEXT:    v_mov_b64_e32 v[4:5], v[22:23]
-; GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[24:25]
-; GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[26:27]
-; GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[28:29]
-; GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[30:31]
-; GISEL-NEXT:    v_mov_b64_e32 v[14:15], v[32:33]
+; GISEL-NEXT:    v_mov_b64_e32 v[30:31], s[2:3]
+; GISEL-NEXT:    v_mov_b64_e32 v[28:29], s[0:1]
+; GISEL-NEXT:    v_mov_b32_e32 v18, v0
+; GISEL-NEXT:    v_mov_b32_e32 v19, v1
+; GISEL-NEXT:    v_mov_b32_e32 v20, v2
+; GISEL-NEXT:    v_mov_b32_e32 v21, v3
+; GISEL-NEXT:    v_mov_b32_e32 v22, v4
+; GISEL-NEXT:    v_mov_b32_e32 v23, v5
+; GISEL-NEXT:    v_mov_b32_e32 v24, v6
+; GISEL-NEXT:    v_mov_b32_e32 v25, v7
+; GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[16:17]
+; GISEL-NEXT:    v_mov_b32_e32 v26, v8
+; GISEL-NEXT:    v_mov_b32_e32 v27, v9
+; GISEL-NEXT:    v_mov_b32_e32 v12, s24
+; GISEL-NEXT:    v_mov_b32_e32 v13, s25
+; GISEL-NEXT:    v_mov_b32_e32 v14, s26
+; GISEL-NEXT:    v_mov_b32_e32 v15, s27
+; GISEL-NEXT:    v_mov_b32_e32 v16, s28
+; GISEL-NEXT:    v_mov_b32_e32 v17, s29
+; GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[18:19]
+; GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[20:21]
+; GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[22:23]
 ; GISEL-NEXT:    s_nop 1
-; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[34:37], v[48:55], v16
+; GISEL-NEXT:    v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[28:31], v[0:7], v10
+; GISEL-NEXT:    s_nop 7
+; GISEL-NEXT:    s_nop 3
+; GISEL-NEXT:    v_mov_b32_e32 v0, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, v13
+; GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GISEL-NEXT:    v_mov_b32_e32 v4, v16
+; GISEL-NEXT:    v_mov_b32_e32 v5, v17
+; GISEL-NEXT:    v_mov_b32_e32 v6, v18
+; GISEL-NEXT:    v_mov_b32_e32 v7, v19
+; GISEL-NEXT:    v_mov_b32_e32 v8, v20
+; GISEL-NEXT:    v_mov_b32_e32 v9, v21
+; GISEL-NEXT:    v_mov_b32_e32 v10, v22
+; GISEL-NEXT:    v_mov_b32_e32 v11, v23
+; GISEL-NEXT:    v_mov_b32_e32 v12, v24
+; GISEL-NEXT:    v_mov_b32_e32 v13, v25
+; GISEL-NEXT:    v_mov_b32_e32 v14, v26
+; GISEL-NEXT:    v_mov_b32_e32 v15, v27
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
   ret <16 x float> %result
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
index 3fffb04de1a63..7dd5366a35c8d 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
@@ -683,69 +683,39 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
 ; GREEDY90A-LABEL: test_mfma_f32_16x16x1f32:
 ; GREEDY90A:       ; %bb.0: ; %bb
 ; GREEDY90A-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GREEDY90A-NEXT:    v_mov_b32_e32 v16, 1.0
-; GREEDY90A-NEXT:    v_mov_b32_e32 v17, 2.0
+; GREEDY90A-NEXT:    v_mov_b32_e32 v0, 1.0
+; GREEDY90A-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GREEDY90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GREEDY90A-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
 ; GREEDY90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a31, s15
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a30, s14
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a29, s13
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a28, s12
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a27, s11
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a26, s10
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a25, s9
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a24, s8
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a23, s7
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a22, s6
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a21, s5
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a20, s4
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a19, s3
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a18, s2
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a17, s1
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a16, s0
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a33, s15
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a32, s14
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a31, s13
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a30, s12
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a29, s11
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a28, s10
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a27, s9
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a26, s8
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a25, s7
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a24, s6
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a23, s5
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a22, s4
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a21, s3
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a20, s2
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a19, s1
+; GREEDY90A-NEXT:    v_accvgpr_write_b32 a18, s0
 ; GREEDY90A-NEXT:    s_nop 1
-; GREEDY90A-NEXT:    v_mfma_f32_16x16x1f32 a[16:31], v16, v17, a[16:31]
-; GREEDY90A-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v16, v17, a[16:31]
+; GREEDY90A-NEXT:    v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33]
+; GREEDY90A-NEXT:    v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33]
 ; GREEDY90A-NEXT:    s_nop 7
 ; GREEDY90A-NEXT:    s_nop 1
-; GREEDY90A-NEXT:    v_accvgpr_read_b32 v0, a16
-; GREEDY90A-NEXT:    v_accvgpr_read_b32 v1, a17
-; GREEDY90A-NEXT:    v_accvgpr_read_b32 v15, a13
-; GREEDY90A-NEXT:    v_accvgpr_read_b32 v14, a12
-; GREEDY90A-NEXT:    v_accvgpr_read_b32 v13, a11
-; GREEDY90A-NEXT:    v_accvgpr_read_b32 v12, a10
-; GREEDY90A-NEXT:    v_accvgpr_read_b32 v11, a9
-; GREEDY90A-NEXT:    v_accvgpr_read_b32 v10, a8
-; GREEDY90A-NEXT:    v_accvgpr_read_b32 v9, a7
-; GREEDY90A-NEXT:    v_accvgpr_read_b32 v8, a6
-; GREEDY90A-NEXT:    v_accvgpr_read_b32 v7, a5
-; GREEDY90A-NEXT:    v_accvgpr_read_b32 v6, a4
-; GREEDY90A-NEXT:    v_accvgpr_read_b32 v5, a3
-; GREEDY90A-NEXT:    v_accvgpr_read_b32 v4, a2
-; GREEDY90A-NEXT:    v_accvgpr_read_b32 v3, a1
-; GREEDY90A-NEXT:    v_accvgpr_read_b32 v2, a0
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a0, v0
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a1, v1
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a2, v2
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a3, v3
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a4, v4
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a5, v5
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a6, v6
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a7, v7
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a8, v8
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a9, v9
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a10, v10
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a11, v11
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a12, v12
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a13, v13
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a14, v14
-; GREEDY90A-NEXT:    v_accvgpr_write_b32 a15, v15
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a1, a19
+; GREEDY90A-NEXT:    v_accvgpr_mov_b32 a0, a18
+; GREEDY90A-NEXT:    s_nop 1
+; GREEDY90A-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
 ; GREEDY90A-NEXT:    v_mov_b32_e32 v0, 0
-; GREEDY90A-NEXT:    s_nop 0
-; GREEDY90A-NEXT:    v_mfma_f32_16x16x1f32 a[0:15], v16, v17, a[0:15]
 ; GREEDY90A-NEXT:    s_nop 7
-; GREEDY90A-NEXT:    s_nop 2
+; GREEDY90A-NEXT:    s_nop 1
 ; GREEDY90A-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 ; GREEDY90A-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
 ; GREEDY90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@@ -755,69 +725,39 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
 ; GREEDY942-LABEL: test_mfma_f32_16x16x1f32:
 ; GREEDY942:       ; %bb.0: ; %bb
 ; GREEDY942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GREEDY942-NEXT:    v_mov_b32_e32 v16, 1.0
-; GREEDY942-NEXT:    v_mov_b32_e32 v17, 2.0
+; GREEDY942-NEXT:    v_mov_b32_e32 v0, 1.0
+; GREEDY942-NEXT:    v_mov_b32_e32 v1, 2.0
 ; GREEDY942-NEXT:    s_waitcnt lgkmcnt(0)
 ; GREEDY942-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
 ; GREEDY942-NEXT:    s_waitcnt lgkmcnt(0)
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a31, s15
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a30, s14
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a29, s13
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a28, s12
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a27, s11
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a26, s10
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a25, s9
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a24, s8
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a23, s7
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a22, s6
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a21, s5
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a20, s4
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a19, s3
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a18, s2
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a17, s1
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a16, s0
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a33, s15
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a32, s14
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a31, s13
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a30, s12
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a29, s11
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a28, s10
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a27, s9
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a26, s8
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a25, s7
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a24, s6
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a23, s5
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a22, s4
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a21, s3
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a20, s2
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a19, s1
+; GREEDY942-NEXT:    v_accvgpr_write_b32 a18, s0
 ; GREEDY942-NEXT:    s_nop 1
-; GREEDY942-NEXT:    v_mfma_f32_16x16x1_4b_f32 a[16:31], v16, v17, a[16:31]
-; GREEDY942-NEXT:    v_mfma_f32_16x16x1_4b_f32 a[0:15], v16, v17, a[16:31]
+; GREEDY942-NEXT:    v_mfma_f32_16x16x1_4b_f32 a[18:33], v0, v1, a[18:33]
+; GREEDY942-NEXT:    v_mfma_f32_16x16x1_4b_f32 a[2:17], v0, v1, a[18:33]
 ; GREEDY942-NEXT:    s_nop 7
 ; GREEDY942-NEXT:    s_nop 0
-; GREEDY942-NEXT:    v_accvgpr_read_b32 v0, a16
-; GREEDY942-NEXT:    v_accvgpr_read_b32 v1, a17
-; GREEDY942-NEXT:    v_accvgpr_read_b32 v15, a13
-; GREEDY942-NEXT:    v_accvgpr_read_b32 v14, a12
-; GREEDY942-NEXT:    v_accvgpr_read_b32 v13, a11
-; GREEDY942-NEXT:    v_accvgpr_read_b32 v12, a10
-; GREEDY942-NEXT:    v_accvgpr_read_b32 v11, a9
-; GREEDY942-NEXT:    v_accvgpr_read_b32 v10, a8
-; GREEDY942-NEXT:    v_accvgpr_read_b32 v9, a7
-; GREEDY942-NEXT:    v_accvgpr_read_b32 v8, a6
-; GREEDY942-NEXT:    v_accvgpr_read_b32 v7, a5
-; GREEDY942-NEXT:    v_accvgpr_read_b32 v6, a4
-; GREEDY942-NEXT:    v_accvgpr_read_b32 v5, a3
-; GREEDY942-NEXT:    v_accvgpr_read_b32 v4, a2
-; GREEDY942-NEXT:    v_accvgpr_read_b32 v3, a1
-; GREEDY942-NEXT:    v_accvgpr_read_b32 v2, a0
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a0, v0
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a1, v1
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a2, v2
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a3, v3
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a4, v4
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a5, v5
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a6, v6
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a7, v7
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a8, v8
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a9, v9
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a10, v10
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a11, v11
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a12, v12
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a13, v13
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a14, v14
-; GREEDY942-NEXT:    v_accvgpr_write_b32 a15, v15
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a1, a19
+; GREEDY942-NEXT:    v_accvgpr_mov_b32 a0, a18
+; GREEDY942-NEXT:    s_nop 1
+; GREEDY942-NEXT:    v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15]
 ; GREEDY942-NEXT:    v_mov_b32_e32 v0, 0
-; GREEDY942-NEXT:    s_nop 0
-; GREEDY942-NEXT:    v_mfma_f32_16x16x1_4b_f32 a[0:15], v16, v17, a[0:15]
 ; GREEDY942-NEXT:    s_nop 7
-; GREEDY942-NEXT:    s_nop 1
+; GREEDY942-NEXT:    s_nop 0
 ; GREEDY942-NEXT:    global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
 ; GREEDY942-NEXT:    global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
 ; GREEDY942-NEXT:    global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
@@ -1058,11 +998,13 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 {
 ; FAST90A-NEXT:    v_accvgpr_write_b32 a2, s6
 ; FAST90A-NEXT:    v_accvgpr_write_b32 a3, s7
 ; FAST90A-NEXT:    s_nop 1
-; FAST90A-NEXT:    v_mfma_f32_4x4x1f32 a[0:3], v1, v2, a[0:3]
-; FAST90A-NEXT:    v_mfma_f32_4x4x1f32 a[4:7], v1, v2, a[0:3]
-; FAST90A-NEXT:    s_nop 4
-; FAST90A-NEXT:    v_accvgpr_mov_b32 a2, a4
+; FAST90A-NEXT:    v_mfma_f32_4x4x1f32 a[8:11], v1, v2, a[0:3]
+; FAST90A-NEXT:    v_mfma_f32_4x4x1f32 a[4:7], v1, v2, a[8:11]
+; FAST90A-NEXT:    s_nop 3
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a1, a9
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a0, a8
 ; FAST90A-NEXT:    v_accvgpr_mov_b32 a3, a5
+; FAST90A-NEXT:    v_accvgpr_mov_b32 a2, a4
 ; FAST90A-NEXT:    s_nop 1
 ; FAST90A-NEXT:    v_mfma_f32_4x4x1f32 a[0:3], v1, v2, a[0:3]
 ; FAST90A-NEXT:    s_nop 4
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
index 3844d6054e130..6eca04277585e 100644
--- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
@@ -15,27 +15,27 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
 ; GFX942-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX942-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v0
-; GFX942-NEXT:    s_mov_b32 s3, 0
+; GFX942-NEXT:    s_mov_b32 s6, 0
 ; GFX942-NEXT:    s_branch .LBB0_2
 ; GFX942-NEXT:  .LBB0_1: ; %bb2
 ; GFX942-NEXT:    ; in Loop: Header=BB0_2 Depth=1
-; GFX942-NEXT:    s_or_b32 s4, s3, 1
-; GFX942-NEXT:    s_ashr_i32 s5, s3, 31
 ; GFX942-NEXT:    s_mov_b32 s3, s2
-; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT:    s_nop 0
 ; GFX942-NEXT:    v_accvgpr_mov_b32 a0, a2
 ; GFX942-NEXT:    v_accvgpr_mov_b32 a2, a1
 ; GFX942-NEXT:    v_accvgpr_mov_b32 a3, a1
-; GFX942-NEXT:    s_and_b32 s3, s5, s4
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_mfma_f32_16x16x16_f16 a[2:5], v[2:3], v[2:3], a[0:3]
+; GFX942-NEXT:    s_or_b32 s4, s6, 1
+; GFX942-NEXT:    s_ashr_i32 s3, s6, 31
+; GFX942-NEXT:    v_mfma_f32_16x16x16_f16 a[2:5], v[0:1], v[0:1], a[0:3]
+; GFX942-NEXT:    s_and_b32 s6, s3, s4
 ; GFX942-NEXT:    s_cbranch_execz .LBB0_4
 ; GFX942-NEXT:  .LBB0_2: ; %bb
 ; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX942-NEXT:    s_and_b64 vcc, exec, s[0:1]
 ; GFX942-NEXT:    s_cbranch_vccz .LBB0_1
 ; GFX942-NEXT:  ; %bb.3:
-; GFX942-NEXT:    ; implicit-def: $sgpr3
+; GFX942-NEXT:    ; implicit-def: $sgpr6
 ; GFX942-NEXT:    ; implicit-def: $agpr2
 ; GFX942-NEXT:  .LBB0_4: ; %common.ret
 ; GFX942-NEXT:    s_endpgm
@@ -52,31 +52,31 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
 ; GFX908-NEXT:    s_cselect_b64 s[0:1], -1, 0
 ; GFX908-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; GFX908-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v0
-; GFX908-NEXT:    s_mov_b32 s3, 0
+; GFX908-NEXT:    s_mov_b32 s6, 0
 ; GFX908-NEXT:    s_branch .LBB0_2
 ; GFX908-NEXT:  .LBB0_1: ; %bb2
 ; GFX908-NEXT:    ; in Loop: Header=BB0_2 Depth=1
-; GFX908-NEXT:    s_or_b32 s4, s3, 1
-; GFX908-NEXT:    s_ashr_i32 s5, s3, 31
+; GFX908-NEXT:    s_nop 5
+; GFX908-NEXT:    v_accvgpr_read_b32 v2, a2
 ; GFX908-NEXT:    s_mov_b32 s3, s2
-; GFX908-NEXT:    v_mov_b32_e32 v1, s2
-; GFX908-NEXT:    s_nop 2
-; GFX908-NEXT:    v_accvgpr_read_b32 v0, a2
-; GFX908-NEXT:    v_mov_b32_e32 v2, s3
+; GFX908-NEXT:    v_mov_b32_e32 v0, s2
+; GFX908-NEXT:    v_accvgpr_write_b32 a0, v2
+; GFX908-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX908-NEXT:    v_accvgpr_read_b32 v4, a1
-; GFX908-NEXT:    v_accvgpr_read_b32 v3, a1
-; GFX908-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT:    v_accvgpr_read_b32 v2, a1
+; GFX908-NEXT:    s_or_b32 s4, s6, 1
 ; GFX908-NEXT:    v_accvgpr_write_b32 a2, v4
-; GFX908-NEXT:    v_accvgpr_write_b32 a3, v3
-; GFX908-NEXT:    s_and_b32 s3, s5, s4
-; GFX908-NEXT:    v_mfma_f32_16x16x16f16 a[2:5], v[1:2], v[1:2], a[0:3]
+; GFX908-NEXT:    v_accvgpr_write_b32 a3, v2
+; GFX908-NEXT:    s_ashr_i32 s3, s6, 31
+; GFX908-NEXT:    v_mfma_f32_16x16x16f16 a[2:5], v[0:1], v[0:1], a[0:3]
+; GFX908-NEXT:    s_and_b32 s6, s3, s4
 ; GFX908-NEXT:    s_cbranch_execz .LBB0_4
 ; GFX908-NEXT:  .LBB0_2: ; %bb
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_and_b64 vcc, exec, s[0:1]
 ; GFX908-NEXT:    s_cbranch_vccz .LBB0_1
 ; GFX908-NEXT:  ; %bb.3:
-; GFX908-NEXT:    ; implicit-def: $sgpr3
+; GFX908-NEXT:    ; implicit-def: $sgpr6
 ; GFX908-NEXT:    ; implicit-def: $agpr2
 ; GFX908-NEXT:  .LBB0_4: ; %common.ret
 ; GFX908-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir
index ee5481617cf59..eb1122b6b62f8 100644
--- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir
@@ -89,7 +89,7 @@ body:             |
   ; COALESCE-NEXT: bb.1:
   ; COALESCE-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
   ; COALESCE-NEXT: {{  $}}
-  ; COALESCE-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0
+  ; COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub0:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0
   ; COALESCE-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1
   ; COALESCE-NEXT:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
   ; COALESCE-NEXT:   S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc
@@ -102,11 +102,10 @@ body:             |
   ; COALESCE-NEXT:   [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_MOV_B32_1]], 31, implicit-def dead $scc
   ; COALESCE-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc
   ; COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0
-  ; COALESCE-NEXT:   [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]]
-  ; COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY1]]
+  ; COALESCE-NEXT:   [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]]
   ; COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub1
   ; COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub1
-  ; COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY2]], [[COPY2]], [[V_ACCVGPR_WRITE_B32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+  ; COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_ACCVGPR_WRITE_B32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
   ; COALESCE-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0
   ; COALESCE-NEXT: {{  $}}
   ; COALESCE-NEXT: bb.3:
@@ -143,7 +142,7 @@ body:             |
   ; GFX908-COALESCE-NEXT: bb.1:
   ; GFX908-COALESCE-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
   ; GFX908-COALESCE-NEXT: {{  $}}
-  ; GFX908-COALESCE-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0
+  ; GFX908-COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub0:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0
   ; GFX908-COALESCE-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1
   ; GFX908-COALESCE-NEXT:   $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
   ; GFX908-COALESCE-NEXT:   S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc
@@ -156,11 +155,10 @@ body:             |
   ; GFX908-COALESCE-NEXT:   [[S_ASHR_I32_:%[0-9]+]]:sreg_32 = S_ASHR_I32 [[S_MOV_B32_1]], 31, implicit-def dead $scc
   ; GFX908-COALESCE-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc
   ; GFX908-COALESCE-NEXT:   [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0
-  ; GFX908-COALESCE-NEXT:   [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]]
-  ; GFX908-COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY1]]
+  ; GFX908-COALESCE-NEXT:   [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]]
   ; GFX908-COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub1
   ; GFX908-COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub1
-  ; GFX908-COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY2]], [[COPY2]], [[V_ACCVGPR_WRITE_B32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
+  ; GFX908-COALESCE-NEXT:   [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_ACCVGPR_WRITE_B32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec
   ; GFX908-COALESCE-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0
   ; GFX908-COALESCE-NEXT: {{  $}}
   ; GFX908-COALESCE-NEXT: bb.3:

>From 4e7bf6c242ebb2b348a2c178f439f2ffdac5d252 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 4 Jun 2025 10:35:06 -0700
Subject: [PATCH 3/3] Fix comment typo

Change-Id: Iac1f38523e51f8abb796ed1f86b95132d86679d4
---
 llvm/include/llvm/CodeGen/MachineRegisterInfo.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index 5f868efde8d9c..a20e9f98dd410 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -729,8 +729,9 @@ class MachineRegisterInfo {
   bool constrainRegAttrs(Register Reg, Register ConstrainingReg,
                          unsigned MinNumRegs = 0);
 
-  /// recomputeRegClass - Try to find a legal super-class of Reg's register
-  /// class that still satisfies the constraints from the instructions using
+  /// getLargestConstrainedSuperClass - Try to find a legal super-class of Reg's
+  /// register class that still satisfies the constraints from the instructions
+  /// using
   /// \p Reg. \p return the super-class TargetRegisterClass if one was found,
   /// otherwise \p return the original TargetRegisterClass.
   const TargetRegisterClass *



More information about the llvm-commits mailing list